// AdamW is an Adam optimizer with decoupled weight decay.
//
// Call Before once at the start of every optimization step to advance the
// step counter and refresh the per-step coefficients, then call Update for
// each group of parameters that should be stepped with those coefficients.
type AdamW struct {
	// LearningRate is the base step size; it is multiplied by Schedule's
	// result on every step.
	LearningRate float32
	// MeanFactor is the exponential decay rate of the running gradient mean.
	MeanFactor float32
	// VarianceFactor is the exponential decay rate of the running mean of
	// squared gradients.
	VarianceFactor float32
	// Epsilon is added to the square root of the bias-corrected variance
	// before dividing.
	Epsilon float32
	// WeightDecay is the base fraction of each weight removed per step; it
	// is multiplied by Schedule's result on every step.
	WeightDecay float32
	// Schedule maps the 1-based step number to a multiplier applied to both
	// LearningRate and WeightDecay. A nil Schedule is treated as constant 1.
	Schedule func(int) float32

	// tick counts the calls to Before.
	tick int
	// meanPower is MeanFactor raised to the power tick.
	meanPower float32
	// meanRatio is the scheduled learning rate divided by (1 - meanPower),
	// i.e. step size and mean bias correction folded together.
	meanRatio float32
	// variancePower is VarianceFactor raised to the power tick.
	variancePower float32
	// varianceRecip is 1 / (1 - variancePower), the variance bias correction.
	varianceRecip float32
	// weightDecay is the scheduled fraction of each weight removed this step.
	weightDecay float32
}

// Before advances the optimizer to the next step and precomputes the
// coefficients that Update applies to every element during that step.
func (aw *AdamW) Before() {
	if aw.tick++; aw.tick == 1 {
		// Start the running powers at 1 so the multiplications below
		// yield factor^1 on the first step.
		aw.meanPower = 1
		aw.variancePower = 1
	}

	sched := float32(1)
	if aw.Schedule != nil {
		sched = aw.Schedule(aw.tick)
	}

	aw.meanPower *= aw.MeanFactor
	aw.meanRatio = sched * aw.LearningRate / (1 - aw.meanPower)

	aw.variancePower *= aw.VarianceFactor
	aw.varianceRecip = 1 / (1 - aw.variancePower)

	aw.weightDecay = sched * aw.WeightDecay
}

// Update applies one optimization step in place. For each index i of
// gradient it refreshes mean[i] and variance[i] with the new gradient, then
// decays weight[i] and moves it against the bias-corrected gradient estimate.
//
// weight, mean and variance must be at least as long as gradient; Before
// must have been called for the current step.
func (aw *AdamW) Update(weight, gradient, mean, variance []float32) {
	for i, g := range gradient {
		m := aw.MeanFactor * mean[i]
		m += (1 - aw.MeanFactor) * g

		v := aw.VarianceFactor * variance[i]
		v += (1 - aw.VarianceFactor) * g * g

		// Store the raw (uncorrected) moments; bias correction is applied
		// only to the local copies used for this step.
		mean[i] = m
		variance[i] = v

		m *= aw.meanRatio
		v *= aw.varianceRecip

		// Decoupled weight decay: shrink the weight by the scheduled
		// fraction rather than replacing it with that fraction of itself.
		w := weight[i] - aw.weightDecay*weight[i]
		sd := float32(math.Sqrt(float64(v)))
		weight[i] = w - m/(sd+aw.Epsilon)
	}
}
To receive a hint, submit unfixed code.