See the official PyTorch autograd tutorial for background.
Set requires_grad=True on every tensor with respect to which we want gradients computed.
[code lang='python']
# Two-layer ReLU network trained with manual gradient descent via autograd.
# NOTE(review): assumes N, D_in, H, D_out, device, dtype are defined earlier
# in the post — confirm against the surrounding text.

# input and output (random data for demonstration)
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# parameters to be estimated; requires_grad=True so autograd tracks them
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # forward process: linear -> ReLU (clamp at 0) -> linear
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # compute loss (sum of squared errors)
    loss = (y_pred - y).pow(2).sum()

    # backward process: populate w1.grad and w2.grad
    loss.backward()

    # update the weights; no_grad() so the updates themselves
    # are not recorded by autograd
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # zero the gradients for the next iteration —
        # backward() accumulates into .grad rather than overwriting
        w1.grad.zero_()
        w2.grad.zero_()
[/code]