API Overview
torch.tensor behaves like numpy.array
type generalization
Python
Copy
>> torch.tensor([False, 1, 2.0])
tensor([0., 1., 2.])
random
Python
Copy
torch.randint(low=0, high=10, size=(3,4))
torch.rand((3,4))
torch.randn((3,4))
adding and multiplying scalar
Python
Copy
x = torch.tensor([[1,2,3,4], [5,6,7,8]])
print(x * 10)
y = x.add(10)
print(y)
reshaping
Python
Copy
y = torch.tensor([2, 3, 1, 0])
y = y.view(4,1)
squeeze can also be used for reshaping, provided the dimension to remove has size 1
Python
Copy
x = torch.randn(10,1,10)
z1 = torch.squeeze(x, 1)
#or
z2 = x.squeeze(1)
opposite of squeeze is unsqueeze
Python
Copy
x = torch.randn(10,10)
# unsqueeze(0) inserts a new dimension of size 1 at position 0
z1 = x.unsqueeze(0)
print(z1.shape)
# torch.Size([1, 10, 10])
the same can be obtained by adding fake dim
Python
Copy
z2, z3, z4 = x[None], x[:,None], x[:,:,None]
print(z2.shape, z3.shape, z4.shape)
# torch.Size([1, 10, 10])
# torch.Size([10, 1, 10])
# torch.Size([10, 10, 1])
Matrix multiply
Python
Copy
torch.matmul(x, y)
## or
x @ y
Concat
Python
Copy
x = torch.randn(10, 10, 10)
z = torch.cat([x, x], axis=0)
print('Cat axis 0:', x.shape, z.shape)
# Cat axis 0:
# torch.Size([10, 10, 10])
# torch.Size([20, 10, 10])
Max
Python
Copy
x = torch.arange(25).reshape(5,5)
print(x.max())
# tensor(24)
x.max(dim=0)
# torch.return_types.max(values=tensor([20, 21, 22, 23, 24]),
# indices=tensor([4, 4, 4, 4, 4]))
x.max(dim=1)
# tensor([ 4, 9, 14, 19, 24])
# tensor([4, 4, 4, 4, 4])
Permute
always use permute, not view, to swap dimensions
Python
Copy
x = torch.randn(10,20,30)
z = x.permute(2,0,1)
print('Permute dimensions:', x.shape, z.shape)
# Permute dimensions:
# torch.Size([10, 20, 30])
# torch.Size([30, 10, 20])
Auto gradient of tensor objects
The requires_grad parameter specifies that the gradient is to be calculated for the tensor object
Python
Copy
x = torch.tensor([[2., -1.], [1., 1.]], requires_grad=True)
Defining our function. Its gradient is 2 * x
Python
Copy
out = x.pow(2).sum()
compute the gradient
Python
Copy
out.backward()
We are now in a position to obtain the gradient of out with respect to x
Python
Copy
# The gradient is accumulated on the leaf tensor x (created with
# requires_grad=True), not on the scalar result `out`.
# d(out)/dx = 2 * x, so for x = [[2., -1.], [1., 1.]]:
x.grad
# tensor([[ 4., -2.],
#         [ 2.,  2.]])
Torch vs Numpy
Register tensor objects, aka storing information in a device
Python
Copy
x = torch.rand(1, 6400)
y = torch.rand(6400, 5000)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
x, y = x.to(device), y.to(device)
GPU matmul: fast
Python
Copy
%timeit z = (x @ y)
# It takes 0.515 milli seconds on an average to
# perform matrix multiplication
CPU matmul: 20x slower
Python
Copy
x, y = x.cpu(), y.cpu()
%timeit z = (x @ y)
# It takes 9 milli seconds on an average to
# perform matrix multiplication
Numpy CPU matmul: 40x slower
Python
Copy
x = np.random.random((1, 6400))
y = np.random.random((6400, 5000))
%timeit z = np.matmul(x, y)
# It takes 19 milli seconds on an average to
# perform matrix multiplication
Building a neural net
Define input and output as
Python
Copy
x = [[1, 2], [3, 4], [5, 6], [7, 8]]
y = [[3], [7], [11], [15]]
device = "cuda" if torch.cuda.is_available() else "cpu"
X = torch.tensor(x).float().to(device)
Y = torch.tensor(y).float().to(device)
Define module as
must call super().__init__() in __init__ and override forward
Python
Copy
import torch.nn as nn
class MyModule(nn.Module):
    """Small fully connected network: 2 inputs -> 8 hidden units -> 1 output."""

    def __init__(self):
        # nn.Module must be initialised first so that the layers assigned
        # below are registered as sub-modules (and their weights as parameters).
        super().__init__()
        self.input_to_hidden = nn.Linear(2, 8)
        self.activation = nn.ReLU()
        self.hidden_to_class = nn.Linear(8, 1)

    def forward(self, x):
        """Apply linear -> ReLU -> linear to ``x`` and return the result.

        ``x`` must have 2 features in its last dimension (the input size of
        ``input_to_hidden``); the output has 1 feature in its last dimension.
        """
        x = self.input_to_hidden(x)
        x = self.activation(x)
        x = self.hidden_to_class(x)
        return x
Instantiate and check weights
Python
Copy
# Use one consistent variable name (`mymodule`) for the model instance.
mymodule = MyModule().to(device)
# Weight matrix of the first linear layer: one row per hidden unit (8 x 2).
mymodule.input_to_hidden.weight
#Parameter containing:
#tensor([[-0.1474, 0.0559],
# [ 0.0454, 0.3088],
# [-0.1240, -0.5073],
# [-0.3796, 0.6139],
# [ 0.0052, -0.6587],
# [ 0.0177, 0.5601],
# [-0.0376, -0.6641],
# [-0.6778, 0.6188]], requires_grad=True)
mymodule.parameters()
# iterator over all parameters of the model
Parameter can replace any tensor creator in __init__
Python
Copy
self.input_to_hidden = nn.Parameter(torch.rand(2,8))
# and in forward
x = x @ self.input_to_hidden
Define loss function that we optimize for
PyTorch convention: in the loss function, always put the prediction first and then the ground truth
Python
Copy
loss_func = nn.MSELoss()
# also popular
# nn.CrossEntropyLoss, for categorical
# nn.BCELoss, for binary
Y_hat = mymodule(X)
print(loss_func(Y_hat, Y))
Custom loss can also be defined
Python
Copy
def my_MAELoss(y_hat, y):
    """Return the mean absolute error between prediction and ground truth.

    Follows the convention used for the built-in losses: the prediction
    ``y_hat`` comes first, the ground truth ``y`` second.
    """
    loss = (y_hat - y).abs()
    loss = loss.mean()
    return loss
Define optimizer
Python
Copy
from torch.optim import SGD
opt = SGD(mymodule.parameters(), lr=.001)
Wrapup
reset gradient at each epoch with zero_grad
Python
Copy
# Example number of passes over the data; tune it for your problem.
n_epoch = 50
# The model must live on the same device as X and Y.
mymodule = MyModule().to(device)
# SGD comes from torch.optim (imported above); torch.nn has no SGD.
opt = SGD(mymodule.parameters(), lr=.001)
loss_history = []
for _ in range(n_epoch):
    # Gradients accumulate across backward() calls, so clear them first.
    opt.zero_grad()
    Y_hat = mymodule(X)
    loss_value = loss_func(Y_hat, Y)
    loss_value.backward()
    opt.step()
    # Store a plain float; appending the tensor itself would keep its
    # autograd graph alive for every epoch.
    loss_history.append(loss_value.item())
Dataset, DataLoader, and batch size
Batch size is the number of data points we consider at a time to compute the loss function
It should be large enough to be representative of the dataset, yet small enough to be memory efficient
Implement dataset class
Python
Copy
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


class MyDataSet(Dataset):
    """Map-style dataset pairing each input row with its target row."""

    def __init__(self, x, y):
        # Convert once up front so that __getitem__ is a cheap index lookup.
        self.X = torch.tensor(x).float()
        self.Y = torch.tensor(y).float()

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.Y[idx]

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)
Use it with dataloader
Python
Copy
ds = MyDataSet(x, y)
# Each iteration over dl yields one mini-batch of 2 samples, reshuffled every epoch.
dl = DataLoader(ds, batch_size=2, shuffle=True)
for _ in range(n_epoch):
    # Distinct batch names so the raw lists x and y are not overwritten.
    for x_batch, y_batch in dl:
        ...
Inference
Simple as
Python
Copy
x_val = [[12, 13]]
X_val = torch.tensor(x_val).float().to(device)
mymodule(X_val)
Sequential model
Define the whole model in a single shot
Python
Copy
# Layer sizes must chain: 2 input features -> 8 hidden units -> 1 output.
model = nn.Sequential(
    nn.Linear(2, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
).to(device)
Summary of the model, along with its input size
Python
Copy
!pip install torch_summary
from torchsummary import summary
summary(model, torch.zeros(1, 2))
Saving and loading model
Model descriptor
Python
Copy
model.state_dict()
>> OrderedDict([
('input_to_hidden.weight', tensor(..)),
('input_to_hidden.bias'),
...
])
Save using CPU, even if Cuda is available, as it will help with loading
Python
Copy
torch.save(model.to("cpu").state_dict(), "mymodel.pth")
Load
Create an empty model with the same structure
Python
Copy
model = nn.Sequential(
nn.Linear(2, 8),
nn.ReLU(),
nn.Linear(8, 1)
).to(device)
Load the model from disk
Python
Copy
state_dict = torch.load("mymodel.pth")
Load state_dict onto the model
Python
Copy
# Copy the saved weights into the freshly created model.
model.load_state_dict(state_dict)
# The weights were saved from CPU; move the model to the target device.
model.to(device)
# Run inference on the validation input defined in the Inference section.
model(torch.tensor(x_val).float().to(device))