# first imports
import torch
from torch import nn  # model
from torch import optim  # optimizer
from torchvision import datasets, transforms  # data and data transforms
from torch.utils.data import random_split, DataLoader  # utilities

import numpy as np
import matplotlib.pyplot as plt

# directly from data
data = [[1, 2], [3, 4]]
x_data = torch.tensor(data)

# from numpy array
np_array = np.array(data)
x_np = torch.from_numpy(np_array)

# from another tensor
x_ones = torch.ones_like(x_data)  # retains the properties of x_data
print(f"Ones Tensor: \n {x_ones} \n")

x_rand = torch.rand_like(x_data, dtype=torch.float)  # overrides the datatype of x_data
print(f"Random Tensor: \n {x_rand} \n")

Ones Tensor: 
 tensor([[1, 1],
        [1, 1]]) 

Random Tensor: 
 tensor([[0.0827, 0.0419],
        [0.7667, 0.1075]])

# use tuples to determine tensor dimensions
shape = (
    2,
    3,
)
rand_tensor = torch.rand(shape)
ones_tensor = torch.ones(shape)
zeros_tensor = torch.zeros(shape)

print(f"Random Tensor: \n {rand_tensor} \n")
print(f"Ones Tensor: \n {ones_tensor} \n")
print(f"Zeros Tensor: \n {zeros_tensor}")

Random Tensor: 
 tensor([[0.4602, 0.0292, 0.1300],
        [0.9500, 0.4250, 0.4984]]) 

Ones Tensor: 
 tensor([[1., 1., 1.],
        [1., 1., 1.]]) 

Zeros Tensor: 
 tensor([[0., 0., 0.],
        [0., 0., 0.]])

# tensor attributes
tensor = torch.rand(3, 4)

print(f"Shape of tensor: {tensor.shape}")
print(f"Datatype of tensor: {tensor.dtype}")
print(f"Device tensor is stored on: {tensor.device}")

Shape of tensor: torch.Size([3, 4])
Datatype of tensor: torch.float32
Device tensor is stored on: cpu

# by default, tensors are created on CPU
# We move our tensor to the GPU if available
if torch.cuda.is_available():
    tensor = tensor.to("cuda")

# indexing like numpy
tensor = torch.ones(4, 4)
print(f"First row: {tensor[0]}")
print(f"First column: {tensor[:, 0]}")
print(f"Last column: {tensor[..., -1]}")
tensor[:, 1] = 0
print(tensor)

First row: tensor([1., 1., 1., 1.])
First column: tensor([1., 1., 1., 1.])
Last column: tensor([1., 1., 1., 1.])
tensor([[1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.]])

# joining tensors
t1 = torch.cat([tensor, tensor, tensor], dim=1)
print(t1)

tensor([[1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.],
        [1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.],
        [1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.],
        [1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.]])

# This computes the matrix multiplication between two tensors. y1, y2, y3 will have the same value
y1 = tensor @ tensor.T
y2 = tensor.matmul(tensor.T)

y3 = torch.rand_like(y1)
torch.matmul(tensor, tensor.T, out=y3)


# This computes the element-wise product. z1, z2, z3 will have the same value
z1 = tensor * tensor
z2 = tensor.mul(tensor)

z3 = torch.rand_like(tensor)
torch.mul(tensor, tensor, out=z3)

tensor([[1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.]])

# GPU via CUDA
# torch.randn(5).cuda()
# better (more flexible):
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
torch.randn(5).to(device)

tensor([ 0.1188,  0.5313,  0.4246, -0.3857,  0.2497])

# import and split data
train_data = datasets.MNIST(
    "data", train=True, download=True, transform=transforms.ToTensor()
)
train, val = random_split(train_data, [55000, 5000])
train_loader = DataLoader(train, batch_size=32)
val_loader = DataLoader(val, batch_size=32)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz

Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to data/MNIST/raw/train-images-idx3-ubyte.gz

  0%|          | 0.00/9.91M [00:00<?, ?B/s]

  1%|          | 98.3k/9.91M [00:00<00:11, 835kB/s]

  4%|▍         | 393k/9.91M [00:00<00:05, 1.68MB/s]

 16%|█▌        | 1.54M/9.91M [00:00<00:01, 4.91MB/s]

 63%|██████▎   | 6.29M/9.91M [00:00<00:00, 17.1MB/s]

100%|██████████| 9.91M/9.91M [00:00<00:00, 16.7MB/s]

figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3
for i in range(1, cols * rows + 1):
    sample_idx = torch.randint(len(train_data), size=(1,)).item()
    img, label = train_data[sample_idx]
    figure.add_subplot(rows, cols, i)
    plt.axis("off")
    plt.imshow(img.squeeze(), cmap="gray")
plt.show()

train_features, train_labels = next(iter(train_loader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")
img = train_features[0].squeeze()
label = train_labels[0]
plt.imshow(img, cmap="gray")
plt.show()
print(f"Label: {label}")

Feature batch shape: torch.Size([32, 1, 28, 28])
Labels batch shape: torch.Size([32])

Label: 1

# in theory easy via stateless approach
# import torch.nn.functional as F

# loss_func = F.cross_entropy

# def model(xb):
#    return xb @ weights + bias

# print(loss_func(model(xb), yb), accuracy(model(xb), yb))
# gets messy quickly!

# define model via explicit nn.Module class
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(28 * 28, 64)
        self.l2 = nn.Linear(64, 64)
        self.l3 = nn.Linear(64, 10)
        self.do = nn.Dropout(0.1)

    def forward(self, x):
        h1 = nn.functional.relu(self.l1(x))
        h2 = nn.functional.relu(self.l2(h1))
        do = self.do(h2 + h1)  # residual connection
        logits = self.l3(do)
        return logits

# defining model via sequential
# shorthand, no need for forward method
model_seq = nn.Sequential(
    nn.Linear(28 * 28, 64),
    nn.ReLU(),
    nn.Linear(64, 64),
    nn.ReLU(),
    nn.Dropout(0.1),  # often helps with overfitting
    nn.Linear(64, 10),
)

# move model to GPU/device memory
model = model_seq.to(device)

print(f"Model structure: {model}\n\n")

for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")

Model structure: Sequential(
  (0): Linear(in_features=784, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=64, bias=True)
  (3): ReLU()
  (4): Dropout(p=0.1, inplace=False)
  (5): Linear(in_features=64, out_features=10, bias=True)
)


Layer: 0.weight | Size: torch.Size([64, 784]) | Values : tensor([[-0.0335, -0.0275, -0.0256,  ..., -0.0024, -0.0025, -0.0082],
        [-0.0282,  0.0290, -0.0272,  ...,  0.0337,  0.0293, -0.0080]],
       grad_fn=<SliceBackward0>) 

Layer: 0.bias | Size: torch.Size([64]) | Values : tensor([-0.0182, -0.0353], grad_fn=<SliceBackward0>) 

Layer: 2.weight | Size: torch.Size([64, 64]) | Values : tensor([[ 0.0715, -0.1126,  0.0390,  0.0498,  0.0816, -0.1036,  0.0281,  0.0934,
          0.0424, -0.0266, -0.0724, -0.0794,  0.0615, -0.0351, -0.0746,  0.0616,
          0.1029,  0.0948,  0.0223, -0.0240,  0.0318,  0.1078,  0.1055,  0.0615,
          0.0730,  0.0835,  0.1195,  0.0003, -0.0562, -0.0186,  0.0297, -0.0454,
         -0.0682, -0.0863, -0.0207, -0.0655,  0.0946, -0.1095, -0.0346,  0.0371,
         -0.0751,  0.0748,  0.1037,  0.0470,  0.0909, -0.0021, -0.1194, -0.0227,
         -0.1094, -0.0400,  0.0652, -0.1189, -0.0833, -0.0029, -0.0004, -0.0734,
          0.0996,  0.0848,  0.0965,  0.1241, -0.0569, -0.0789,  0.0117,  0.0789],
        [ 0.0340, -0.0876,  0.0633, -0.0206, -0.0047,  0.0773, -0.0778, -0.0203,
          0.0566, -0.0582, -0.1016, -0.0772, -0.1007,  0.0184, -0.0207, -0.0656,
         -0.0830,  0.0106, -0.0216,  0.0777,  0.0052,  0.0897,  0.0489,  0.0307,
          0.0397,  0.1021,  0.0071,  0.0095, -0.1222, -0.1043, -0.0147,  0.0663,
         -0.1184,  0.1028, -0.0670,  0.1143,  0.1211, -0.0361, -0.0554, -0.0995,
         -0.0342, -0.0597, -0.0353, -0.0675,  0.0191, -0.0159,  0.0858,  0.0541,
          0.0493, -0.0040, -0.0716, -0.0954, -0.1108,  0.0461, -0.0228,  0.0631,
         -0.0554, -0.1049, -0.0828, -0.0584,  0.0652,  0.1038, -0.0767,  0.1238]],
       grad_fn=<SliceBackward0>) 

Layer: 2.bias | Size: torch.Size([64]) | Values : tensor([-0.0041, -0.1017], grad_fn=<SliceBackward0>) 

Layer: 5.weight | Size: torch.Size([10, 64]) | Values : tensor([[ 0.0096, -0.0626, -0.1024, -0.0209,  0.1220,  0.0286, -0.1096, -0.0114,
          0.0685,  0.0858, -0.0367, -0.0651, -0.0348,  0.1013, -0.0068,  0.0232,
          0.0234, -0.0675,  0.1223,  0.0134, -0.0687,  0.1153,  0.1026,  0.1104,
          0.1032, -0.1083,  0.0074,  0.0140, -0.0285, -0.0779,  0.0843, -0.0011,
         -0.0616,  0.0779,  0.0533, -0.1199,  0.1070, -0.0909, -0.0236, -0.1135,
          0.0181, -0.0376,  0.1056,  0.0939, -0.0576,  0.0645, -0.0558, -0.1122,
          0.0972, -0.0893,  0.0284, -0.0208, -0.0121, -0.0289, -0.1005,  0.0838,
          0.0102,  0.0961,  0.0788,  0.0104, -0.0698,  0.0269,  0.1140,  0.1237],
        [ 0.0704,  0.0483,  0.0015,  0.0270, -0.0227,  0.0046, -0.0292,  0.0969,
          0.0972, -0.0825, -0.0353, -0.0913, -0.0933,  0.1159, -0.0116, -0.0378,
          0.1126, -0.0062, -0.0518, -0.0046,  0.0072, -0.0686, -0.0148,  0.1178,
         -0.0015,  0.0544,  0.0398, -0.0634,  0.0564, -0.0857,  0.0691,  0.0122,
          0.0652,  0.0253, -0.0398,  0.0602,  0.0380, -0.0466, -0.0786,  0.0494,
          0.0162,  0.0113,  0.1123,  0.0251,  0.0472,  0.0801,  0.0646, -0.0999,
          0.0660,  0.1169, -0.1098, -0.0710,  0.0662,  0.0536,  0.0427,  0.0819,
         -0.0363, -0.0819, -0.0406, -0.0665,  0.1065,  0.0902, -0.1064,  0.0702]],
       grad_fn=<SliceBackward0>) 

Layer: 5.bias | Size: torch.Size([10]) | Values : tensor([0.0361, 0.1211], grad_fn=<SliceBackward0>)

# define loss function
loss = nn.CrossEntropyLoss()  # softmax + neg. log

lr = 1e-2
epochs = 5

# defining optimizer
params = model.parameters()
optimiser = optim.SGD(params, lr=1e-2)

# define training and validation loop
# training loop
for epoch in range(epochs):
    losses = list()
    accuracies = list()
    model.train()  # enables dropout/batchnorm
    for batch in train_loader:
        x, y = batch
        batch_size = x.size(0)
        # x: b x 1 x 28 x 28
        x = x.view(batch_size, -1).to(device)

        # 5 steps to train network
        # 1 forward
        l = model(x)  # l: logits

        # 2 compute objective function
        J = loss(l, y.to(device))

        # 3 cleaning the gradients (could also call this on optimiser)
        model.zero_grad()
        # optimizer.zero_grad() is equivalent
        # manually: params.grad._zero()

        # 4 accumulate the partial derivatives of J wrt params
        J.backward()
        # manually: params.grad.add_(dJ/dparams)

        # 5 step in the opposite direction of the gradient
        optimiser.step()
        # could have done manual gradient update:
        # with torch.no_grad():
        #   params = params - lr * params.grad
        losses.append(J.item())
        accuracies.append(y.eq(l.detach().argmax(dim=1).cpu()).float().mean())

    print(f"epoch {epoch + 1}", end=", ")
    print(f"training loss: {torch.tensor(losses).mean():.2f}", end=", ")
    print(
        f"training accuracy: {torch.tensor(accuracies).mean():.2f}"
    )  # print two decimals

    # validation loop
    losses = list()
    accuracies = list()
    model.eval()  # disables dropout/batchnorm
    for batch in val_loader:
        x, y = batch
        batch_size = x.size(0)
        # x: b x 1 x 28 x 28
        x = x.view(batch_size, -1).to(device)

        # 5 steps to train network
        # 1 forward
        with torch.no_grad():  # more efficient, just tensor no graph connected
            l = model(x)  # l: logits

        # 2 compute objective function
        J = loss(l, y.to(device))
        losses.append(J.item())
        accuracies.append(y.eq(l.detach().argmax(dim=1).cpu()).float().mean())

    print(f"epoch {epoch + 1}", end=", ")
    print(f"validation loss: {torch.tensor(losses).mean():.2f}", end=", ")
    print(
        f"validation accuracy: {torch.tensor(accuracies).mean():.2f}"
    )  # print two decimals

epoch 1, training loss: 1.28, training accuracy: 0.65

epoch 1, validation loss: 0.49, validation accuracy: 0.87

epoch 2, training loss: 0.43, training accuracy: 0.88

epoch 2, validation loss: 0.36, validation accuracy: 0.90

epoch 3, training loss: 0.35, training accuracy: 0.90

epoch 3, validation loss: 0.31, validation accuracy: 0.91

epoch 4, training loss: 0.30, training accuracy: 0.91

epoch 4, validation loss: 0.28, validation accuracy: 0.92

epoch 5, training loss: 0.27, training accuracy: 0.92

epoch 5, validation loss: 0.25, validation accuracy: 0.93

# just save model weights without structure
torch.save(model.state_dict(), "model_weights.pth")
model.load_state_dict(torch.load("model_weights.pth"))
model.eval()

/tmp/ipykernel_2066/2099193612.py:3: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  model.load_state_dict(torch.load("model_weights.pth"))

Sequential(
  (0): Linear(in_features=784, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=64, bias=True)
  (3): ReLU()
  (4): Dropout(p=0.1, inplace=False)
  (5): Linear(in_features=64, out_features=10, bias=True)
)

# save whole model
torch.save(model, "model.pth")
new_model = torch.load("model.pth")

/tmp/ipykernel_2066/2474383404.py:3: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  new_model = torch.load("model.pth")

Lunch Time Python¶

28.10.2022: PyTorch¶

0 Why use PyTorch?¶

1 Tensors¶

2 Datasets and DataLoaders¶

3 Coding a neural network¶

4 Backpropagation via Autograd¶

5 Optimization of model parameters (training)¶

6 Store models¶

7 Material sources/more resources:¶