Class Notes

Bigram Model:

import torch

words = open('.\\makemore\\names.txt', 'r').read().splitlines()

# use '.' as special token
N = torch.zeros((27, 27), dtype=torch.int32)

# get 'a' - 'z' chars
chars = sorted(list(set(''.join(words))))
# encode 1-26
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
# decode 'a'-'z'
itos = {i:s for s,i in stoi.items()}

# count how many times each bigram occurs
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1
    
# visualize
import matplotlib.pyplot as plt
# %matplotlib inline

plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
for i in range(27):
    for j in range(27):
        chstr = itos[i] + itos[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color='gray')
plt.axis('off');

[Figure: 27x27 heatmap of the bigram count matrix N, each cell labeled with its character pair and count]
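
The sampling loop at the end of these notes indexes a probability matrix `P` built from these counts. Following the `P=(N+1).float()` smoothing mentioned in the MLP comments below, a minimal sketch:

# add-one smoothing so no bigram has zero probability, then row-normalize
P = (N + 1).float()
P /= P.sum(1, keepdim=True) # P[i] is the distribution over the next char given char i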

One-layer MLP

# Together

import torch
import torch.nn.functional as F

# create the training set of bigrams (x, y)
words = open('.\\makemore\\names.txt', 'r').read().splitlines()
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        # print(f'{ch1, ch2}')
        xs.append(ix1)
        ys.append(ix2)
        
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print(f"number of egs: {num}")

# randomly initialize 27 neurons' weights. each neuron receives 27 inputs
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)
# W = zeros => probs = 1/27.0
# B = torch.randn((1, 27), generator=g, requires_grad=True)

# gradient descent
for k in range(100):
    # forward pass
    xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
    logits = xenc @ W # 2.5764
    # print(logits.shape)
    # logits = xenc @ W + B # 2.5226
    counts = logits.exp() # counts, equivalent to N
    probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
    # pluck out the probability assigned to the correct next char for each example: probs[0, 5], probs[1, 13], probs[2, 13], probs[3, 1], probs[4, 0], ...
    # loss = -probs[torch.arange(num), ys].log().mean() # 2.5764
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean() # 2.5108
    # the W**2 term is regularization; it smooths the model the same way the +1 does in `P=(N+1).float()`
    
    print(f"itr {k}, forward pass loss: {loss.item():.4f}")

    # backward pass
    W.grad = None # set to None the gradient from previous step. good practice.
    # B.grad = None
    loss.backward()
    # print(f"backward pass W.grad: {W.grad.shape}")

    # update 
    W.data += -50 * W.grad 
    # B.data += -10 * B.grad
    # print(f"update W: {W.data.shape}")
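
Because `xenc` is one-hot, `xenc @ W` simply selects row `ix` of `W`, so `W.exp()` plays the role of a learned count matrix. A small sketch (not in the original notes) comparing the trained net's probabilities to the count-based `P` from the bigram section:

# exp(W) acts like a learned count matrix; row-normalizing gives the net's bigram probabilities
Wexp = W.exp()
Pnn = Wexp / Wexp.sum(1, keepdim=True)
print(f"max |Pnn - P|: {(Pnn - P).abs().max().item():.4f}") # the two models end up close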


# finally, sample from both the count-based ('Bigram') model and the 'neural net' (MLP) model
g = torch.Generator().manual_seed(2147483647)
for flag in ["Bigram", "MLP"]:
    if flag == "Bigram":
        print("Bigram: ---------------")
    else:
        print("MLP: ----------------")
    
    for i in range(5):
        out = []
        ix = 0
        while True:
            # ----------
            # BEFORE:
            if flag == "Bigram":
                p = P[ix]
            # ----------
            # NOW:
            else:
                xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
                logits = xenc @ W # predict log-counts
                counts = logits.exp() # counts, equivalent to N
                p = counts / counts.sum(1, keepdims=True) # probabilities for next character
            # ----------

            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            out.append(itos[ix])
            if ix == 0:
                break

        print(''.join(out))
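
For reference, the same average negative log likelihood can be computed for the count-based model on the (xs, ys) bigram dataset built above; a sketch, assuming the smoothed `P` from the bigram section:

# average negative log likelihood of the count-based model over all bigram examples
nll = -P[xs, ys].log().mean()
print(f"count-based model loss: {nll.item():.4f}")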