Class Notes

Bigram Model:

import torch

words = open('.\\makemore\\names.txt', 'r').read().splitlines()

# use '.' as special token
N = torch.zeros((27, 27), dtype=torch.int32)

# get 'a' - 'z' chars
chars = sorted(list(set(''.join(words))))
# encode 1-26
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
# decode 'a'-'z'
itos = {i:s for s,i in stoi.items()}

# count how many times each bigram occurs
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1
    
# visualize
import matplotlib.pyplot as plt
# %matplotlib inline

plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
for i in range(27):
    for j in range(27):
        chstr = itos[i] + itos[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color='gray')
plt.axis('off');

[Figure: 27x27 heatmap of the bigram count matrix N, each cell labeled with its character pair and count]
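
The sampling loop at the end of these notes indexes a probability matrix `P` built from these counts. Following the `P=(N+1).float()` smoothing mentioned in the MLP comments below, a minimal sketch:

# add-one smoothing so no bigram has zero probability, then row-normalize
P = (N + 1).float()
P /= P.sum(1, keepdim=True) # P[i] is the distribution over the next char given char i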

One-layer MLP

# Together

import torch
import torch.nn.functional as F

# create the training set of bigrams (x, y)
words = open('.\\makemore\\names.txt', 'r').read().splitlines()
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        # print(f'{ch1, ch2}')
        xs.append(ix1)
        ys.append(ix2)
        
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print(f"number of egs: {num}")

# randomly initialize 27 neurons' weights. each neuron receives 27 inputs
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)
# W = zeros => probs = 1/27.0
# B = torch.randn((1, 27), generator=g, requires_grad=True)

# gradient descent
for k in range(100):
    # forward pass
    xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
    logits = xenc @ W # 2.5764
    # print(logits.shape)
    # logits = xenc @ W + B # 2.5226
    counts = logits.exp() # counts, equivalent to N
    probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
    # pluck out the probability assigned to the correct next char for each example: probs[0, 5], probs[1, 13], probs[2, 13], probs[3, 1], probs[4, 0], ...
    # loss = -probs[torch.arange(num), ys].log().mean() # 2.5764
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean() # 2.5108
    # the W**2 term is regularization; it smooths the model the same way the +1 does in `P=(N+1).float()`
    
    print(f"itr {k}, forward pass loss: {loss.item():.4f}")

    # backward pass
    W.grad = None # set to None the gradient from previous step. good practice.
    # B.grad = None
    loss.backward()
    # print(f"backward pass W.grad: {W.grad.shape}")

    # update 
    W.data += -50 * W.grad 
    # B.data += -10 * B.grad
    # print(f"update W: {W.data.shape}")
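
Because `xenc` is one-hot, `xenc @ W` simply selects row `ix` of `W`, so `W.exp()` plays the role of a learned count matrix. A small sketch (not in the original notes) comparing the trained net's probabilities to the count-based `P` from the bigram section:

# exp(W) acts like a learned count matrix; row-normalizing gives the net's bigram probabilities
Wexp = W.exp()
Pnn = Wexp / Wexp.sum(1, keepdim=True)
print(f"max |Pnn - P|: {(Pnn - P).abs().max().item():.4f}") # the two models end up close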


# finally, sample from both the count-based ('Bigram') model and the 'neural net' (MLP) model
g = torch.Generator().manual_seed(2147483647)
for flag in ["Bigram", "MLP"]:
    if flag == "Bigram":
        print("Bigram: ---------------")
    else:
        print("MLP: ----------------")
    
    for i in range(5):
        out = []
        ix = 0
        while True:
            # ----------
            # BEFORE:
            if flag == "Bigram":
                p = P[ix]
            # ----------
            # NOW:
            else:
                xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
                logits = xenc @ W # predict log-counts
                counts = logits.exp() # counts, equivalent to N
                p = counts / counts.sum(1, keepdims=True) # probabilities for next character
            # ----------

            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            out.append(itos[ix])
            if ix == 0:
                break

        print(''.join(out))
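
For reference, the same average negative log likelihood can be computed for the count-based model on the (xs, ys) bigram dataset built above; a sketch, assuming the smoothed `P` from the bigram section:

# average negative log likelihood of the count-based model over all bigram examples
nll = -P[xs, ys].log().mean()
print(f"count-based model loss: {nll.item():.4f}")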