import torch

words = open('.\\makemore\\names.txt', 'r').read().splitlines()
# use '.' as a special start/end token
N = torch.zeros((27, 27), dtype=torch.int32)
# collect the 'a'-'z' characters present in the dataset
chars = sorted(list(set(''.join(words))))
# encode 'a'-'z' as 1-26
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
# decode indices back to characters
itos = {i:s for s,i in stoi.items()}
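# quick sanity check on the mapping (a small sketch, assuming names.txt contains only lowercase a-z):
assert stoi['a'] == 1 and itos[1] == 'a' and itos[stoi['.']] == '.'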
# count bigram occurrences
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1
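# as a quick check of the counts (an illustrative sketch, not in the original):
# list the most frequent bigrams stored in N
top = sorted(((N[i, j].item(), itos[i] + itos[j])
              for i in range(27) for j in range(27)), reverse=True)
print(top[:5])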
# visualize
import matplotlib.pyplot as plt
# %matplotlib inline
plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
for i in range(27):
    for j in range(27):
        chstr = itos[i] + itos[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color='gray')
plt.axis('off');
# generate words with the bigram model
g = torch.Generator().manual_seed(2147483647)
for i in range(10):
    out = []
    ix = 0  # start at '.'
    while True:
        # the trained counts make samples more name-like
        p = N[ix].float()
        p /= p.sum()
        # to compare, try a uniform distribution:
        # p = torch.ones(27) / 27.0
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        # print(itos[ix])
        out.append(itos[ix])
        if ix == 0: break  # end at '.'
    print(''.join(out))
# more efficient: precompute the row-normalized probability matrix
g = torch.Generator().manual_seed(2147483647)
P = N.float()
P /= P.sum(1, keepdim=True)  # normalize each row to sum to 1
# torch broadcasting: https://pytorch.org/docs/stable/notes/broadcasting.html
# [27, 27] /= [27, 1] -> [27, 27]
# >>> x=torch.empty(5,1,4,1)
# >>> y=torch.empty( 3,1,1)
# >>> (x+y).size()
# torch.Size([5, 3, 4, 1])
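# sanity check on the normalization direction (a quick sketch, not in the original):
# every row of P should now sum to 1
assert torch.allclose(P.sum(1), torch.ones(27))
# note: without keepdim, P.sum(1) has shape [27]; broadcasting would align it with
# the last dimension, dividing P[i, j] by row j's sum instead of row i's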
for i in range(10):
    out = []
    ix = 0  # start at '.'
    while True:
        p = P[ix]
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        # print(itos[ix])
        out.append(itos[ix])
        if ix == 0: break  # end at '.'
    print(''.join(out))
# evaluate model quality: negative log-likelihood loss
# log(a*b*c) = log(a) + log(b) + log(c)
log_likelihood = 0.0
n = 0
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        prob = P[ix1, ix2]
        log_prob = torch.log(prob)
        log_likelihood += log_prob
        n += 1
        # print(f'{ch1}{ch2}: {prob:.4f}, {log_prob:.4f}')
nll = -log_likelihood
print(f'{log_likelihood=}')
print(f'{nll=}')
print(f'{nll/n=}')
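# a quick numeric check of the log identity used above (illustrative, not in the original):
import math
a, b, c = 0.2, 0.5, 0.1
assert abs(math.log(a*b*c) - (math.log(a) + math.log(b) + math.log(c))) < 1e-12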
# problem: an unseen bigram like 'jq' has count 0, so log(0) = -inf
# fix: model smoothing to avoid an infinite loss
g = torch.Generator().manual_seed(2147483647)
P = (N+1).float()  # add-one smoothing: no probability is exactly zero
P /= P.sum(1, keepdim=True)  # normalize each row to sum to 1
log_likelihood = 0.0
n = 0
for w in ['andjq']:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        prob = P[ix1, ix2]
        log_prob = torch.log(prob)
        log_likelihood += log_prob
        n += 1
        print(f'{ch1}{ch2}: {prob:.4f}, {log_prob:.4f}')
nll = -log_likelihood
print(f'{log_likelihood=}')
print(f'{nll=}')
print(f'{nll/n=}')
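# the added count is a knob (an illustrative sketch, not in the original):
# larger fake counts push P further toward the uniform distribution
P_heavy = (N + 100).float()
P_heavy /= P_heavy.sum(1, keepdim=True)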
# Together
import torch
import torch.nn.functional as F
# create the training set of bigrams (x, y)
words = open('.\\makemore\\names.txt', 'r').read().splitlines()
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        # print(f'{ch1, ch2}')
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print(f"number of egs: {num}")
# randomly initialize 27 neurons' weights. each neuron receives 27 inputs
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)
# if W were all zeros, every logit would be 0 and probs would be uniform (1/27)
# B = torch.randn((1, 27), generator=g, requires_grad=True)
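# quick check of the claim above (illustrative, not in the original):
z = torch.zeros(1, 27)
p0 = z.exp() / z.exp().sum(1, keepdim=True)
print(p0[0, 0].item())  # 1/27 ≈ 0.0370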
# gradient descent
for k in range(100):
    # forward pass
    xenc = F.one_hot(xs, num_classes=27).float()  # input to the network: one-hot encoding
    logits = xenc @ W  # final loss ~2.5764 without bias
    # print(logits.shape)
    # logits = xenc @ W + B  # with bias: ~2.5226
    counts = logits.exp()  # counts, equivalent to N
    probs = counts / counts.sum(1, keepdim=True)  # probabilities for the next character
    # pick out probs[0, 5], probs[1, 13], probs[2, 13], probs[3, 1], probs[4, 0], ...
    # i.e. each example's predicted probability of its true next character
    # loss = -probs[torch.arange(num), ys].log().mean()  # 2.5764
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()  # 2.5108
    # the W**2 regularization smooths the model, analogous to the +1 in P = (N+1).float()
    print(f"itr {k}, forward pass loss: {loss.item():.4f}")
    # backward pass
    W.grad = None  # reset the gradient from the previous step. good practice.
    # B.grad = None
    loss.backward()
    # print(f"backward pass W.grad: {W.grad.shape}")
    # update
    W.data += -50 * W.grad
    # B.data += -10 * B.grad
    # print(f"update W: {W.data.shape}")
# finally, sample from both models: the count-based bigram and the neural net
g = torch.Generator().manual_seed(2147483647)
for flag in ["Bigram", "MLP"]:
    if flag == "Bigram":
        print("Bigram: ---------------")
    else:
        print("MLP: ----------------")
    for i in range(5):
        out = []
        ix = 0
        while True:
            # ----------
            # BEFORE: counting model
            if flag == "Bigram":
                p = P[ix]
            # ----------
            # NOW: neural net
            else:
                xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
                logits = xenc @ W  # predict log-counts
                counts = logits.exp()  # counts, equivalent to N
                p = counts / counts.sum(1, keepdim=True)  # probabilities for the next character
            # ----------
            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            out.append(itos[ix])
            if ix == 0:
                break
        print(''.join(out))
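# a closing sanity check (an illustrative sketch, not in the original): after
# training, the neural net's distribution for the start token should resemble
# the counting model's smoothed row P[0]
xenc = F.one_hot(torch.tensor([0]), num_classes=27).float()
pn = (xenc @ W).exp()
pn = pn / pn.sum(1, keepdim=True)
print((pn[0] - P[0]).abs().max().item())  # expected to be small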