Text preprocessing
Text is a kind of sequential data: an article can be regarded as a sequence of characters or words.
Preprocessing usually consists of four steps:
- Read in text
English novel: taking The Time Machine as an example, we walk through the steps of text preprocessing.
```python
import collections
import re

def read_time_machine():
    with open('/home/kesci/input/timemachine7163/timemachine.txt', 'r') as f:
        # Process one line at a time: strip() removes leading/trailing whitespace,
        # lower() converts upper case to lower case, and re.sub (regular-expression
        # substitution) replaces every run of non-letter characters with a space.
        lines = [re.sub('[^a-z]+', ' ', line.strip().lower()) for line in f]
    return lines  # lines is a list of cleaned sentences

lines = read_time_machine()
print('# sentences %d' % len(lines))
```
- Tokenization
```python
def tokenize(sentences, token='word'):
    """Split sentences into word or char tokens.

    sentences is a list of strings (one per sentence);
    token indicates the level of tokenization ('word' or 'char').
    """
    if token == 'word':
        # Word-level tokenization: split each sentence on spaces
        return [sentence.split(' ') for sentence in sentences]
    elif token == 'char':
        # Character-level tokenization: turn each sentence into a list of characters
        return [list(sentence) for sentence in sentences]
    else:
        print('ERROR: unknown token type ' + token)

tokens = tokenize(lines)
# tokenize returns a two-dimensional list: the first dimension indexes the sentences,
# the second dimension is the sequence of words (or characters) of each sentence
tokens[0:2]
```
- Build a dictionary to map each word to a unique index
```python
class Vocab(object):
    """Map each word in the corpus to a unique index, and back again."""
    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        # tokens: the return value of tokenize(), a two-dimensional list of all words in the corpus
        # min_freq: threshold; words that occur fewer than min_freq times are ignored
        # use_special_tokens: whether to add special tokens (pad, bos, eos, unk) to the dictionary
        counter = count_corpus(tokens)  # count word frequencies: {word: frequency}
        self.token_freqs = list(counter.items())  # deduplicated words with their frequencies
        self.idx_to_token = []  # list of tokens kept in the final vocabulary
        if use_special_tokens:
            # pad: padding token appended to short sentences so all sentences in a batch have the same length
            # bos: begin-of-sentence token, added at the start of a sentence
            # eos: end-of-sentence token, added at the end of a sentence
            # unk: unknown token for words that never appear in the corpus
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            self.idx_to_token += ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            self.idx_to_token += ['<unk>']
        # After handling the special tokens, add the corpus words whose frequency is at least
        # min_freq and which are not already in idx_to_token
        self.idx_to_token += [token for token, freq in self.token_freqs
                              if freq >= min_freq and token not in self.idx_to_token]
        # idx_to_token is a list, so it naturally maps index -> token (the position of a word is its index);
        # token_to_idx is the reverse mapping, stored as a dictionary
        self.token_to_idx = dict()
        for idx, token in enumerate(self.idx_to_token):
            self.token_to_idx[token] = idx

    def __len__(self):
        # Dictionary size
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        # Token(s) -> index(es); tokens can be a string or a list/tuple of strings.
        # Unknown tokens map to self.unk.
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        # Index(es) -> token(s)
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

def count_corpus(sentences):
    # Count word frequencies; sentences is the two-dimensional token list from tokenize()
    tokens = [tk for st in sentences for tk in st]  # flatten into a one-dimensional list
    return collections.Counter(tokens)  # a Counter mapping each word to its number of occurrences
```
```python
# Example: build the dictionary with The Time Machine as the corpus
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[0:10])
```
- Transform the text from a sequence of words into a sequence of indices, which is convenient to feed into models
```python
# Convert words to indices
for i in range(8, 10):
    print('words:', tokens[i])           # tokens[i] is the word sequence of line i after tokenization
    print('indices:', vocab[tokens[i]])  # index vocab with tokens[i] to get the index of each word
```
```python
# Tokenization with existing tools
text = "Mr. Chen doesn't agree with my suggestion."

# spaCy
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
print([token.text for token in doc])

# NLTK
from nltk.tokenize import word_tokenize
from nltk import data
data.path.append('/home/kesci/input/nltk_data3784/nltk_data')
print(word_tokenize(text))
```
Learning notes
Build the dictionary and set a frequency threshold
Special tokens
1. Count word frequencies
2. Add or drop tokens, starting from an empty list
pad: sentences in a batch have different lengths, so short sentences are padded with the pad token to form a rectangular two-dimensional matrix
bos: start-of-sentence token
eos: end-of-sentence token
unk: out-of-vocabulary words are mapped to unk
3. Map words to indices
To build a dictionary:
The main function of the dictionary is to map each word to a unique index number. It mainly constructs an idx_to_token list to store all the words and a token_to_idx dictionary to store the index of each word.
The implementation process is as follows:
- The corpus is tokenized to generate a token list, which contains the tokenization result of the corpus
- Count the frequency of the tokens and then build the dictionary according to these frequencies (counting the frequencies also deduplicates the tokens, while retaining the counts for later use)
Some terms used in the video:
1. pad is used to fill in short samples (sentences) when training with mini-batches of samples of different lengths, so that every sample in a batch has the same length.
2. bos (begin of sentence) and eos (end of sentence) mark the beginning and end of a sentence.
3. unk marks words that never appear in the training corpus (out-of-vocabulary words); in the code, words with extremely low frequency can also be mapped to this category.
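A minimal sketch of how these special tokens could be used, assuming the Vocab class above with use_special_tokens=True; the padding helper pad_sentence below is my own illustration, not part of the course code:

```python
# Build a vocabulary that includes the special tokens pad/bos/eos/unk
vocab_sp = Vocab(tokens, min_freq=2, use_special_tokens=True)

def pad_sentence(indices, max_len, pad_idx):
    # Hypothetical helper: truncate or right-pad an index sequence to max_len
    return indices[:max_len] + [pad_idx] * max(0, max_len - len(indices))

sample = tokens[8]                   # one tokenized sentence
indices = vocab_sp[sample]           # words -> indices; rare or unseen words map to vocab_sp.unk
padded = pad_sentence(indices, 10, vocab_sp.pad)
print(indices)
print(padded)                        # same length (10) for every sentence in a batch
print(vocab_sp.to_tokens(padded))    # indices -> tokens; padding shows up as the pad token
```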
Language model and data set
Language model data set
```python
# Read the dataset
with open('/home/kesci/input/jaychou_lyrics4703/jaychou_lyrics.txt') as f:
    corpus_chars = f.read()
print(len(corpus_chars))
print(corpus_chars[: 40])
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
corpus_chars = corpus_chars[: 10000]
```
```python
# Build the character index
idx_to_char = list(set(corpus_chars))  # deduplicate to get the index-to-character mapping
char_to_idx = {char: i for i, char in enumerate(idx_to_char)}  # character-to-index mapping
vocab_size = len(char_to_idx)
print(vocab_size)

corpus_indices = [char_to_idx[char] for char in corpus_chars]  # turn each character into an index
sample = corpus_indices[: 20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)

def load_data_jay_lyrics():
    with open('/home/kesci/input/jaychou_lyrics4703/jaychou_lyrics.txt') as f:
        corpus_chars = f.read()
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    corpus_chars = corpus_chars[0:10000]
    idx_to_char = list(set(corpus_chars))
    char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
    vocab_size = len(char_to_idx)
    corpus_indices = [char_to_idx[char] for char in corpus_chars]
    return corpus_indices, char_to_idx, idx_to_char, vocab_size
```
Sampling of time-series data: (1) random sampling; (2) adjacent sampling.
```python
# Random sampling
import torch
import random

def data_iter_random(corpus_indices, batch_size, num_steps, device=None):
    # Minus 1 because, for a sequence of length n, X contains at most the first n - 1 characters
    num_examples = (len(corpus_indices) - 1) // num_steps  # number of non-overlapping samples (rounded down)
    example_indices = [i * num_steps for i in range(num_examples)]  # start index of each sample in corpus_indices
    random.shuffle(example_indices)

    def _data(i):
        # Return the sequence of num_steps characters starting at position i
        return corpus_indices[i: i + num_steps]

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for i in range(0, num_examples, batch_size):
        # Each iteration selects batch_size random samples
        batch_indices = example_indices[i: i + batch_size]  # start index of each sample in the current batch
        X = [_data(j) for j in batch_indices]
        Y = [_data(j + 1) for j in batch_indices]
        yield torch.tensor(X, device=device), torch.tensor(Y, device=device)

# Test random sampling
my_seq = list(range(30))
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y, '\n')
```
```python
# Adjacent (consecutive) sampling
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    corpus_len = len(corpus_indices) // batch_size * batch_size  # length of the sequence actually kept
    corpus_indices = corpus_indices[: corpus_len]  # keep only the first corpus_len characters
    indices = torch.tensor(corpus_indices, device=device)
    indices = indices.view(batch_size, -1)  # reshape into (batch_size, corpus_len // batch_size)
    batch_num = (indices.shape[1] - 1) // num_steps
    for i in range(batch_num):
        i = i * num_steps
        X = indices[:, i: i + num_steps]
        Y = indices[:, i + 1: i + num_steps + 1]
        yield X, Y

# Test adjacent sampling
for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y, '\n')
```
Recurrent neural network from scratch
```python
import torch
import torch.nn as nn
import time
import math
import sys
sys.path.append("/home/kesci/input")
import d2l_jay9460 as d2l

(corpus_indices, char_to_idx, idx_to_char, vocab_size) = d2l.load_data_jay_lyrics()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
```
```python
# One-hot vectors
def one_hot(x, n_class, dtype=torch.float32):
    # x is a 1-D tensor of indices; the result has shape (n, n_class)
    result = torch.zeros(x.shape[0], n_class, dtype=dtype, device=x.device)
    result.scatter_(1, x.long().view(-1, 1), 1)  # result[i, x[i]] = 1
    return result

x = torch.tensor([0, 2])
x_one_hot = one_hot(x, vocab_size)
print(x_one_hot)
print(x_one_hot.shape)
print(x_one_hot.sum(axis=1))

def to_onehot(X, n_class):
    # X has shape (batch_size, num_steps); returns num_steps matrices of shape (batch_size, n_class)
    return [one_hot(X[:, i], n_class) for i in range(X.shape[1])]

X = torch.arange(10).view(2, 5)
inputs = to_onehot(X, vocab_size)
print(len(inputs), inputs[0].shape)
```
```python
# Initialize model parameters
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size
# num_inputs: d, the input dimension (equal to vocab_size)
# num_hiddens: h, the number of hidden units, a hyperparameter
# num_outputs: q, the output dimension (equal to vocab_size)

def get_params():
    def _one(shape):
        param = torch.zeros(shape, device=device, dtype=torch.float32)
        nn.init.normal_(param, 0, 0.01)
        return torch.nn.Parameter(param)

    # Hidden layer parameters
    W_xh = _one((num_inputs, num_hiddens))
    W_hh = _one((num_hiddens, num_hiddens))
    b_h = torch.nn.Parameter(torch.zeros(num_hiddens, device=device))
    # Output layer parameters
    W_hq = _one((num_hiddens, num_outputs))
    b_q = torch.nn.Parameter(torch.zeros(num_outputs, device=device))
    return (W_xh, W_hh, b_h, W_hq, b_q)
```
```python
# Define the model
# The rnn function computes each time step of the recurrent neural network in a loop.
def rnn(inputs, state, params):
    # inputs and outputs are both lists of num_steps matrices with shape (batch_size, vocab_size)
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        H = torch.tanh(torch.matmul(X, W_xh) + torch.matmul(H, W_hh) + b_h)
        Y = torch.matmul(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H,)

# init_rnn_state initializes the hidden state; the return value is a tuple.
def init_rnn_state(batch_size, num_hiddens, device):
    return (torch.zeros((batch_size, num_hiddens), device=device), )

# A quick test
print(num_hiddens)
print(vocab_size)
state = init_rnn_state(X.shape[0], num_hiddens, device)
inputs = to_onehot(X.to(device), vocab_size)
params = get_params()
outputs, state_new = rnn(inputs, state, params)
print(len(inputs), inputs[0].shape)
print(len(outputs), outputs[0].shape)
print(len(state), state[0].shape)
print(len(state_new), state_new[0].shape)
```
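For reference, the per-time-step computation performed by the rnn function above can be written as follows (this is just a summary of the code, using the same parameter names):

$$
\boldsymbol{H}_t = \tanh(\boldsymbol{X}_t \boldsymbol{W}_{xh} + \boldsymbol{H}_{t-1} \boldsymbol{W}_{hh} + \boldsymbol{b}_h), \qquad
\boldsymbol{O}_t = \boldsymbol{H}_t \boldsymbol{W}_{hq} + \boldsymbol{b}_q
$$

where $\boldsymbol{X}_t$ has shape (batch_size, vocab_size), $\boldsymbol{H}_t$ has shape (batch_size, num_hiddens), and $\boldsymbol{O}_t$ has shape (batch_size, vocab_size).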
```python
# Gradient clipping
def grad_clipping(params, theta, device):
    norm = torch.tensor([0.0], device=device)
    for param in params:
        norm += (param.grad.data ** 2).sum()
    norm = norm.sqrt().item()
    if norm > theta:
        for param in params:
            param.grad.data *= (theta / norm)
```
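In formula form, grad_clipping rescales the concatenated gradient $\boldsymbol{g}$ of all parameters so that its L2 norm never exceeds the threshold $\theta$ (this is a restatement of what the code does, not an extra step):

$$
\boldsymbol{g} \leftarrow \min\!\left(\frac{\theta}{\|\boldsymbol{g}\|}, 1\right) \boldsymbol{g}
$$

The clipped gradient therefore always has norm at most $\theta$, which helps mitigate exploding gradients when training RNNs.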
```python
# Define the prediction function
def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state,
                num_hiddens, vocab_size, device, idx_to_char, char_to_idx):
    state = init_rnn_state(1, num_hiddens, device)
    output = [char_to_idx[prefix[0]]]  # output records the prefix plus the num_chars predicted characters
    for t in range(num_chars + len(prefix) - 1):
        # Use the output of the previous time step as the input of the current time step
        X = to_onehot(torch.tensor([[output[-1]]], device=device), vocab_size)
        # Compute the output and update the hidden state
        (Y, state) = rnn(X, state, params)
        # The input of the next time step is either the next character of the prefix
        # or the current best prediction
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(Y[0].argmax(dim=1).item())
    return ''.join([idx_to_char[i] for i in output])

# Test the prediction function
predict_rnn('Separate', 10, rnn, params, init_rnn_state, num_hiddens, vocab_size,
            device, idx_to_char, char_to_idx)
```
```python
# Define the model training function
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter:  # with adjacent sampling, initialize the hidden state at the start of the epoch
            state = init_rnn_state(batch_size, num_hiddens, device)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter:
            if is_random_iter:  # with random sampling, initialize the hidden state before each mini-batch
                state = init_rnn_state(batch_size, num_hiddens, device)
            else:  # otherwise, detach the hidden state from the computation graph
                for s in state:
                    s.detach_()
            # inputs is a list of num_steps matrices with shape (batch_size, vocab_size)
            inputs = to_onehot(X, vocab_size)
            # outputs is a list of num_steps matrices with shape (batch_size, vocab_size)
            (outputs, state) = rnn(inputs, state, params)
            # After concatenation the shape is (num_steps * batch_size, vocab_size)
            outputs = torch.cat(outputs, dim=0)
            # Y has shape (batch_size, num_steps); transpose and flatten it into a vector of length
            # num_steps * batch_size so that it lines up with the rows of outputs
            y = torch.flatten(Y.T)
            # Use cross-entropy loss to compute the average classification error
            l = loss(outputs, y.long())

            # Zero the gradients
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            grad_clipping(params, clipping_theta, device)  # clip the gradient
            d2l.sgd(params, lr, 1)  # the loss is already averaged, so the gradient need not be averaged again
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                                        num_hiddens, vocab_size, device, idx_to_char, char_to_idx))
```
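A note on the perplexity printed above, summarizing what the code computes: l_sum accumulates the summed cross-entropy loss and n the number of predicted characters, so `math.exp(l_sum / n)` is the exponential of the average per-character cross-entropy loss,

$$
\text{perplexity} = \exp\!\left(\frac{1}{n}\sum_{i=1}^{n} \ell_i\right),
$$

where $\ell_i$ is the cross-entropy loss of the $i$-th predicted character. A lower perplexity means the model assigns higher probability to the true next character.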
```python
# Train the model and generate lyrics
num_epochs, num_steps, batch_size, lr, clipping_theta = 250, 35, 32, 1e2, 1e-2
pred_period, pred_len, prefixes = 50, 50, ['Separate', 'No separation']

# Train with random sampling and generate lyrics
train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens, vocab_size,
                      device, corpus_indices, idx_to_char, char_to_idx, True,
                      num_epochs, num_steps, lr, clipping_theta, batch_size,
                      pred_period, pred_len, prefixes)

# Train with adjacent sampling and generate lyrics
train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens, vocab_size,
                      device, corpus_indices, idx_to_char, char_to_idx, False,
                      num_epochs, num_steps, lr, clipping_theta, batch_size,
                      pred_period, pred_len, prefixes)
```
Concise implementation of a recurrent neural network
```python
# Define the model
rnn_layer = nn.RNN(input_size=vocab_size, hidden_size=num_hiddens)  # the built-in RNN layer wrapped below

class RNNModel(nn.Module):
    def __init__(self, rnn_layer, vocab_size):
        super(RNNModel, self).__init__()
        self.rnn = rnn_layer
        self.hidden_size = rnn_layer.hidden_size * (2 if rnn_layer.bidirectional else 1)
        self.vocab_size = vocab_size
        self.dense = nn.Linear(self.hidden_size, vocab_size)

    def forward(self, inputs, state):
        # inputs.shape: (batch_size, num_steps)
        X = to_onehot(inputs, vocab_size)
        X = torch.stack(X)  # X.shape: (num_steps, batch_size, vocab_size)
        hiddens, state = self.rnn(X, state)
        hiddens = hiddens.view(-1, hiddens.shape[-1])  # hiddens.shape: (num_steps * batch_size, hidden_size)
        output = self.dense(hiddens)
        return output, state
```
```python
# Define the prediction function
def predict_rnn_pytorch(prefix, num_chars, model, vocab_size, device,
                        idx_to_char, char_to_idx):
    state = None
    output = [char_to_idx[prefix[0]]]  # output records the prefix plus the num_chars predicted characters
    for t in range(num_chars + len(prefix) - 1):
        X = torch.tensor([output[-1]], device=device).view(1, 1)
        (Y, state) = model(X, state)  # forward pass; no need to pass model parameters explicitly
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(Y.argmax(dim=1).item())
    return ''.join([idx_to_char[i] for i in output])
```
```python
# Predict once with a randomly initialized model
model = RNNModel(rnn_layer, vocab_size).to(device)
predict_rnn_pytorch('Separate', 10, model, vocab_size, device, idx_to_char, char_to_idx)
```
```python
# Training function using adjacent sampling
def train_and_predict_rnn_pytorch(model, num_hiddens, vocab_size, device,
                                  corpus_indices, idx_to_char, char_to_idx,
                                  num_epochs, num_steps, lr, clipping_theta,
                                  batch_size, pred_period, pred_len, prefixes):
    loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.to(device)
    for epoch in range(num_epochs):
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = d2l.data_iter_consecutive(corpus_indices, batch_size, num_steps, device)  # adjacent sampling
        state = None
        for X, Y in data_iter:
            if state is not None:
                # Detach the hidden state from the computation graph
                if isinstance(state, tuple):  # LSTM: state is (h, c)
                    state[0].detach_()
                    state[1].detach_()
                else:
                    state.detach_()
            (output, state) = model(X, state)  # output.shape: (num_steps * batch_size, vocab_size)
            y = torch.flatten(Y.T)
            l = loss(output, y.long())

            optimizer.zero_grad()
            l.backward()
            grad_clipping(model.parameters(), clipping_theta, device)
            optimizer.step()
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn_pytorch(
                    prefix, pred_len, model, vocab_size, device, idx_to_char, char_to_idx))
```
```python
# Train the model
num_epochs, batch_size, lr, clipping_theta = 250, 32, 1e-3, 1e-2
pred_period, pred_len, prefixes = 50, 50, ['Separate', 'No separation']
train_and_predict_rnn_pytorch(model, num_hiddens, vocab_size, device,
                              corpus_indices, idx_to_char, char_to_idx,
                              num_epochs, num_steps, lr, clipping_theta,
                              batch_size, pred_period, pred_len, prefixes)
```