Text preprocessing
Text is a kind of sequential data: an article can be regarded as a sequence of characters or words.
Preprocessing usually consists of four steps:
- Read in text
English novel: taking The Time Machine as an example, we walk through the steps of text preprocessing.
```python
import collections
import re

def read_time_machine():
    with open('/home/kesci/input/timemachine7163/timemachine.txt', 'r') as f:
        # Process one line at a time: strip() removes leading/trailing whitespace,
        # lower() converts upper case to lower case, and re.sub (regular-expression
        # substitution) replaces every run of non-letter characters with a space.
        lines = [re.sub('[^a-z]+', ' ', line.strip().lower()) for line in f]
    return lines  # lines is a list of cleaned sentences

lines = read_time_machine()
print('# sentences %d' % len(lines))
```
- Tokenization
```python
def tokenize(sentences, token='word'):
    """Split sentences into word or char tokens.

    sentences is a list of strings (one per sentence);
    token indicates the level of tokenization ('word' or 'char').
    """
    if token == 'word':
        # Word-level tokenization: split each sentence on spaces
        return [sentence.split(' ') for sentence in sentences]
    elif token == 'char':
        # Character-level tokenization: turn each sentence into a list of characters
        return [list(sentence) for sentence in sentences]
    else:
        print('ERROR: unknown token type ' + token)

tokens = tokenize(lines)
# tokenize returns a two-dimensional list: the first dimension indexes the sentences,
# the second dimension is the sequence of words (or characters) of each sentence
tokens[0:2]
```
- Build a dictionary to map each word to a unique index
```python
class Vocab(object):
    """Map each word in the corpus to a unique index, and back again."""
    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        # tokens: the return value of tokenize(), a two-dimensional list of all words in the corpus
        # min_freq: threshold; words that occur fewer than min_freq times are ignored
        # use_special_tokens: whether to add special tokens (pad, bos, eos, unk) to the dictionary
        counter = count_corpus(tokens)  # count word frequencies: {word: frequency}
        self.token_freqs = list(counter.items())  # deduplicated words with their frequencies
        self.idx_to_token = []  # list of tokens kept in the final vocabulary
        if use_special_tokens:
            # pad: padding token appended to short sentences so all sentences in a batch have the same length
            # bos: begin-of-sentence token, added at the start of a sentence
            # eos: end-of-sentence token, added at the end of a sentence
            # unk: unknown token for words that never appear in the corpus
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            self.idx_to_token += ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            self.idx_to_token += ['<unk>']
        # After handling the special tokens, add the corpus words whose frequency is at least
        # min_freq and which are not already in idx_to_token
        self.idx_to_token += [token for token, freq in self.token_freqs
                              if freq >= min_freq and token not in self.idx_to_token]
        # idx_to_token is a list, so it naturally maps index -> token (the position of a word is its index);
        # token_to_idx is the reverse mapping, stored as a dictionary
        self.token_to_idx = dict()
        for idx, token in enumerate(self.idx_to_token):
            self.token_to_idx[token] = idx

    def __len__(self):
        # Dictionary size
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        # Token(s) -> index(es); tokens can be a string or a list/tuple of strings.
        # Unknown tokens map to self.unk.
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        # Index(es) -> token(s)
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

def count_corpus(sentences):
    # Count word frequencies; sentences is the two-dimensional token list from tokenize()
    tokens = [tk for st in sentences for tk in st]  # flatten into a one-dimensional list
    return collections.Counter(tokens)  # a Counter mapping each word to its number of occurrences
```
```python
# Example: build the dictionary with The Time Machine as the corpus
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[0:10])
```
- Transform the text from a sequence of words into a sequence of indices, which is convenient to feed into models
```python
# Convert words to indices
for i in range(8, 10):
    print('words:', tokens[i])           # tokens[i] is the word sequence of line i after tokenization
    print('indices:', vocab[tokens[i]])  # index vocab with tokens[i] to get the index of each word
```
```python
# Tokenization with existing tools
text = "Mr. Chen doesn't agree with my suggestion."

# spaCy
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
print([token.text for token in doc])

# NLTK
from nltk.tokenize import word_tokenize
from nltk import data
data.path.append('/home/kesci/input/nltk_data3784/nltk_data')
print(word_tokenize(text))
```
Learning notes
Build the dictionary and set a frequency threshold
Special tokens
1. Count word frequencies
2. Add or drop tokens, starting from an empty list
pad: sentences in a batch have different lengths, so short sentences are padded with the pad token to form a rectangular two-dimensional matrix
bos: start-of-sentence token
eos: end-of-sentence token
unk: out-of-vocabulary words are mapped to unk
3. Map words to indices
To build a dictionary:
The main function of the dictionary is to map each word to a unique index number. It mainly constructs an idx_to_token list to store all the words and a token_to_idx dictionary to store the index of each word.
The implementation process is as follows:
- The corpus is tokenized to generate a token list, which contains the tokenization result of the corpus
- Count the frequency of the tokens and then build the dictionary according to these frequencies (counting the frequencies also deduplicates the tokens, while retaining the counts for later use)
Some terms used in the video:
1. pad is used to fill in short samples (sentences) when training with mini-batches of samples of different lengths, so that every sample in a batch has the same length.
2. bos (begin of sentence) and eos (end of sentence) mark the beginning and end of a sentence.
3. unk marks words that never appear in the training corpus (out-of-vocabulary words); in the code, words with extremely low frequency can also be mapped to this category.
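A minimal sketch of how these special tokens could be used, assuming the Vocab class above with use_special_tokens=True; the padding helper pad_sentence below is my own illustration, not part of the course code:

```python
# Build a vocabulary that includes the special tokens pad/bos/eos/unk
vocab_sp = Vocab(tokens, min_freq=2, use_special_tokens=True)

def pad_sentence(indices, max_len, pad_idx):
    # Hypothetical helper: truncate or right-pad an index sequence to max_len
    return indices[:max_len] + [pad_idx] * max(0, max_len - len(indices))

sample = tokens[8]                   # one tokenized sentence
indices = vocab_sp[sample]           # words -> indices; rare or unseen words map to vocab_sp.unk
padded = pad_sentence(indices, 10, vocab_sp.pad)
print(indices)
print(padded)                        # same length (10) for every sentence in a batch
print(vocab_sp.to_tokens(padded))    # indices -> tokens; padding shows up as the pad token
```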
Language model and data set
Language model data set
```python
# Read the dataset
with open('/home/kesci/input/jaychou_lyrics4703/jaychou_lyrics.txt') as f:
    corpus_chars = f.read()
print(len(corpus_chars))
print(corpus_chars[: 40])
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
corpus_chars = corpus_chars[: 10000]
```
```python
# Build the character index
idx_to_char = list(set(corpus_chars))  # deduplicate to get the index-to-character mapping
char_to_idx = {char: i for i, char in enumerate(idx_to_char)}  # character-to-index mapping
vocab_size = len(char_to_idx)
print(vocab_size)

corpus_indices = [char_to_idx[char] for char in corpus_chars]  # turn each character into an index
sample = corpus_indices[: 20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)

def load_data_jay_lyrics():
    with open('/home/kesci/input/jaychou_lyrics4703/jaychou_lyrics.txt') as f:
        corpus_chars = f.read()
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    corpus_chars = corpus_chars[0:10000]
    idx_to_char = list(set(corpus_chars))
    char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
    vocab_size = len(char_to_idx)
    corpus_indices = [char_to_idx[char] for char in corpus_chars]
    return corpus_indices, char_to_idx, idx_to_char, vocab_size
```
Sampling of time-series data: (1) random sampling; (2) adjacent sampling.
```python
# Random sampling
import torch
import random

def data_iter_random(corpus_indices, batch_size, num_steps, device=None):
    # Minus 1 because, for a sequence of length n, X contains at most the first n - 1 characters
    num_examples = (len(corpus_indices) - 1) // num_steps  # number of non-overlapping samples (rounded down)
    example_indices = [i * num_steps for i in range(num_examples)]  # start index of each sample in corpus_indices
    random.shuffle(example_indices)

    def _data(i):
        # Return the sequence of num_steps characters starting at position i
        return corpus_indices[i: i + num_steps]

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for i in range(0, num_examples, batch_size):
        # Each iteration selects batch_size random samples
        batch_indices = example_indices[i: i + batch_size]  # start index of each sample in the current batch
        X = [_data(j) for j in batch_indices]
        Y = [_data(j + 1) for j in batch_indices]
        yield torch.tensor(X, device=device), torch.tensor(Y, device=device)

# Test random sampling
my_seq = list(range(30))
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y, '\n')
```
```python
# Adjacent (consecutive) sampling
def data_iter_consecutive(corpus_indices, batch_size, num_steps, device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    corpus_len = len(corpus_indices) // batch_size * batch_size  # length of the sequence actually kept
    corpus_indices = corpus_indices[: corpus_len]  # keep only the first corpus_len characters
    indices = torch.tensor(corpus_indices, device=device)
    indices = indices.view(batch_size, -1)  # reshape into (batch_size, corpus_len // batch_size)
    batch_num = (indices.shape[1] - 1) // num_steps
    for i in range(batch_num):
        i = i * num_steps
        X = indices[:, i: i + num_steps]
        Y = indices[:, i + 1: i + num_steps + 1]
        yield X, Y

# Test adjacent sampling
for X, Y in data_iter_consecutive(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y, '\n')
```
Recurrent neural network from scratch
```python
import torch
import torch.nn as nn
import time
import math
import sys
sys.path.append("/home/kesci/input")
import d2l_jay9460 as d2l

(corpus_indices, char_to_idx, idx_to_char, vocab_size) = d2l.load_data_jay_lyrics()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
```
```python
# One-hot vectors
def one_hot(x, n_class, dtype=torch.float32):
    # x is a 1-D tensor of indices; the result has shape (n, n_class)
    result = torch.zeros(x.shape[0], n_class, dtype=dtype, device=x.device)
    result.scatter_(1, x.long().view(-1, 1), 1)  # result[i, x[i]] = 1
    return result

x = torch.tensor([0, 2])
x_one_hot = one_hot(x, vocab_size)
print(x_one_hot)
print(x_one_hot.shape)
print(x_one_hot.sum(axis=1))

def to_onehot(X, n_class):
    # X has shape (batch_size, num_steps); returns num_steps matrices of shape (batch_size, n_class)
    return [one_hot(X[:, i], n_class) for i in range(X.shape[1])]

X = torch.arange(10).view(2, 5)
inputs = to_onehot(X, vocab_size)
print(len(inputs), inputs[0].shape)
```
```python
# Initialize model parameters
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size
# num_inputs: d, the input dimension (equal to vocab_size)
# num_hiddens: h, the number of hidden units, a hyperparameter
# num_outputs: q, the output dimension (equal to vocab_size)

def get_params():
    def _one(shape):
        param = torch.zeros(shape, device=device, dtype=torch.float32)
        nn.init.normal_(param, 0, 0.01)
        return torch.nn.Parameter(param)

    # Hidden layer parameters
    W_xh = _one((num_inputs, num_hiddens))
    W_hh = _one((num_hiddens, num_hiddens))
    b_h = torch.nn.Parameter(torch.zeros(num_hiddens, device=device))
    # Output layer parameters
    W_hq = _one((num_hiddens, num_outputs))
    b_q = torch.nn.Parameter(torch.zeros(num_outputs, device=device))
    return (W_xh, W_hh, b_h, W_hq, b_q)
```
```python
# Define the model
# The rnn function computes each time step of the recurrent neural network in a loop.
def rnn(inputs, state, params):
    # inputs and outputs are both lists of num_steps matrices with shape (batch_size, vocab_size)
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        H = torch.tanh(torch.matmul(X, W_xh) + torch.matmul(H, W_hh) + b_h)
        Y = torch.matmul(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H,)

# init_rnn_state initializes the hidden state; the return value is a tuple.
def init_rnn_state(batch_size, num_hiddens, device):
    return (torch.zeros((batch_size, num_hiddens), device=device), )

# A quick test
print(num_hiddens)
print(vocab_size)
state = init_rnn_state(X.shape[0], num_hiddens, device)
inputs = to_onehot(X.to(device), vocab_size)
params = get_params()
outputs, state_new = rnn(inputs, state, params)
print(len(inputs), inputs[0].shape)
print(len(outputs), outputs[0].shape)
print(len(state), state[0].shape)
print(len(state_new), state_new[0].shape)
```
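For reference, the per-time-step computation performed by the rnn function above can be written as follows (this is just a summary of the code, using the same parameter names):

$$
\boldsymbol{H}_t = \tanh(\boldsymbol{X}_t \boldsymbol{W}_{xh} + \boldsymbol{H}_{t-1} \boldsymbol{W}_{hh} + \boldsymbol{b}_h), \qquad
\boldsymbol{O}_t = \boldsymbol{H}_t \boldsymbol{W}_{hq} + \boldsymbol{b}_q
$$

where $\boldsymbol{X}_t$ has shape (batch_size, vocab_size), $\boldsymbol{H}_t$ has shape (batch_size, num_hiddens), and $\boldsymbol{O}_t$ has shape (batch_size, vocab_size).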
```python
# Gradient clipping
def grad_clipping(params, theta, device):
    norm = torch.tensor([0.0], device=device)
    for param in params:
        norm += (param.grad.data ** 2).sum()
    norm = norm.sqrt().item()
    if norm > theta:
        for param in params:
            param.grad.data *= (theta / norm)
```
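In formula form, grad_clipping rescales the concatenated gradient $\boldsymbol{g}$ of all parameters so that its L2 norm never exceeds the threshold $\theta$ (this is a restatement of what the code does, not an extra step):

$$
\boldsymbol{g} \leftarrow \min\!\left(\frac{\theta}{\|\boldsymbol{g}\|}, 1\right) \boldsymbol{g}
$$

The clipped gradient therefore always has norm at most $\theta$, which helps mitigate exploding gradients when training RNNs.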
```python
# Define the prediction function
def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state,
                num_hiddens, vocab_size, device, idx_to_char, char_to_idx):
    state = init_rnn_state(1, num_hiddens, device)
    output = [char_to_idx[prefix[0]]]  # output records the prefix plus the num_chars predicted characters
    for t in range(num_chars + len(prefix) - 1):
        # Use the output of the previous time step as the input of the current time step
        X = to_onehot(torch.tensor([[output[-1]]], device=device), vocab_size)
        # Compute the output and update the hidden state
        (Y, state) = rnn(X, state, params)
        # The input of the next time step is either the next character of the prefix
        # or the current best prediction
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(Y[0].argmax(dim=1).item())
    return ''.join([idx_to_char[i] for i in output])

# Test the prediction function
predict_rnn('Separate', 10, rnn, params, init_rnn_state, num_hiddens, vocab_size,
            device, idx_to_char, char_to_idx)
```
```python
# Define the model training function
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter:  # with adjacent sampling, initialize the hidden state at the start of the epoch
            state = init_rnn_state(batch_size, num_hiddens, device)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter:
            if is_random_iter:  # with random sampling, initialize the hidden state before each mini-batch
                state = init_rnn_state(batch_size, num_hiddens, device)
            else:  # otherwise, detach the hidden state from the computation graph
                for s in state:
                    s.detach_()
            # inputs is a list of num_steps matrices with shape (batch_size, vocab_size)
            inputs = to_onehot(X, vocab_size)
            # outputs is a list of num_steps matrices with shape (batch_size, vocab_size)
            (outputs, state) = rnn(inputs, state, params)
            # After concatenation the shape is (num_steps * batch_size, vocab_size)
            outputs = torch.cat(outputs, dim=0)
            # Y has shape (batch_size, num_steps); transpose and flatten it into a vector of length
            # num_steps * batch_size so that it lines up with the rows of outputs
            y = torch.flatten(Y.T)
            # Use cross-entropy loss to compute the average classification error
            l = loss(outputs, y.long())

            # Zero the gradients
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            grad_clipping(params, clipping_theta, device)  # clip the gradient
            d2l.sgd(params, lr, 1)  # the loss is already averaged, so the gradient need not be averaged again
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                                        num_hiddens, vocab_size, device, idx_to_char, char_to_idx))
```
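A note on the perplexity printed above, summarizing what the code computes: l_sum accumulates the summed cross-entropy loss and n the number of predicted characters, so `math.exp(l_sum / n)` is the exponential of the average per-character cross-entropy loss,

$$
\text{perplexity} = \exp\!\left(\frac{1}{n}\sum_{i=1}^{n} \ell_i\right),
$$

where $\ell_i$ is the cross-entropy loss of the $i$-th predicted character. A lower perplexity means the model assigns higher probability to the true next character.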
```python
# Train the model and generate lyrics
num_epochs, num_steps, batch_size, lr, clipping_theta = 250, 35, 32, 1e2, 1e-2
pred_period, pred_len, prefixes = 50, 50, ['Separate', 'No separation']

# Train with random sampling and generate lyrics
train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens, vocab_size,
                      device, corpus_indices, idx_to_char, char_to_idx, True,
                      num_epochs, num_steps, lr, clipping_theta, batch_size,
                      pred_period, pred_len, prefixes)

# Train with adjacent sampling and generate lyrics
train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens, vocab_size,
                      device, corpus_indices, idx_to_char, char_to_idx, False,
                      num_epochs, num_steps, lr, clipping_theta, batch_size,
                      pred_period, pred_len, prefixes)
```
Concise implementation of a recurrent neural network
```python
# Define the model
rnn_layer = nn.RNN(input_size=vocab_size, hidden_size=num_hiddens)  # the built-in RNN layer wrapped below

class RNNModel(nn.Module):
    def __init__(self, rnn_layer, vocab_size):
        super(RNNModel, self).__init__()
        self.rnn = rnn_layer
        self.hidden_size = rnn_layer.hidden_size * (2 if rnn_layer.bidirectional else 1)
        self.vocab_size = vocab_size
        self.dense = nn.Linear(self.hidden_size, vocab_size)

    def forward(self, inputs, state):
        # inputs.shape: (batch_size, num_steps)
        X = to_onehot(inputs, vocab_size)
        X = torch.stack(X)  # X.shape: (num_steps, batch_size, vocab_size)
        hiddens, state = self.rnn(X, state)
        hiddens = hiddens.view(-1, hiddens.shape[-1])  # hiddens.shape: (num_steps * batch_size, hidden_size)
        output = self.dense(hiddens)
        return output, state
```
```python
# Define the prediction function
def predict_rnn_pytorch(prefix, num_chars, model, vocab_size, device,
                        idx_to_char, char_to_idx):
    state = None
    output = [char_to_idx[prefix[0]]]  # output records the prefix plus the num_chars predicted characters
    for t in range(num_chars + len(prefix) - 1):
        X = torch.tensor([output[-1]], device=device).view(1, 1)
        (Y, state) = model(X, state)  # forward pass; no need to pass model parameters explicitly
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(Y.argmax(dim=1).item())
    return ''.join([idx_to_char[i] for i in output])
```
```python
# Predict once with a randomly initialized model
model = RNNModel(rnn_layer, vocab_size).to(device)
predict_rnn_pytorch('Separate', 10, model, vocab_size, device, idx_to_char, char_to_idx)
```
```python
# Training function using adjacent sampling
def train_and_predict_rnn_pytorch(model, num_hiddens, vocab_size, device,
                                  corpus_indices, idx_to_char, char_to_idx,
                                  num_epochs, num_steps, lr, clipping_theta,
                                  batch_size, pred_period, pred_len, prefixes):
    loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.to(device)
    for epoch in range(num_epochs):
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = d2l.data_iter_consecutive(corpus_indices, batch_size, num_steps, device)  # adjacent sampling
        state = None
        for X, Y in data_iter:
            if state is not None:
                # Detach the hidden state from the computation graph
                if isinstance(state, tuple):  # LSTM: state is (h, c)
                    state[0].detach_()
                    state[1].detach_()
                else:
                    state.detach_()
            (output, state) = model(X, state)  # output.shape: (num_steps * batch_size, vocab_size)
            y = torch.flatten(Y.T)
            l = loss(output, y.long())

            optimizer.zero_grad()
            l.backward()
            grad_clipping(model.parameters(), clipping_theta, device)
            optimizer.step()
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn_pytorch(
                    prefix, pred_len, model, vocab_size, device, idx_to_char, char_to_idx))
```
```python
# Train the model
num_epochs, batch_size, lr, clipping_theta = 250, 32, 1e-3, 1e-2
pred_period, pred_len, prefixes = 50, 50, ['Separate', 'No separation']
train_and_predict_rnn_pytorch(model, num_hiddens, vocab_size, device,
                              corpus_indices, idx_to_char, char_to_idx,
                              num_epochs, num_steps, lr, clipping_theta,
                              batch_size, pred_period, pred_len, prefixes)
```