Text preprocessing
Four common steps:
- Read in the text
- Tokenize the text into words (or characters)
- Build a vocabulary that maps each token to a unique index
- Convert the text from a sequence of tokens into a sequence of indices so it can be fed to models
```python
import collections
import re

def read_time_machine():
    with open('/home/kesci/input/timemachine7163/timemachine.txt', 'r') as f:
        lines = [re.sub('[^a-z]+', ' ', line.strip().lower()) for line in f]
    return lines

lines = read_time_machine()
print('# sentences %d' % len(lines))

def tokenize(sentences, token='word'):
    """Split sentences into word or char tokens"""
    if token == 'word':
        return [sentence.split(' ') for sentence in sentences]
    elif token == 'char':
        return [list(sentence) for sentence in sentences]
    else:
        print('ERROR: unknown token type ' + token)

tokens = tokenize(lines)
tokens[0:2]

class Vocab(object):
    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        counter = count_corpus(tokens)  # count token frequencies
        self.token_freqs = list(counter.items())
        self.idx_to_token = []
        if use_special_tokens:
            # padding, begin of sentence, end of sentence, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            self.idx_to_token += ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            self.idx_to_token += ['<unk>']
        self.idx_to_token += [token for token, freq in self.token_freqs
                              if freq >= min_freq and token not in self.idx_to_token]
        self.token_to_idx = dict()
        for idx, token in enumerate(self.idx_to_token):
            self.token_to_idx[token] = idx

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

def count_corpus(sentences):
    tokens = [tk for st in sentences for tk in st]
    return collections.Counter(tokens)  # returns a Counter mapping each token to its number of occurrences
```
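As a quick usage sketch (not part of the cell above; it assumes `tokens` and `Vocab` from that cell are in scope), the vocabulary ties steps 3 and 4 together: build the token-to-index mapping, then convert tokenized sentences into index sequences.

```python
# Build the vocabulary from the tokenized lines (step 3)
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[0:10])  # a few (token, index) pairs

# Convert a couple of tokenized sentences into index sequences (step 4)
for i in range(8, 10):
    print('words:', tokens[i])
    print('indices:', vocab[tokens[i]])
```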
Language model
We mainly consider the n-gram model, in which each word depends only on the n − 1 words that precede it.
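For reference, the n-gram model applies a Markov assumption of order n − 1, so the joint probability of a sequence $w_1, \ldots, w_T$ is approximated as

$$P(w_1, w_2, \ldots, w_T) \approx \prod_{t=1}^{T} P(w_t \mid w_{t-(n-1)}, \ldots, w_{t-1}),$$

with correspondingly shorter contexts for the first few words. For example, with n = 2 (a bigram model), $P(w_1, w_2, w_3, w_4) \approx P(w_1)\,P(w_2 \mid w_1)\,P(w_3 \mid w_2)\,P(w_4 \mid w_3)$.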
```python
with open('/home/kesci/input/jaychou_lyrics4703/jaychou_lyrics.txt') as f:
    corpus_chars = f.read()
print(len(corpus_chars))
print(corpus_chars[: 40])
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
corpus_chars = corpus_chars[: 10000]

# Build the character index
idx_to_char = list(set(corpus_chars))  # deduplicate to get the index-to-character mapping
char_to_idx = {char: i for i, char in enumerate(idx_to_char)}  # character-to-index mapping
vocab_size = len(char_to_idx)
print(vocab_size)

corpus_indices = [char_to_idx[char] for char in corpus_chars]  # turn each character into an index to get an index sequence
sample = corpus_indices[: 20]
print('chars:', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)

def load_data_jay_lyrics():
    with open('/home/kesci/input/jaychou_lyrics4703/jaychou_lyrics.txt') as f:
        corpus_chars = f.read()
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    corpus_chars = corpus_chars[0:10000]
    idx_to_char = list(set(corpus_chars))
    char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
    vocab_size = len(char_to_idx)
    corpus_indices = [char_to_idx[char] for char in corpus_chars]
    return corpus_indices, char_to_idx, idx_to_char, vocab_size

import torch
import random

def data_iter_random(corpus_indices, batch_size, num_steps, device=None):
    # Minus 1 because for a sequence of length n, X contains at most the first n - 1 characters
    num_examples = (len(corpus_indices) - 1) // num_steps  # floor division gives the number of non-overlapping samples
    example_indices = [i * num_steps for i in range(num_examples)]  # index of the first character of each sample in corpus_indices
    random.shuffle(example_indices)

    def _data(i):
        # Return the sequence of length num_steps starting at position i
        return corpus_indices[i: i + num_steps]

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for i in range(0, num_examples, batch_size):
        # Each time, pick batch_size random samples
        batch_indices = example_indices[i: i + batch_size]  # indices of the first character of each sample in the current batch
        X = [_data(j) for j in batch_indices]
        Y = [_data(j + 1) for j in batch_indices]
        yield torch.tensor(X, device=device), torch.tensor(Y, device=device)

my_seq = list(range(30))
for X, Y in data_iter_random(my_seq, batch_size=2, num_steps=6):
    print('X: ', X, '\nY:', Y, '\n')
```
Recurrent neural network
Next, we implement a recurrent neural network from scratch.
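For reference, the recurrence the from-scratch implementation below computes at each time step $t$, with input $X_t$, hidden state $H_t$, and output $Y_t$, is

$$H_t = \tanh(X_t W_{xh} + H_{t-1} W_{hh} + b_h), \qquad Y_t = H_t W_{hq} + b_q,$$

which matches the parameter shapes created in `get_params` and the loop body of `rnn` below.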
```python
import torch
import torch.nn as nn
import time
import math
import sys
sys.path.append("/home/kesci/input")
import d2l_jay9460 as d2l

(corpus_indices, char_to_idx, idx_to_char, vocab_size) = d2l.load_data_jay_lyrics()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def one_hot(x, n_class, dtype=torch.float32):
    result = torch.zeros(x.shape[0], n_class, dtype=dtype, device=x.device)  # shape: (n, n_class)
    result.scatter_(1, x.long().view(-1, 1), 1)  # result[i, x[i, 0]] = 1
    return result

x = torch.tensor([0, 2])
x_one_hot = one_hot(x, vocab_size)
print(x_one_hot)
print(x_one_hot.shape)
print(x_one_hot.sum(axis=1))

def to_onehot(X, n_class):
    return [one_hot(X[:, i], n_class) for i in range(X.shape[1])]

X = torch.arange(10).view(2, 5)
inputs = to_onehot(X, vocab_size)
print(len(inputs), inputs[0].shape)

num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size
# num_inputs: d
# num_hiddens: h, the number of hidden units, a hyperparameter
# num_outputs: q

def get_params():
    def _one(shape):
        param = torch.zeros(shape, device=device, dtype=torch.float32)
        nn.init.normal_(param, 0, 0.01)
        return torch.nn.Parameter(param)

    # Hidden layer parameters
    W_xh = _one((num_inputs, num_hiddens))
    W_hh = _one((num_hiddens, num_hiddens))
    b_h = torch.nn.Parameter(torch.zeros(num_hiddens, device=device))
    # Output layer parameters
    W_hq = _one((num_hiddens, num_outputs))
    b_q = torch.nn.Parameter(torch.zeros(num_outputs, device=device))
    return (W_xh, W_hh, b_h, W_hq, b_q)

def rnn(inputs, state, params):
    # inputs and outputs are both lists of num_steps matrices with shape (batch_size, vocab_size)
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        H = torch.tanh(torch.matmul(X, W_xh) + torch.matmul(H, W_hh) + b_h)
        Y = torch.matmul(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H,)

def init_rnn_state(batch_size, num_hiddens, device):
    return (torch.zeros((batch_size, num_hiddens), device=device), )

print(X.shape)
print(num_hiddens)
print(vocab_size)
state = init_rnn_state(X.shape[0], num_hiddens, device)
inputs = to_onehot(X.to(device), vocab_size)
params = get_params()
outputs, state_new = rnn(inputs, state, params)
print(len(inputs), inputs[0].shape)
print(len(outputs), outputs[0].shape)
print(len(state), state[0].shape)
print(len(state_new), state_new[0].shape)

def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state,
                num_hiddens, vocab_size, device, idx_to_char, char_to_idx):
    state = init_rnn_state(1, num_hiddens, device)
    output = [char_to_idx[prefix[0]]]  # output records the prefix plus the num_chars predicted characters
    for t in range(num_chars + len(prefix) - 1):
        # Take the output of the previous time step as the input of the current time step
        X = to_onehot(torch.tensor([[output[-1]]], device=device), vocab_size)
        # Compute the output and update the hidden state
        (Y, state) = rnn(X, state, params)
        # The input of the next time step is either the next character of the prefix or the current best prediction
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(Y[0].argmax(dim=1).item())
    return ''.join([idx_to_char[i] for i in output])
```
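As a quick sanity check (a usage sketch; it assumes the untrained `params` and the helpers defined above are still in scope), `predict_rnn` can already generate characters from a prefix, though without training the continuation is essentially random:

```python
# Generate 10 characters following the prefix '分开' with the untrained model.
print(predict_rnn('分开', 10, rnn, params, init_rnn_state, num_hiddens,
                  vocab_size, device, idx_to_char, char_to_idx))
```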