Preface:
Two months ago I wrote up the complete TextCNN pipeline in less than 60 lines of code, but it did not account for later engineering deployment or for large amounts of data (too much to load into memory at once), so today I have reworked and optimized it around a real case.
A TextCNN project can generally be broken down into the following steps:
1. Data preparation: in day-to-day work the text will not necessarily arrive as a ready-made csv file the way it does in a competition, so you may have to assemble it yourself. In addition, TextCNN cannot use categorical variables (such as Shanghai, Beijing, etc.) directly during training and prediction, so they must be converted with a map or a LabelEncoder, and the predictions must be mapped back to the original labels with the reverse map at the end.
2. Build the vocabulary: tokenizer.fit_on_texts. This step is very important; if the training accuracy stays stuck in single digits, come back here and check it carefully.
3. Build the tf.data datasets: if there is too much text to fit into memory, batching is recommended (32 or 64). Note that if train_data and valid_data are turned into datasets, then test_data must also be turned into a dataset; it has no labels yet, so a placeholder label of 0 can be used.
4. Build the TextCNN network: not much to say here, whether the kernel sizes are [2, 3, 4] or [3, 4, 5];
5. Set class weights: most classification tasks are imbalanced, especially multi-class ones, so setting class weights is still worthwhile;
6. Train the model: the tunable hyperparameters are learning_rate (3e-4 recommended), epochs (30-40 recommended, since early stopping is set anyway), the optimizer (Adam works well), and EARLY_STOP_PATIENCE (the early-stopping patience, 2-3 epochs);
7. Save the model: model.save('../model/text_cnn.h5') works directly in TensorFlow 2; it is not repeated in the walkthrough below, but see the short sketch after this list;
8. Load the model: text_cnn_model = tf.keras.models.load_model('service/model/text_cnn.h5');
9. Predict on new samples: text_cnn_model.predict(test_dataset). Note that the result is an array of floating-point probabilities between 0 and 1; the predicted label has to be picked with np.argmax(predictions, axis=-1).
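Steps 7 and 8 are not repeated in the walkthrough below, so here is a minimal save/load sketch using the paths mentioned above (adjust them to your own project layout):

# Step 7: persist the trained model (TensorFlow 2, HDF5 format)
text_cnn_model.save('../model/text_cnn.h5')

# Step 8: load it back in the serving code
text_cnn_model = tf.keras.models.load_model('service/model/text_cnn.h5')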
The full code is as follows:
1. Import the data
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
# Load the training data
train_type_list = []
train_text_list = []
train_dir_name_list = os.listdir('./train/')
train_dir_name_list.remove('.DS_Store')
for dir_name in train_dir_name_list:
    for file in os.listdir('./train/' + dir_name + '/'):
        train_type_list.append(dir_name.split('-')[1])
        train_text_list.append(open('./train/' + str(dir_name) + '/' + str(file), 'r',
                                    encoding='gb18030', errors='ignore')
                               .read().replace('\n', ' ').replace('\u3000', ''))
print(len(train_type_list))
# Label dictionary
cls_num = len(set(train_type_list))
cls_dict = {}
for k, v in enumerate(set(train_type_list)):
    cls_dict[k] = v
cls_dict_reverse = {v: k for k, v in cls_dict.items()}
train_data = pd.DataFrame({'text': train_text_list, 'target': train_type_list})
train_data['target'] = train_data['target'].map(cls_dict_reverse)
train_data = resample(train_data)   # note: sklearn's resample draws with replacement by default
train_data.head()
# Load the test data to predict
test_text_list = []
test_filename = []
for file in os.listdir('./test'):
    test_filename.append(file)
    test_text_list.append(open('./test/' + file, 'r', encoding='gb18030', errors='ignore')
                          .read().replace('\n', ' '))
test_data = pd.DataFrame({'text': test_text_list, 'filename': test_filename})
test_data['target'] = 0   # placeholder label so the test set can also be built into a dataset
2. Prepare the tf.data datasets
X_train, X_val, y_train, y_val = train_test_split(train_data['text'], train_data['target'],
                                                  test_size=0.1, random_state=27)

# tokenizer
NUM_LABEL = cls_num    # number of categories
BATCH_SIZE = 32
MAX_LEN = 200          # maximum sequence length
BUFFER_SIZE = tf.constant(train_data.shape[0], dtype=tf.int64)

tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(X_train)
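As a quick sanity check for step 2 above (a debugging aid added here, not part of the original pipeline), it is worth confirming that the tokenizer actually learned a vocabulary before going further:

# an empty vocabulary or sequences full of out-of-vocabulary ids usually explains
# the "accuracy stuck in single digits" symptom mentioned above
print(len(tokenizer.word_index))                  # vocabulary size learned from X_train
print(tokenizer.texts_to_sequences(X_train[:1]))  # first sample as character ids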
def build_tf_dataset(text, label, is_train=False):
    '''Build a tf.data dataset from texts and labels.'''
    sequence = tokenizer.texts_to_sequences(text)
    sequence_padded = tf.keras.preprocessing.sequence.pad_sequences(sequence, padding='post', maxlen=MAX_LEN)
    dataset = tf.data.Dataset.from_tensor_slices((sequence_padded, label))
    if is_train:
        dataset = dataset.shuffle(BUFFER_SIZE)
    dataset = dataset.batch(BATCH_SIZE)
    # prefetch an auto-tuned number of batches instead of the whole training set
    dataset = dataset.prefetch(tf.data.AUTOTUNE)   # tf.data.experimental.AUTOTUNE on older TF 2 versions
    return dataset
train_dataset = build_tf_dataset(X_train, y_train, is_train=True)
val_dataset = build_tf_dataset(X_val, y_val, is_train=False)
test_dataset = build_tf_dataset(test_data['text'], test_data['target'], is_train=False)
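Before building the network, one batch can be pulled out of the dataset to confirm the shapes (a small illustrative check, assuming the datasets built above):

# each text batch should be (BATCH_SIZE, MAX_LEN), each label batch (BATCH_SIZE,)
for batch_text, batch_label in train_dataset.take(1):
    print(batch_text.shape, batch_label.shape)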
3. Build the TextCNN network
VOCAB_SIZE = len(tokenizer.index_word) + 1   # +1 for the padding index 0
print(VOCAB_SIZE)
EMBEDDING_DIM = 100
FILTERS = [3, 4, 5]    # convolution kernel sizes
NUM_FILTERS = 128      # number of filters per kernel size
DENSE_DIM = 256        # fully connected layer size
CLASS_NUM = cls_num    # number of categories (20 in this dataset)
DROPOUT_RATE = 0.5     # dropout rate
def build_text_cnn_model():
    inputs = tf.keras.Input(shape=(None,))
    embed = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE,
                                      output_dim=EMBEDDING_DIM,
                                      trainable=True,
                                      mask_zero=True)(inputs)
    embed = tf.keras.layers.Dropout(DROPOUT_RATE)(embed)
    # one convolution + global max-pooling branch per kernel size
    pool_outputs = []
    for filter_size in FILTERS:
        conv = tf.keras.layers.Conv1D(NUM_FILTERS, filter_size, padding='same',
                                      activation='relu', data_format='channels_last',
                                      use_bias=True)(embed)
        max_pool = tf.keras.layers.GlobalMaxPooling1D(data_format='channels_last')(conv)
        pool_outputs.append(max_pool)
    outputs = tf.keras.layers.concatenate(pool_outputs, axis=-1)
    outputs = tf.keras.layers.Dense(DENSE_DIM, activation='relu')(outputs)
    outputs = tf.keras.layers.Dropout(DROPOUT_RATE)(outputs)
    outputs = tf.keras.layers.Dense(CLASS_NUM, activation='softmax')(outputs)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

text_cnn_model = build_text_cnn_model()
text_cnn_model.summary()
# Set class weights
# note: with pandas >= 2.0, reset_index() names these columns 'target'/'count' instead of 'index'/'target'
df_weight = train_data['target'].value_counts().sort_index().reset_index()
df_weight['weight'] = df_weight['target'].min() / df_weight['target']
df_weight_dict = {k: v for k, v in zip(df_weight['index'], df_weight['weight'])}
df_weight_dict
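For intuition about the weighting rule above (weight = smallest class count / class count), here is a hypothetical example with made-up counts:

counts = pd.Series({0: 1000, 1: 500, 2: 100})   # hypothetical class counts, for illustration only
print((counts.min() / counts).to_dict())        # {0: 0.1, 1: 0.2, 2: 1.0}

The rarest class gets weight 1.0 and the more common classes are scaled down, which is the form that class_weight in the fit call below expects.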
4. Train the model
LR = 3e-4
EPOCHS = 30
EARLY_STOP_PATIENCE = 2

loss = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(LR)
text_cnn_model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                            patience=EARLY_STOP_PATIENCE,
                                            restore_best_weights=True)
history = text_cnn_model.fit(train_dataset,
                             epochs=EPOCHS,
                             callbacks=[callback],
                             validation_data=val_dataset,
                             class_weight=df_weight_dict)
Training on CPU works reasonably well, and the accuracy reaches about 90%.
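To see where that number comes from, the validation accuracy can be read back from the history object or recomputed with evaluate (a quick check, assuming the variables from the training step above):

print(max(history.history['val_accuracy']))   # best validation accuracy seen during training
val_loss, val_acc = text_cnn_model.evaluate(val_dataset)
print(val_acc)                                # accuracy of the restored best weights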
5. Predict and export the results
test_predict = text_cnn_model.predict(test_dataset)
preds = np.argmax(test_predict, axis=-1)                      # pick the highest-probability class id
test_data['category'] = preds
test_data['category'] = test_data['category'].map(cls_dict)   # map class ids back to label names
test_data[['filename', 'category']].to_csv('zhanglei.csv', index=False)