Chinese Text Classification (Part 2) -- Implementation of SVM and TextCNN Based on Keras

Keywords: NLP

Project code
Reference resources: Zhihu, "Never Forget the Original Heart"

1. Implementation of SVM

The previous part mainly covered Chinese word segmentation and word-vector representation. However, each article contains a different number of words, so here a document is represented by its average word vector: the vectors of all the words appearing in the article are summed and then averaged, giving a single vector with the same dimension as the word vectors that captures the article's information.

import numpy as np
import pandas as pd
import gensim
# num_features is the dimensionality of the word vectors

def average_word_vectors(words,model,vocabulary,num_features):
    feature_vector=np.zeros((num_features,),dtype='float64')
    nwords=0
    for word in words:
        if word in vocabulary:
            nwords+=1
            # Accumulate the word's embedding vector (model[word] returns the vector itself)
            feature_vector=np.add(feature_vector,model[word])
    if nwords:
        # Average over the in-vocabulary words
        feature_vector=np.divide(feature_vector,nwords)
    return feature_vector

def average_word_vectorized(corpus,model,num_features):
    vocabulary=set(model.index_to_key)
    features=[average_word_vectors(tokenized_sentence,model,vocabulary,num_features) for tokenized_sentence in corpus]
    return np.array(features)

def get_word_vectors(data):
    words_art=[]
    for i in range(len(data)):
        # Each row of the Words column stores a stringified token list, so eval() restores the list
        words_art.append(eval(data.loc[i]))
    return average_word_vectorized(words_art,model=w2v_model,num_features=300)
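As a quick illustration of what average_word_vectors returns, here is a toy example with a hypothetical two-word vocabulary (not part of the original pipeline):

from gensim.models import KeyedVectors

# Build a tiny 3-dimensional KeyedVectors instance just for demonstration
toy_kv=KeyedVectors(vector_size=3)
toy_kv.add_vectors(['中国','经济'],np.array([[1.,0.,0.],[0.,1.,0.]]))
print(average_word_vectors(['中国','经济','未登录词'],toy_kv,set(toy_kv.index_to_key),3))
# -> [0.5 0.5 0. ]  (the out-of-vocabulary word is simply ignored)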

With the averaging functions defined, we apply them to the per-article token lists that were saved earlier.

w2v_model=gensim.models.KeyedVectors.load_word2vec_format('data/word2vec_model.txt',binary=False)
train=pd.read_csv('data/article_features_train.csv')
test=pd.read_csv('data/article_features_test.csv')
x_train=get_word_vectors(train.Words)
y_train=train.label
x_test=get_word_vectors(test.Words)
y_test=test.label

Then GridSearchCV is used to search for the model with the highest f1_macro score.

# Use GridSearchCV to find the model with the highest f1_macro
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
clf=svm.SVC()
grid_values={'gamma':[0.001, 0.01, 0.05, 0.1, 1, 10],
             'C':[0.01, 0.1, 1, 10, 100]}

grid_clf=GridSearchCV(clf,param_grid=grid_values,scoring='f1_macro')
grid_clf.fit(x_train,y_train)
y_grid_pred=grid_clf.predict(x_test)

print('Test set F1: ', f1_score(y_test,y_grid_pred,average='macro'))
print('Grid best parameter (max. f1): ', grid_clf.best_params_)
print('Grid best score (f1_macro): ', grid_clf.best_score_)

The output is as follows:

  • Test set F1: 0.35209688361418695
  • Grid best parameter (max. f1): {'C': 100, 'gamma': 1}
  • Grid best score (f1_macro): 0.3681357073595844

After setting these parameters, we retrain the SVM with them, save the model, and then examine its confusion matrix on the test set.
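A minimal sketch of the retraining and saving step, assuming joblib for persistence (the file name svm_model.pkl is a placeholder, not taken from the original project):

import joblib

# Retrain an SVM with the best parameters found by the grid search
best_clf=svm.SVC(C=100,gamma=1)
best_clf.fit(x_train,y_train)
y_pred=best_clf.predict(x_test)

# Persist the retrained model (placeholder file name)
joblib.dump(best_clf,'svm_model.pkl')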

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
from sklearn.metrics import confusion_matrix

cm=confusion_matrix(y_test,y_pred)

# Plot the confusion matrix
print('Confusion Matrix')
category_labels=['Space','Computer','Art','Environment','Agriculture','Economy','Politics','Sports','History']
# Normalise each row of the confusion matrix so it sums to 1
cm_normalised=cm.astype('float')/cm.sum(axis=1)[:,np.newaxis]
sns.set(font_scale=1.5)
fig,ax=plt.subplots(figsize=(10,10))
ax=sns.heatmap(cm_normalised,annot=True,linewidths=1,square=False,
               cmap='Greens',yticklabels=category_labels,xticklabels=category_labels,
               vmin=0,vmax=np.max(cm_normalised),fmt='.2f',annot_kws={'size':20})
ax.set(xlabel='Predicted label',ylabel='True label')

2. Implementation of TextCNN

In addition to the simple SVM classifier, a neural network is also tried here. Although CNNs are best known for image processing, they also have a place in text processing. Next, we build a TextCNN model for the classification task. First, the word2vec model needs some preprocessing: the Embedding layer in TextCNN takes word indices as input, so we convert the words in the model into a {word: index} dictionary and save it for later use.

# Import the w2v model and preprocess it
def w2v_model_preprocessing():
    w2v_model=gensim.models.KeyedVectors.load_word2vec_format('data/word2vec_model.txt',binary=False)

    # Initialize the {word: index} dictionary; index 0 is reserved for padding
    word2idx={'_PAD':0}
    vocab_list=[(word,w2v_model[word]) for word in w2v_model.index_to_key]
    # Embedding matrix holding all w2v vectors, with one extra all-zero row (index 0) used for padding
    embeddings_matrix=np.zeros((len(w2v_model.key_to_index)+1,w2v_model.vector_size))
    # Fill the dictionary and the matrix
    for i in range(len(vocab_list)):
        word=vocab_list[i][0]
        word2idx[word]=i+1
        embeddings_matrix[i+1]=vocab_list[i][1]
    return w2v_model,word2idx,embeddings_matrix

w2v_model,word2idx,embeddings_matrix=w2v_model_preprocessing()
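As a quick sanity check (illustrative only), the dictionary and the matrix built above should line up, with every word's row shifted by one to leave index 0 for padding:

some_word=w2v_model.index_to_key[0]  # any word from the vocabulary
assert np.allclose(embeddings_matrix[word2idx[some_word]],w2v_model[some_word])
print(embeddings_matrix.shape)  # (vocabulary size + 1, vector_size)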

Similarly, we face the same problem as with the SVM: articles differ in length. Here we handle it by truncation and padding: a length is fixed in advance, articles shorter than it are padded with zeros at the end, and anything beyond it is cut off, so that all inputs end up with the same length.

from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_words(data):
    words_art=[]
    for i in range(len(data)):
        # Restore the stringified token list stored in the Words column
        words_art.append(eval(data.loc[i]))
    return words_art

# Convert the segmented text into index arrays of length maxlen using the generated dictionary:
# longer articles are truncated and shorter ones are padded with zeros at the end.
# text is the Words column, word_index is the dictionary and maxlen is the target length.
def get_words_index(text,word_index,maxlen):
    texts=get_words(text)
    data=[]
    for sentence in texts:
        new_txt=[]
        for word in sentence:
            try:
                # Convert each token in the sentence to its index
                new_txt.append(word_index[word])
            except KeyError:
                # Out-of-vocabulary tokens fall back to the padding index 0
                new_txt.append(0)
        data.append(new_txt)
    # Use Keras' pad_sequences to align sentence lengths (pad and truncate at the end)
    texts=pad_sequences(data,maxlen=maxlen,padding='post',truncating='post')
    return texts
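To illustrate the mapping, a toy call with a hypothetical three-entry dictionary (not the real word2idx):

toy_series=pd.Series(["['中国', '经济', '某个未登录词']"])
toy_index={'_PAD':0,'中国':1,'经济':2}
print(get_words_index(toy_series,toy_index,maxlen=5))
# -> [[1 2 0 0 0]] : known tokens become their indices, unknown tokens and padding become 0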

Next, we read the files and convert each article's tokens into index form. The distribution of article lengths (the number of tokens per article) in the training set was examined, and based on it the truncation length is set to 1000 in this exercise.
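A minimal sketch of how that length distribution could be inspected from the Words column (assuming the same CSV format used above):

import matplotlib.pyplot as plt

train_words=pd.read_csv('data/article_features_train.csv').Words
lengths=train_words.apply(lambda s:len(eval(s)))  # number of tokens per article
lengths.hist(bins=50)
plt.xlabel('Tokens per article')
plt.ylabel('Number of articles')
plt.show()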

from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

MAX_LENGTH=1000
# Load training and test sets
train=pd.read_csv('data/article_features_train.csv')
test=pd.read_csv('data/article_features_test.csv')

# Training set data preprocessing
x_train=get_words_index(train.Words,word2idx,MAX_LENGTH)
y_train=train.label
y_train=to_categorical(y_train, num_classes=9) # Convert the labels to one-hot format

# Split into training and validation sets
x_train,x_val,y_train,y_val=train_test_split(x_train,y_train)

# Test set data preprocessing
x_test=get_words_index(test.Words,word2idx,MAX_LENGTH)
y_test=test.label
y_test=to_categorical(y_test, num_classes=9)
print("Dataset load finished.")

After the data is loaded and processed, you can start to build the TextCNN model.

from tensorflow.keras.models import Sequential,Model,load_model
from tensorflow.keras.layers import Dense,Dropout,Activation,Input,Lambda,Reshape,concatenate
from tensorflow.keras.layers import Embedding,Conv1D,MaxPooling1D,GlobalMaxPooling1D,Flatten,BatchNormalization
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.metrics import classification_report,confusion_matrix

from tensorflow.keras.callbacks import ReduceLROnPlateau,TensorBoard,EarlyStopping,ModelCheckpoint
import matplotlib.pyplot as plt


def build_textcnn():
    # Build the TextCNN model, using the word2idx dictionary and embeddings_matrix prepared above

    main_input=Input(shape=(MAX_LENGTH,),dtype='float64')
    # word embedding
    embedder=Embedding(
        len(embeddings_matrix), # Vocabulary size: number of rows of the embedding matrix (padding row included)
        embeddings_matrix.shape[1], # Dimensionality of the word vectors (must match the pretrained vectors)
        input_length=MAX_LENGTH, # Length of each input sequence, i.e. tokens per article
        weights=[embeddings_matrix], # Initialize with the pretrained word2vec vectors
        trainable=False # Keep the pretrained word vectors frozen during training
    )

    embed=embedder(main_input)

    # The window sizes are 3, 4 and 5 respectively
    cnn1=Conv1D(256,3,padding='same',strides=1,activation='relu',kernel_regularizer=l2(0.05))(embed)
    cnn1 = MaxPooling1D(pool_size=4)(cnn1)
    cnn2 = Conv1D(256, 4, padding='same', strides=1, activation='relu', kernel_regularizer=l2(0.05))(embed)
    cnn2 = MaxPooling1D(pool_size=4)(cnn2)
    cnn3 = Conv1D(256, 5, padding='same', strides=1, activation='relu', kernel_regularizer=l2(0.005))(embed)
    cnn3 = MaxPooling1D(pool_size=4)(cnn3)

    # Concatenate the output vectors of the three convolution branches
    cnn=concatenate([cnn1,cnn2,cnn3],axis=-1)
    flat=Flatten()(cnn)
    drop=Dropout(0.5)(flat)
    main_output=Dense(9,activation='softmax')(drop)

    model=Model(inputs=main_input,outputs=main_output)
    model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])

    model.summary()
    return model
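For reference, the shape flow through this network (assuming the 300-dimensional word2vec vectors used in the SVM part and MAX_LENGTH=1000): the embedding produces a 1000x300 matrix per article, each 'same'-padded Conv1D branch outputs 1000x256, MaxPooling1D(pool_size=4) reduces that to 250x256, concatenation gives 250x768, and flattening yields a 192,000-dimensional vector feeding the 9-way softmax; model.summary() reports the same shapes.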

After the model is built, we can train it.

def run_textcnn(model):

    lr_reducer=ReduceLROnPlateau(monitor='val_loss',factor=0.9, patience=3, verbose=1) # Reduce the learning rate when val_loss plateaus
    tensorboard=TensorBoard(log_dir='./logs_textcnn')
    early_stopper=EarlyStopping(monitor='val_loss',min_delta=0,patience=8,verbose=1,mode='auto')
    checkpointer = ModelCheckpoint("weights_textcnn.best.hdf5", monitor='val_loss', verbose=1,
                                   save_best_only=True)  # Add checkpoint

    # model training
    history=model.fit(x_train,y_train,
                      batch_size=64,
                      epochs=10,
                      verbose=1,
                      validation_data=(x_val,y_val),
                      shuffle=True,
                      callbacks=[lr_reducer, checkpointer, tensorboard, early_stopper])

    # Model saving
    model.save('textcnn.h5')
    print('Model Saved!')

    # Save the accuracy and loss of the training set and validation set
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']

    np_acc = np.array(acc).reshape((1, len(acc)))  # reshape each curve to a row so the four curves can be stacked into one matrix
    np_valacc = np.array(val_acc).reshape((1, len(val_acc)))
    np_loss = np.array(loss).reshape((1, len(loss)))
    np_valloss = np.array(val_loss).reshape((1, len(val_loss)))
    np_out = np.concatenate([np_acc, np_valacc, np_loss, np_valloss], axis=0)
    np.savetxt('textcnn_history.txt', np_out)

    print("File Saved!")

    return history

model=build_textcnn()
history=run_textcnn(model)
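Since run_textcnn writes the four training curves to textcnn_history.txt (one row each for accuracy, val_accuracy, loss and val_loss), a minimal sketch of how they could be plotted back:

# Load the saved history: rows are acc, val_acc, loss, val_loss
hist=np.loadtxt('textcnn_history.txt')
epochs=range(1,hist.shape[1]+1)
plt.plot(epochs,hist[0],label='train acc')
plt.plot(epochs,hist[1],label='val acc')
plt.xlabel('Epoch')
plt.legend()
plt.show()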

Now that the model is trained, we use it on the test set to see how it performs.

#  Evaluate the trained model on the test set
import h5py
import seaborn as sns
from tensorflow.keras.models import load_model

def evaluate_textcnn(modelpath):
    model=load_model(modelpath)
    y_pred=model.predict(x_test,batch_size=64,verbose=0)
    # Convert one-hot labels and predicted probabilities to class indices
    y_true_classes=y_test.argmax(axis=1)
    y_pred_classes=y_pred.argmax(axis=1)
    cm=confusion_matrix(y_true_classes,y_pred_classes)

    # Plot the confusion matrix
    category_labels = ['Space', 'Computer', 'Art', 'Environment', 'Agriculture', 'Economy', 'Politics', 'Sports',
                       'History']
    cm_normalised = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    sns.set(font_scale=1.5)
    fig, ax = plt.subplots(figsize=(10, 10))
    ax = sns.heatmap(cm_normalised, annot=True, linewidths=0, square=False,
                     cmap="Greens", yticklabels=category_labels, xticklabels=category_labels, vmin=0,
                     vmax=np.max(cm_normalised),
                     fmt=".2f", annot_kws={"size": 20})
    ax.set(xlabel='Predicted label', ylabel='True label')

    # Print the classification report
    print("Classification Report")
    print(classification_report(y_true_classes, y_pred_classes, digits=4))
