(NLP) 07: Training a Chinese text classification model with fastText

Keywords: Programming Windows Android Mobile Anaconda

Two other ways to install fastText

Option 1: install with conda (slow):

https://anaconda.org/conda-forge/fasttext

Option 2: on Windows, fastText can be installed from a prebuilt wheel (e.g. fasttext-0.9.1-cp36-cp36m-win32.whl) downloaded from the page below.

https://www.lfd.uci.edu/~gohlke/pythonlibs/#fasttext

fastText Training

import fastText
import fastText
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix,precision_recall_fscore_support

# Train
'''

dtrain.txt and dtest.txt The data format is as follows (one sample per line:
a "__label__<id>" tag, a space, then the text):

__label__2 China Daily
__label__0 The two men were arrested and charged with illegality by the police
__label__3 US Army First in Mid-Decade Navigation Process
__label__1 A win-win situation for brand users behind the joint efforts of Qiangqiang and Qiangqiang
'''
model = fastText.train_supervised(
    '../data/dtrain.txt',
    lr=0.1,
    dim=200,
    epoch=50,
    neg=5,
    wordNgrams=2,
    label="__label__",
)

# Evaluate on the held-out test file. Print the value returned by test();
# the original printed `y_pred`, which is not defined at this point.
# NOTE(review): per the fastText Python docs test() returns
# (sample count, precision@1, recall@1) -- confirm for the installed version.
result = model.test('../data/dtest.txt')
print('result = ', result)

# Save model
# NOTE(review): the file is written by fastText's own save_model(), so despite
# the ".pkl" suffix it is presumably not a pickle; it is read back with
# fastText.load_model().
model_path = '../model/fastText_model.pkl'
model.save_model(model_path)

# metrics for calculating classification
#Draw precision, recall, f1-score, support report tables
def eval_model(y_true, y_pred, labels):
    """Build a per-class precision / recall / F1 / support report.

    Args:
        y_true: Ground-truth class labels, one per sample.
        y_pred: Predicted class labels, aligned with ``y_true``.
        labels: Class labels to report, in the desired row order.

    Returns:
        A DataFrame with the columns Label, Precision, Recall, F1 and Support:
        one row per entry of ``labels``, followed by a support-weighted
        average row labelled 'population' at index 999.
    """
    # Pass ``labels`` explicitly. Without it scikit-learn returns the scores
    # in sorted label order, which does not line up with the 'Label' column
    # unless the caller happens to pass sorted labels.
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred, labels=labels)
    # Support-weighted averages over all classes.
    tot_p = np.average(p, weights=s)
    tot_r = np.average(r, weights=s)
    tot_f1 = np.average(f1, weights=s)
    tot_s = np.sum(s)
    res1 = pd.DataFrame({
        u'Label': labels,
        u'Precision': p,
        u'Recall': r,
        u'F1': f1,
        u'Support': s
    })
    res2 = pd.DataFrame({
        u'Label': ['population'],
        u'Precision': [tot_p],
        u'Recall': [tot_r],
        u'F1': [tot_f1],
        u'Support': [tot_s]
    })
    # Give the summary row a distinctive index so it stands apart from the
    # per-class rows after concatenation.
    res2.index = [999]
    res = pd.concat([res1, res2])
    return res[['Label', 'Precision', 'Recall', 'F1', 'Support']]

# Category name -> numeric id used in the "__label__<id>" tags.
cate_dic = {'entertainment': 0, 'technology': 1, 'sports': 2, 'military': 3, 'car': 4}
# "__label__<id>" tag -> category name.
dict_cate = {'__label__{}'.format(v): k for k, v in cate_dic.items()}
y_true = []
y_pred = []
with open('../data/dtest.txt', 'r', encoding='utf-8') as f:
    # Iterate the file lazily instead of loading every line with readlines().
    for line in f:
        line = line.strip()
        if not line:
            # A blank line has no label and would raise KeyError below.
            continue
        # First field is the label tag, the remainder is the sample text.
        label, _, text = line.partition(" ")
        y_true.append(dict_cate[label])
        # Use `model`, the classifier trained above; `clf` is only created
        # later when the saved model is reloaded.
        y_pred_results = model.predict([text])[0][0][0]
        y_pred.append(dict_cate[y_pred_results])
print("y_true = ", y_true[:5])
print("y_pred = ", y_pred[:5])
print('y_true length = ', len(y_true))
print('y_pred length = ', len(y_pred))

print('keys = ', list(cate_dic.keys()))

y_true =  ['sports', 'car', 'car', 'technology', 'entertainment']
y_pred =  ['sports', 'car', 'car', 'technology', 'entertainment']
y_true length =  87581
y_pred length =  87581
keys =  ['entertainment', 'technology', 'sports', 'military', 'car']

# Report per-category metrics, with rows in the order the categories appear in cate_dic.
eval_model(y_true, y_pred, list(cate_dic))

	Label	Precision	Recall	F1	Support
0	entertainment	0.934803	0.827857	0.878086	8400
1	technology	0.906027	0.923472	0.914666	26696
2	sports	0.881885	0.911727	0.896558	11555
3	military	0.943886	0.931749	0.937778	22476
4	car	0.857226	0.873252	0.865165	18454
999	population	0.905035	0.904294	0.904270	87581

Simulated on-line prediction

# Reload the classifier from the file written by save_model().
model_path = '../model/fastText_model.pkl'
clf = fastText.load_model(model_path)

# Category name -> numeric id.
cate_dic = {'entertainment': 0, 'technology': 1, 'sports': 2, 'military': 3, 'car': 4}
print(cate_dic)

# "__label__<id>" tag -> category name, used to decode predictions.
dict_cate = {'__label__{}'.format(idx): name for name, idx in cate_dic.items()}
print(dict_cate)

{'entertainment': 0, 'technology': 1, 'sports': 2, 'military': 3, 'car': 4}
{'__label__0': 'entertainment', '__label__1': 'technology', '__label__2': 'sports', '__label__3': 'military', '__label__4': 'car'}

Prediction Case 1 - Car category
Headline taken from Toutiao ("Today's Headlines"): https://www.toutiao.com/a6714271125473346055/

import jieba
text = "Audi A3,1 series and Mercedes Benz A Three enemies who have been entangled by Grade A"
# Segment with jieba (lcut already returns a list), then join the tokens
# with spaces to form the single line passed to the classifier.
words = jieba.lcut(text)
print('words = ', words)
data = " ".join(words)

# predict() is given a one-element list; [0][0][0] picks the top label of
# that single input.
results = clf.predict([data])
y_pred = results[0][0][0]
print("y_pred results = ", dict_cate[y_pred])

words =  ['Audi', 'A3', ',', 'BMW', '1', 'system', 'and', 'Benz', 'A', 'level', 'always', 'Entanglement', 'Endless', 'Of', 'Three', 'Enemy']
y_pred results =  car

Prediction Case 2 - Military category
Headline taken from Toutiao ("Today's Headlines"): https://www.toutiao.com/a6714188329937535496/

import jieba
text = "Who said the relics could only lie in the museum and want to buy a dream fighter for a ride?"
# Segment with jieba and join the tokens with spaces for the classifier.
words = jieba.lcut(text)
print('words = ', words)
data = " ".join(words)
# Top label of the single input text.
results = clf.predict([data])
y_pred = results[0][0][0]
print("y_pred results = ", dict_cate[y_pred])

words =  ['who', 'say', 'Cultural Relic', 'Can only', 'lie', 'stay', 'Museum', '，', 'Think', 'buy', 'One frame', 'Dream', 'in', 'Of', 'Fighter', 'Open', 'A ride', 'Do you', '？']
y_pred results =  military

Prediction Case 3 - Entertainment category
Headline taken from Toutiao ("Today's Headlines"): https://www.toutiao.com/a6689675139333751299/ (the article title is copied and used as the input text)

import jieba
text = "A: from perfect Lin Daiyu to her family after hundreds of millions of dollars, she became a legend."
# Segment with jieba and join the tokens with spaces for the classifier.
words = jieba.lcut(text)
print('words = ', words)
data = " ".join(words)
# Top label of the single input text.
results = clf.predict([data])
y_pred = results[0][0][0]
print("y_pred results = ", dict_cate[y_pred])

words =  ['Chen Xiaoxu', ': ', 'from', 'perfect', 'Lin Daiyu', 'reach', 'oneself and one's family', 'too', 'After billion', 'Shaving', 'Become a monk', '，', 'she', 'In the play', 'Off stage', 'all', 'yes', 'Legend']
y_pred results =  entertainment

Prediction Case 4 - Sports category
Headline taken from Toutiao ("Today's Headlines"): https://www.toutiao.com/a6714266792253981192/

import jieba
text = "Between the sexes there should be a prudent reserve! Guo Ping's main players take part in T2 The men's team of the League stands and eats a table meal for the women's self-help team"
# Segment with jieba and join the tokens with spaces for the classifier.
words = jieba.lcut(text)
print('words = ', words)
data = " ".join(words)
# Top label of the single input text.
results = clf.predict([data])
y_pred = results[0][0][0]
print("y_pred results = ", dict_cate[y_pred])

words =  ['Between the sexes there should be a prudent reserve', '!', 'National ping pong', 'Main force', 'participate in', 'Malaysia', 'T2', 'League match', ' ', 'Men's team', 'station', 'means', 'eat', 'Self-help', 'Women's team', 'eat', 'Table meal']
y_pred results =  sports

Prediction Case 5 - Technology category

import jieba
text = "Xie One Macro Will be the latest Android One Intelligent mobile phone"
# Segment with jieba and join the tokens with spaces for the classifier.
words = jieba.lcut(text)
print('words = ', words)
data = " ".join(words)
# Top label of the single input text.
results = clf.predict([data])
y_pred = results[0][0][0]
print("y_pred results = ", dict_cate[y_pred])

words =  ['Xie', 'One', ' ', 'Macro', 'take', 'yes', 'Newest', 'A paragraph', 'Android', ' ', 'One', 'Intelligent mobile phone']
y_pred results =  technology

Flask Web Services Online Prediction

http://127.0.0.1:5000/v1/p?q=xxxxx

Here, q is the text to be classified.

# -*- coding: UTF-8 -*-
import jieba
import fastText
from flask import Flask
from flask import request

# Create the web application and load the classifier once at import time so
# every request reuses the same model object.
app = Flask(__name__)
model_path = '../model/fastText_model.pkl'
clf = fastText.load_model(model_path)
# Category name -> numeric id, and the reverse map keyed by "__label__<id>".
cate_dic = {'entertainment': 0, 'technology': 1, 'sports': 2, 'military': 3, 'car': 4}
dict_cate = {'__label__{}'.format(idx): name for name, idx in cate_dic.items()}
print(dict_cate)


@app.route('/')
def hello_world():
    """Return a fixed greeting for the site root."""
    greeting = 'Hello World!'
    return greeting


@app.route('/v1/p', methods=['POST', 'GET'])
def predict():
    """Classify the text passed as ``q`` and return its category name.

    ``q`` is read from the form body on POST and from the query string on GET.

    Returns:
        The category name as the response body, or an HTTP 400 response when
        ``q`` is missing or contains no text.
    """
    if request.method == 'POST':
        q = request.form.get('q', '')
    else:
        q = request.args.get('q', '')

    print('input data:', q)
    words = jieba.lcut(q)
    print('words = ', words)
    # Collapse every whitespace run to a single space: fastText's predict()
    # raises ValueError on text containing a newline, and this also turns
    # whitespace-only input into an empty string.
    data = " ".join(" ".join(words).split())
    if not data:
        # Without any text there is no label to index, which would otherwise
        # surface as an IndexError / HTTP 500.
        return 'Missing or empty parameter: q', 400
    results = clf.predict([data])
    y_pred = results[0][0][0]
    return dict_cate[y_pred]

# Start the Flask server only when this file is run directly, not when it is
# imported as a module.
if __name__ == '__main__':
    app.run()

Posted by Unforgiven on Sun, 06 Oct 2019 11:44:46 -0700

Programmer Group

(NLP): 07 fastText training Chinese model text categorization

Two other ways to install fastText

fastText Training

Simulated on-line prediction

Flask Web Services Online Prediction

Hot Keywords