Two other ways to install fastText
Install with conda (this can be slow):
https://anaconda.org/conda-forge/fasttext
On Windows, fastText can be installed from a prebuilt wheel (WHL) file, e.g. fasttext-0.9.1-cp36-cp36m-win32.whl:
https://www.lfd.uci.edu/~gohlke/pythonlibs/#fasttext
fastText Training
import fastText import fastText import pandas as pd import numpy as np from sklearn.metrics import confusion_matrix,precision_recall_fscore_support
# Train a supervised fastText classifier, evaluate it, and save it.
#
# dtrain.txt and dtest.txt hold one sample per line: a "__label__<id>" prefix
# followed by the text, for example:
#   __label__2 China Daily
#   __label__0 The two men were arrested and charged with illegality by the police
#   __label__3 US Army First in Mid-Decade Navigation Process
#   __label__1 A win-win situation for brand users behind the joint efforts of Qiangqiang and Qiangqiang
model = fastText.train_supervised(
    '../data/dtrain.txt',
    lr=0.1,
    dim=200,
    epoch=50,
    neg=5,
    wordNgrams=2,
    label="__label__",
)

# Evaluate on the held-out file. model.test() returns summary metrics for the
# whole file, not per-sample predictions, so print that value directly
# (the original printed `y_pred`, which is not defined at this point).
result = model.test('../data/dtest.txt')
print('test result = ', result)

# Save model
model_path = '../model/fastText_model.pkl'
model.save_model(model_path)
# Metrics for evaluating the classifier: a precision / recall / F1 / support
# report table, one row per category.
def eval_model(y_true, y_pred, labels):
    """Build a per-class precision / recall / F1 / support report.

    Args:
        y_true: Ground-truth class names, one per sample.
        y_pred: Predicted class names, aligned with ``y_true``.
        labels: Class names to report, in the row order wanted in the table.

    Returns:
        A pandas DataFrame with one row per entry of ``labels`` followed by a
        'population' row (index 999) that holds the support-weighted averages
        of precision, recall and F1 plus the total support.
    """
    # Pass ``labels`` through so the metric arrays follow the caller's order.
    # Without it scikit-learn orders the classes by sorted label value, and
    # the 'Label' column below would be paired with the wrong metric rows.
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred, labels=labels)

    # Support-weighted averages over all classes.
    tot_p = np.average(p, weights=s)
    tot_r = np.average(r, weights=s)
    tot_f1 = np.average(f1, weights=s)
    tot_s = np.sum(s)

    res1 = pd.DataFrame({
        u'Label': labels,
        u'Precision': p,
        u'Recall': r,
        u'F1': f1,
        u'Support': s,
    })
    res2 = pd.DataFrame({
        u'Label': ['population'],
        u'Precision': [tot_p],
        u'Recall': [tot_r],
        u'F1': [tot_f1],
        u'Support': [tot_s],
    })
    # Distinct index so the summary row stands apart from the class rows.
    res2.index = [999]
    res = pd.concat([res1, res2])
    return res[['Label', 'Precision', 'Recall', 'F1', 'Support']]


# Category name -> numeric id used in the "__label__<id>" prefixes.
cate_dic = {'entertainment': 0, 'technology': 1, 'sports': 2, 'military': 3, 'car': 4}
# Reverse lookup: fastText label string -> category name.
dict_cate = {'__label__{}'.format(v): k for k, v in cate_dic.items()}

y_true = []
y_pred = []
with open('../data/dtest.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line has no label and would raise KeyError below.
            continue
        # First space-separated field is the label, the rest is the text.
        label, _, text = line.partition(" ")
        y_true.append(dict_cate[label])
        # Use the model trained above; `clf` is only created later, when the
        # saved model is loaded back from disk.
        y_pred_results = model.predict([text])[0][0][0]
        y_pred.append(dict_cate[y_pred_results])

print("y_true = ", y_true[:5])
print("y_pred = ", y_pred[:5])
print('y_true length = ', len(y_true))
print('y_pred length = ', len(y_pred))
print('keys = ', list(cate_dic.keys()))
y_true = ['sports', 'car', 'car', 'technology', 'entertainment'] y_pred = ['sports', 'car', 'car', 'technology', 'entertainment'] y_true length = 87581 y_pred length = 87581 keys = ['entertainment', 'technology', 'sports', 'military', 'car']
# Build the classification report for every category in cate_dic.
eval_model(y_true, y_pred, list(cate_dic))
Index | Label | Precision | Recall | F1 | Support |
---|---|---|---|---|---|
0 | entertainment | 0.934803 | 0.827857 | 0.878086 | 8400 |
1 | technology | 0.906027 | 0.923472 | 0.914666 | 26696 |
2 | sports | 0.881885 | 0.911727 | 0.896558 | 11555 |
3 | military | 0.943886 | 0.931749 | 0.937778 | 22476 |
4 | car | 0.857226 | 0.873252 | 0.865165 | 18454 |
999 | population | 0.905035 | 0.904294 | 0.904270 | 87581 |
Simulated on-line prediction
# Load the classifier back from the file written by the training step.
model_path = '../model/fastText_model.pkl'
clf = fastText.load_model(model_path)
# Category name -> numeric id used in the "__label__<id>" prefixes.
cate_dic = {'entertainment': 0, 'technology': 1, 'sports': 2, 'military': 3, 'car': 4}
print(cate_dic)

# Reverse lookup: fastText label string -> category name.
dict_cate = {'__label__{}'.format(idx): name for name, idx in cate_dic.items()}
print(dict_cate)
{'entertainment': 0, 'technology': 1, 'sports': 2, 'military': 3, 'car': 4} {'__label__0': 'entertainment', '__label__1': 'technology', '__label__2': 'sports', '__label__3': 'military', '__label__4': 'car'}
- Forecast Case 1 - Automobile Category
From today's headlines: https://www.toutiao.com/a6714271125473346055/
import jieba

text = "Audi A3,1 series and Mercedes Benz A Three enemies who have been entangled by Grade A"

# Segment the headline with jieba, then rejoin the tokens with single spaces
# to form the line handed to the classifier.
words = jieba.lcut(text)
print('words = ',words)
data = " ".join(words)

# predict: take the first label returned for the single input line
results = clf.predict([data])
y_pred = results[0][0][0]
print("y_pred results = ",dict_cate[y_pred])
words = ['Audi', 'A3', ',', 'BMW', '1', 'system', 'and', 'Benz', 'A', 'level', 'always', 'Entanglement', 'Endless', 'Of', 'Three', 'Enemy'] y_pred results = car
- Forecast Case 2 - Military Category
From today's headlines: https://www.toutiao.com/a6714188329937535496/
import jieba

text = "Who said the relics could only lie in the museum and want to buy a dream fighter for a ride?"

# Segment the headline with jieba, then rejoin the tokens with single spaces
# to form the line handed to the classifier.
words = jieba.lcut(text)
print('words = ',words)
data = " ".join(words)

# predict: take the first label returned for the single input line
results = clf.predict([data])
y_pred = results[0][0][0]
print("y_pred results = ",dict_cate[y_pred])
words = ['who', 'say', 'Cultural Relic', 'Can only', 'lie', 'stay', 'Museum', ',', 'Think', 'buy', 'One frame', 'Dream', 'in', 'Of', 'Fighter', 'Open', 'A ride', 'Do you', '?'] y_pred results = military
- Forecast Case 3 - Entertainment
From today's headlines: https://www.toutiao.com/a6689675139333751299/ Copy the title for prediction
import jieba

text = "A: from perfect Lin Daiyu to her family after hundreds of millions of dollars, she became a legend."

# Segment the headline with jieba, then rejoin the tokens with single spaces
# to form the line handed to the classifier.
words = jieba.lcut(text)
print('words = ',words)
data = " ".join(words)

# Take the first label returned for the single input line.
results = clf.predict([data])
y_pred = results[0][0][0]
print("y_pred results = ",dict_cate[y_pred])
words = ['Chen Xiaoxu', ': ', 'from', 'perfect', 'Lin Daiyu', 'reach', 'oneself and one's family', 'too', 'After billion', 'Shaving', 'Become a monk', ',', 'she', 'In the play', 'Off stage', 'all', 'yes', 'Legend'] y_pred results = entertainment
- Forecast Case 4 - Sports
From today's headlines: https://www.toutiao.com/a6714266792253981192/
import jieba

text = "Between the sexes there should be a prudent reserve! Guo Ping's main players take part in T2 The men's team of the League stands and eats a table meal for the women's self-help team"

# Segment the headline with jieba, then rejoin the tokens with single spaces
# to form the line handed to the classifier.
words = jieba.lcut(text)
print('words = ',words)
data = " ".join(words)

# Take the first label returned for the single input line.
results = clf.predict([data])
y_pred = results[0][0][0]
print("y_pred results = ",dict_cate[y_pred])
words = ['Between the sexes there should be a prudent reserve', '!', 'National ping pong', 'Main force', 'participate in', 'Malaysia', 'T2', 'League match', ' ', 'Men's team', 'station', 'means', 'eat', 'Self-help', 'Women's team', 'eat', 'Table meal'] y_pred results = sports
- Forecast Case 5 - Science and Technology
import jieba

text = "Xie One Macro Will be the latest Android One Intelligent mobile phone"

# Segment the headline with jieba, then rejoin the tokens with single spaces
# to form the line handed to the classifier.
words = jieba.lcut(text)
print('words = ',words)
data = " ".join(words)

# Take the first label returned for the single input line.
results = clf.predict([data])
y_pred = results[0][0][0]
print("y_pred results = ",dict_cate[y_pred])
words = ['Xie', 'One', ' ', 'Macro', 'take', 'yes', 'Newest', 'A paragraph', 'Android', ' ', 'One', 'Intelligent mobile phone'] y_pred results = technology
Flask Web Services Online Prediction
http://127.0.0.1:5000/v1/p?q=xxxxx
where q is the text to classify.
# -*- coding: UTF-8 -*-
# Minimal Flask service that classifies a piece of text with the saved
# fastText model.
#   GET  /v1/p?q=<text>
#   POST /v1/p            (form field "q")
# The response body is the predicted category name.
import jieba
import fastText
from flask import Flask
from flask import request

app = Flask(__name__)

# Load the model once at start-up so every request reuses it.
model_path = '../model/fastText_model.pkl'
clf = fastText.load_model(model_path)

# Category name -> numeric id, and the reverse lookup from the fastText
# label string ("__label__<id>") back to the category name.
cate_dic = {'entertainment': 0, 'technology': 1, 'sports': 2, 'military': 3, 'car': 4}
dict_cate = {'__label__{}'.format(v): k for k, v in cate_dic.items()}
print(dict_cate)


@app.route('/')
def hello_world():
    """Return a fixed greeting from the root route."""
    return 'Hello World!'


@app.route('/v1/p', methods=['POST', 'GET'])
def predict():
    """Classify the text passed as ``q`` and return its category name.

    ``q`` is read from the form body on POST and from the query string on
    GET. A missing or blank ``q`` yields a 400 response instead of being
    passed to the model.
    """
    if request.method == 'POST':
        q = request.form.get('q', '')
    else:
        q = request.args.get('q', '')
    print('input data:', q)

    # Drop whitespace-only tokens: fastText's predict() rejects text that
    # contains a newline, and q comes straight from the client.
    words = [word for word in jieba.lcut(q) if word.strip()]
    print('words = ', words)
    if not words:
        return 'missing or empty parameter: q', 400

    data = " ".join(words)
    results = clf.predict([data])
    # First label returned for the single input line.
    y_pred = results[0][0][0]
    return dict_cate[y_pred]


if __name__ == '__main__':
    app.run()