# Naive Bayes algorithm: News Classification (Sklearn Implementation)

Keywords: Python Lambda encoding github less

1. Steps of news classification based on Naive Bayes

(2) Prepare data

The data set is divided into training set and test set, and the jieba module is used for word segmentation, word frequency statistics, stop word filtering, text feature extraction, and text data is quantified

Stop-word file: stopwords_cn.txt

jieba module learning: https://github.com/fxsjy/jieba ；    https://www.oschina.net/p/jieba

(3) Analyze data: analyze with matplotlib module

(4) Training algorithm: use the MultinomialNB of sklearn.naive_bayes for training

There are three naive Bayesian classification algorithm classes in scikit-learn: GaussianNB, MultinomialNB and BernoulliNB.

Among them, GaussianNB is naive Bayes with a Gaussian-distribution likelihood, MultinomialNB is naive Bayes with a multinomial-distribution likelihood, and BernoulliNB is naive Bayes with a Bernoulli-distribution likelihood.

(5) Test algorithm: Test Bayesian classifier with test set

2. Code implementation

```# -*- coding: UTF-8 -*-
import os
import random
import jieba
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt

"""
//Function Description: Chinese text processing
Parameters:
folder_path - Path of text storage
test_size - Percentage of test sets, 20% of all data sets by default
Returns:
all_words_list - Training set list sorted in descending order of word frequency
train_data_list - Training set list
test_data_list - Test set list
train_class_list - Training set label list
test_class_list - Test set label list
"""
def TextProcessing(folder_path, test_size=0.2):
folder_list = os.listdir(folder_path)  # View files under folder path
data_list = []  # Dataset data
class_list = []  # Dataset category
# Traverse each subfolder
for folder in folder_list:
new_folder_path = os.path.join(folder_path, folder)  # Generate a new path based on subfolders
files = os.listdir(new_folder_path)  # List of txt files under subfolders
j = 1
# Traverse each txt file
for file in files:
if j > 100:  # Max. 100 samples of each type of txt
break
with open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as f:  # Open txt file

word_cut = jieba.cut(raw, cut_all=False)  # Reduced mode, return an iterative generator
word_list = list(word_cut)  # generator to list

j += 1
data_class_list = list(zip(data_list, class_list))  # zip compression merges data and labels
random.shuffle(data_class_list)  # Disorder data class list
index = int(len(data_class_list) * test_size) + 1  # Index value of training set and test set segmentation
train_list = data_class_list[index:]  # Training set
test_list = data_class_list[:index]  # Test set
train_data_list, train_class_list = zip(*train_list)  # Training set decompression
test_data_list, test_class_list = zip(*test_list)  # Test set decompression

all_words_dict = {}  # Statistical training set word frequency
for word_list in train_data_list:
for word in word_list:
if word in all_words_dict.keys():
all_words_dict[word] += 1
else:
all_words_dict[word] = 1

# Sort by the value of the key in reverse order
all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)
all_words_list, all_words_nums = zip(*all_words_tuple_list)  # decompression
all_words_list = list(all_words_list)  # Convert to list
return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list

"""
//Function Description: read the contents of the file and de duplicate
Parameters:
words_file - File path
Returns:
words_set - Of the read content set aggregate
"""
def MakeWordsSet(words_file):
words_set = set()  # Create set set
with open(words_file, 'r', encoding='utf-8') as f:  # Open file
word = line.strip()  # Go back to the train.
if len(word) > 0:  # If there is text, it will be added to words set
return words_set  # Return processing results

"""
//Function Description: text feature selection
Parameters:
all_words_list - Training set all text list
deleteN - Delete the word with the highest frequency deleteN Word
stopwords_set - Specified closing
Returns:
feature_words - Feature set
"""
def words_dict(all_words_list, deleteN, stopwords_set=set()):
feature_words = []  # Feature list
n = 1
for t in range(deleteN, len(all_words_list), 1):
if n > 1000:  # The dimension of feature words is 1000
break
# If the word is not a number, and is not the specified ending, and the word length is greater than 1 and less than 5, then the word can be used as a feature word
if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(all_words_list[t]) < 5:
feature_words.append(all_words_list[t])
n += 1
return feature_words

"""
//Function Description: quantifies the text according to the feature
Parameters:
train_data_list - Training set
test_data_list - Test set
feature_words - Feature set
Returns:
train_feature_list - Training set vector quantization list
test_feature_list - Test set vectorization list
"""
def TextFeatures(train_data_list, test_data_list, feature_words):
def text_features(text, feature_words):  # If it appears in the feature set, set 1
text_words = set(text)
features = [1 if word in text_words else 0 for word in feature_words]
return features

train_feature_list = [text_features(text, feature_words) for text in train_data_list]
test_feature_list = [text_features(text, feature_words) for text in test_data_list]
return train_feature_list, test_feature_list  # Return result

"""
//Function Description: News classifier
Parameters:
train_feature_list - Training set vectorized feature text
test_feature_list - Test set vectorized feature text
train_class_list - Training set category label
test_class_list - Test set classification label
Returns:
test_accuracy - Classifier accuracy
"""
def TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list):
classifier = MultinomialNB().fit(train_feature_list, train_class_list)
test_accuracy = classifier.score(test_feature_list, test_class_list)
return test_accuracy

if __name__ == '__main__':
    # Text preprocessing: the training corpus lives under folder_path.
    folder_path = './SogouC/Sample'
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = \
        TextProcessing(folder_path, test_size=0.2)

    # Build the stop-word set.
    stopwords_file = './stopwords_cn.txt'
    stopwords_set = MakeWordsSet(stopwords_file)

    test_accuracy_list = []
    # Uncomment to sweep deleteN and plot accuracy against it:
    # deleteNs = range(0, 1000, 20)  # 0 20 40 60 ... 980
    # for deleteN in deleteNs:
    #     feature_words = words_dict(all_words_list, deleteN, stopwords_set)
    #     train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
    #     test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
    #     test_accuracy_list.append(test_accuracy)
    # plt.figure()
    # plt.plot(deleteNs, test_accuracy_list)
    # plt.title('Relationship of deleteNs and test_accuracy')
    # plt.xlabel('deleteNs')
    # plt.ylabel('test_accuracy')
    # plt.show()

    # Single run with the 450 most frequent words removed.
    feature_words = words_dict(all_words_list, 450, stopwords_set)
    train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
    test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
    test_accuracy_list.append(test_accuracy)
    ave = lambda c: sum(c) / len(c)
    print(ave(test_accuracy_list))
```

The results are as follows:

Posted by AdRock on Thu, 09 Jan 2020 08:16:32 -0800