Naive Bayes algorithm: News Classification (Sklearn Implementation)

Keywords: Python Lambda encoding github less

1. Steps of news classification based on Naive Bayes

(1) Provide text file, i.e Dataset Download

(2) Prepare data

The data set is divided into training set and test set, and the jieba module is used for word segmentation, word frequency statistics, stop word filtering, text feature extraction, and text data is quantified

Stop-word file: stopwords_cn.txt

For jieba module usage, see the official jieba documentation.

(3) Analyze data: analyze with matplotlib module

(4) Training algorithm: use the MultinomialNB of sklearn.naive_bayes for training

         See the official scikit-learn documentation to learn how to build a naive Bayesian classifier.

There are three naive Bayesian classification algorithm classes in scikit-learn. They are GaussianNB, MultinomialNB and BernoulliNB.

Among them, GaussianNB is naive Bayes with a Gaussian-distribution prior, MultinomialNB is naive Bayes with a multinomial-distribution prior, and BernoulliNB is naive Bayes with a Bernoulli-distribution prior.

(5) Test algorithm: Test Bayesian classifier with test set

2. Code implementation

# -*- coding: UTF-8 -*-
import os
import random
import jieba
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
def TextProcessing(folder_path, test_size=0.2):
    """Load the news corpus, segment it with jieba, and split into train/test sets.

    Parameters:
        folder_path - root directory of the corpus; each subfolder is one news category
        test_size - fraction of all samples held out as the test set (default 0.2)

    Returns:
        all_words_list - training-set vocabulary sorted by descending word frequency
        train_data_list - training samples (each a list of segmented words)
        test_data_list - test samples (each a list of segmented words)
        train_class_list - training-set labels (subfolder names)
        test_class_list - test-set labels (subfolder names)
    """
    folder_list = os.listdir(folder_path)  # one subfolder per category
    data_list = []  # dataset samples
    class_list = []  # dataset labels
    # Traverse each category subfolder
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)  # path of this category
        files = os.listdir(new_folder_path)  # list of txt files in this category
        j = 1
        # Traverse each txt file
        for file in files:
            if j > 100:  # cap at 100 samples per category
                break
            with open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as f:
                raw = f.read()
            word_cut = jieba.cut(raw, cut_all=False)  # accurate mode; returns a generator
            word_list = list(word_cut)  # materialize the generator
            data_list.append(word_list)  # add sample
            class_list.append(folder)  # add label
            j += 1
    data_class_list = list(zip(data_list, class_list))  # pair samples with labels
    random.shuffle(data_class_list)  # shuffle before splitting
    index = int(len(data_class_list) * test_size) + 1  # split point between test and train
    train_list = data_class_list[index:]  # training set
    test_list = data_class_list[:index]  # test set
    train_data_list, train_class_list = zip(*train_list)  # unzip training pairs
    test_data_list, test_class_list = zip(*test_list)  # unzip test pairs
    # Count word frequencies over the training set only
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict.keys():
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1
    # Sort vocabulary by frequency, highest first
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)
    all_words_list, all_words_nums = zip(*all_words_tuple_list)  # unzip words and counts
    all_words_list = list(all_words_list)  # convert tuple to list
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list
def MakeWordsSet(words_file):
    """Read a file line by line and return the de-duplicated set of words.

    Parameters:
        words_file - path of the file to read (e.g. the stop-word list)

    Returns:
        words_set - set of the non-empty, whitespace-stripped lines
    """
    words_set = set()  # result set (deduplicates automatically)
    with open(words_file, 'r', encoding='utf-8') as f:
        for line in f:  # read line by line
            word = line.strip()  # drop surrounding whitespace and newline
            if len(word) > 0:  # keep only non-empty lines
                words_set.add(word)
    return words_set
def words_dict(all_words_list, deleteN, stopwords_set=set()):
    """Select feature words from the frequency-sorted vocabulary.

    Parameters:
        all_words_list - vocabulary sorted by descending word frequency
        deleteN - skip the deleteN most frequent words
        stopwords_set - words to exclude (default empty set; never mutated here,
                        so the shared mutable default is safe)

    Returns:
        feature_words - up to 1000 selected feature words
    """
    feature_words = []  # selected features
    n = 1
    for t in range(deleteN, len(all_words_list), 1):
        if n > 1000:  # cap the feature dimension at 1000
            break
        # A word qualifies as a feature if it is not purely numeric, is not a
        # stop word, and its length is between 2 and 4 characters
        if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(all_words_list[t]) < 5:
            feature_words.append(all_words_list[t])
        n += 1
    return feature_words
def TextFeatures(train_data_list, test_data_list, feature_words):
    """Vectorize samples as binary presence indicators over feature_words.

    Parameters:
        train_data_list - training samples (each a list of words)
        test_data_list - test samples (each a list of words)
        feature_words - selected feature words

    Returns:
        train_feature_list - training-set 0/1 feature vectors
        test_feature_list - test-set 0/1 feature vectors
    """
    def text_features(text, feature_words):
        # Build a set so each membership test is O(1)
        text_words = set(text)
        # 1 if the feature word appears in the sample, else 0
        return [1 if word in text_words else 0 for word in feature_words]
    train_feature_list = [text_features(text, feature_words) for text in train_data_list]
    test_feature_list = [text_features(text, feature_words) for text in test_data_list]
    return train_feature_list, test_feature_list
def TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list):
    """Train a multinomial naive Bayes classifier and score it on the test set.

    Parameters:
        train_feature_list - training-set vectorized feature text
        test_feature_list - test-set vectorized feature text
        train_class_list - training-set category labels
        test_class_list - test-set category labels

    Returns:
        test_accuracy - mean accuracy of the classifier on the test set
    """
    classifier = MultinomialNB().fit(train_feature_list, train_class_list)
    test_accuracy = classifier.score(test_feature_list, test_class_list)
    return test_accuracy
if __name__ == '__main__':
    # Text preprocessing: load, segment, and split the corpus
    folder_path = './SogouC/Sample'  # training corpus root directory
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2)
    # Build the stop-word set
    stopwords_file = './stopwords_cn.txt'
    stopwords_set = MakeWordsSet(stopwords_file)
    # Evaluate accuracy as a function of deleteN (how many top-frequency words are dropped)
    test_accuracy_list = []
    deleteNs = range(0, 1000, 20)  # 0 20 40 60 ... 980
    for deleteN in deleteNs:
        feature_words = words_dict(all_words_list, deleteN, stopwords_set)
        train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
        test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
        test_accuracy_list.append(test_accuracy)  # collect one accuracy per deleteN
    # Plot accuracy versus deleteN
    plt.plot(deleteNs, test_accuracy_list)
    plt.title('Relationship of deleteNs and test_accuracy')
    plt.xlabel('deleteNs')
    plt.ylabel('test_accuracy')
    plt.show()
    # Final run with a fixed deleteN and report results
    feature_words = words_dict(all_words_list, 450, stopwords_set)
    train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
    test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
    ave = lambda c: sum(c) / len(c)
    print('Average accuracy over all deleteNs:', ave(test_accuracy_list))
    print('Accuracy with deleteN=450:', test_accuracy)

The results are as follows:

Posted by AdRock on Thu, 09 Jan 2020 08:16:32 -0800