1. Steps of news classification based on Naive Bayes
(1) Provide the text files, i.e., download the dataset
(2) Prepare data
The dataset is split into a training set and a test set. The jieba module is used for word segmentation; word frequencies are then counted, stop words are filtered out, text features are selected, and the text data is vectorized.
Stop word file: stopwords_cn.txt (the stop word list used in the code below)
jieba module references: https://github.com/fxsjy/jieba ; https://www.oschina.net/p/jieba (a short segmentation sketch follows this list of steps)
(3) Analyze data: visualize the results with the matplotlib module
(4) Training algorithm: use MultinomialNB from sklearn.naive_bayes for training
See the official scikit-learn documentation for how to build a naive Bayes classifier.
There are three naive Bayes classifier classes in scikit-learn: GaussianNB, MultinomialNB and BernoulliNB.
GaussianNB assumes the features follow a Gaussian distribution, MultinomialNB assumes a multinomial distribution, and BernoulliNB assumes a Bernoulli (binary) distribution. (A small usage sketch of the three classes follows this list of steps.)
(5) Test algorithm: test the naive Bayes classifier with the test set
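Before moving on to the full implementation, here is a minimal sketch of the preparation described in step (2): word segmentation with jieba plus stop word filtering. The sample sentence is taken from the jieba README, and the stop word file name matches the one used in the code below; everything else is illustration only, not part of the final program.

import jieba

# Minimal sketch of step (2): word segmentation plus stop word filtering.
text = '小明硕士毕业于中国科学院计算所，后在日本京都大学深造'   # example sentence from the jieba README
words = list(jieba.cut(text, cut_all=False))                      # precise mode segmentation

# Load the stop word list (one word per line) and drop stop words and single characters
with open('./stopwords_cn.txt', 'r', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f if line.strip()}
filtered = [w for w in words if w not in stopwords and len(w) > 1]
print(filtered)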
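Likewise, a hedged sketch of how the three scikit-learn classes mentioned in step (4) are called; the toy feature matrix and labels below are invented purely for illustration.

import numpy as np
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# Toy data, invented for illustration: 6 documents, 4 binary word features.
X = np.array([[1, 0, 1, 0],
              [1, 1, 0, 0],
              [0, 0, 1, 1],
              [0, 1, 1, 1],
              [1, 0, 0, 0],
              [0, 1, 0, 1]])
y = np.array(['sports', 'sports', 'finance', 'finance', 'sports', 'finance'])

# All three classes share the same fit/predict/score interface.
for clf in (GaussianNB(), MultinomialNB(), BernoulliNB()):
    clf.fit(X, y)
    print(type(clf).__name__, clf.score(X, y))   # training-set accuracy, just a smoke test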
2. Code implementation
# -*- coding: UTF-8 -*-
import os
import random
import jieba
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt

"""
Function Description: Chinese text processing
Parameters:
    folder_path - path of the text storage
    test_size - percentage of the test set, 20% of the whole dataset by default
Returns:
    all_words_list - list of all training-set words, sorted in descending order of frequency
    train_data_list - training set list
    test_data_list - test set list
    train_class_list - training set label list
    test_class_list - test set label list
"""
def TextProcessing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)                 # list the subfolders under folder_path
    data_list = []                                        # dataset data
    class_list = []                                       # dataset categories

    # Traverse each subfolder
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)  # build the path of the subfolder
        files = os.listdir(new_folder_path)                  # list of txt files in the subfolder

        j = 1
        # Traverse each txt file
        for file in files:
            if j > 100:                                   # at most 100 samples per category
                break
            with open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as f:  # open the txt file
                raw = f.read()

            word_cut = jieba.cut(raw, cut_all=False)      # precise mode, returns a generator
            word_list = list(word_cut)                    # generator to list

            data_list.append(word_list)                   # add the sample
            class_list.append(folder)                     # add the category (folder name)
            j += 1

    data_class_list = list(zip(data_list, class_list))    # zip data and labels together
    random.shuffle(data_class_list)                       # shuffle data_class_list
    index = int(len(data_class_list) * test_size) + 1     # index where training and test sets split
    train_list = data_class_list[index:]                  # training set
    test_list = data_class_list[:index]                   # test set
    train_data_list, train_class_list = zip(*train_list)  # unzip the training set
    test_data_list, test_class_list = zip(*test_list)     # unzip the test set

    all_words_dict = {}                                   # training-set word frequencies
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict.keys():
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1

    # Sort by value (frequency) in descending order
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)
    all_words_list, all_words_nums = zip(*all_words_tuple_list)  # unzip
    all_words_list = list(all_words_list)                 # convert to list
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list


"""
Function Description: read the contents of a file and deduplicate them
Parameters:
    words_file - file path
Returns:
    words_set - set of the words read from the file
"""
def MakeWordsSet(words_file):
    words_set = set()                                     # create an empty set
    with open(words_file, 'r', encoding='utf-8') as f:    # open the file
        for line in f.readlines():                        # read line by line
            word = line.strip()                           # strip the trailing newline
            if len(word) > 0:                             # if there is text, add it to words_set
                words_set.add(word)
    return words_set                                      # return the result


"""
Function Description: text feature selection
Parameters:
    all_words_list - list of all training-set words
    deleteN - delete the deleteN words with the highest frequency
    stopwords_set - specified stop words set
Returns:
    feature_words - feature set
"""
def words_dict(all_words_list, deleteN, stopwords_set=set()):
    feature_words = []                                    # feature list
    n = 1
    for t in range(deleteN, len(all_words_list), 1):
        if n > 1000:                                      # the dimension of the feature words is at most 1000
            break
        # A word can be used as a feature word if it is not a number, is not a stop word,
        # and its length is greater than 1 and less than 5
        if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set \
                and 1 < len(all_words_list[t]) < 5:
            feature_words.append(all_words_list[t])
        n += 1
    return feature_words


"""
Function Description: vectorize the text according to the feature words
Parameters:
    train_data_list - training set
    test_data_list - test set
    feature_words - feature set
Returns:
    train_feature_list - training set vectorization list
    test_feature_list - test set vectorization list
"""
def TextFeatures(train_data_list, test_data_list, feature_words):
    def text_features(text, feature_words):              # set 1 if a feature word appears in the text
        text_words = set(text)
        features = [1 if word in text_words else 0 for word in feature_words]
        return features
    train_feature_list = [text_features(text, feature_words) for text in train_data_list]
    test_feature_list = [text_features(text, feature_words) for text in test_data_list]
    return train_feature_list, test_feature_list          # return the result


"""
Function Description: news classifier
Parameters:
    train_feature_list - vectorized feature text of the training set
    test_feature_list - vectorized feature text of the test set
    train_class_list - training set category labels
    test_class_list - test set category labels
Returns:
    test_accuracy - classifier accuracy
"""
def TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list):
    classifier = MultinomialNB().fit(train_feature_list, train_class_list)
    test_accuracy = classifier.score(test_feature_list, test_class_list)
    return test_accuracy


if __name__ == '__main__':
    # Text preprocessing
    folder_path = './SogouC/Sample'                       # training set storage address
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = \
        TextProcessing(folder_path, test_size=0.2)

    # Generate the stop words set
    stopwords_file = './stopwords_cn.txt'
    stopwords_set = MakeWordsSet(stopwords_file)

    test_accuracy_list = []
    """
    deleteNs = range(0, 1000, 20)                         # 0 20 40 60 ... 980
    for deleteN in deleteNs:
        feature_words = words_dict(all_words_list, deleteN, stopwords_set)
        train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
        test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
        test_accuracy_list.append(test_accuracy)

    plt.figure()
    plt.plot(deleteNs, test_accuracy_list)
    plt.title('Relationship of deleteNs and test_accuracy')
    plt.xlabel('deleteNs')
    plt.ylabel('test_accuracy')
    plt.show()
    """
    feature_words = words_dict(all_words_list, 450, stopwords_set)
    train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
    test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
    test_accuracy_list.append(test_accuracy)
    ave = lambda c: sum(c) / len(c)
    print(ave(test_accuracy_list))
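As an aside, the hand-written TextFeatures above produces a binary bag-of-words representation; the same vectorization could plausibly be done with scikit-learn's CountVectorizer. This is only a sketch of an alternative, not part of the original program, and it assumes the variables feature_words, train_data_list and test_data_list produced by the script above.

from sklearn.feature_extraction.text import CountVectorizer

# Alternative sketch: binary bag-of-words over the selected feature words.
vectorizer = CountVectorizer(vocabulary=feature_words, binary=True,
                             analyzer=lambda tokens: tokens)   # documents are already token lists
train_feature_list = vectorizer.fit_transform(train_data_list)
test_feature_list = vectorizer.transform(test_data_list)
# MultinomialNB accepts the resulting sparse matrices directly, so TextClassifier() works unchanged.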
The results are as follows: