Machine learning 27: SVM / decision tree / random forest / KNN classification of the iris dataset

In this post, we use the sklearn library's SVM, KNN, decision tree, and random forest implementations to classify the iris dataset. The main purpose is to become familiar with the processing flow.
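Every example below follows the same four-step sklearn flow: load the data, split it into training and test sets, fit a model, and score it. As a minimal sketch of that shared flow (my condensation, with DecisionTreeClassifier standing in for any of the four classifiers):

# A minimal sketch of the shared workflow: load -> split -> fit -> score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.3, random_state=0)
model = DecisionTreeClassifier().fit(X_train, y_train)
print("Test accuracy:", model.score(X_test, y_test))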

1. SVM classification of the iris dataset:

# File function: SVM classification of the iris dataset
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

#[1] Read data set
data = load_iris()

#[2] Split features and labels
x = data.data[:, :2]    # use only the first two features
y = data.target
train_data, test_data, train_label, test_label = train_test_split(
    x, y, random_state=1, train_size=0.6, test_size=0.4)
print(train_data.shape)

#[3] Train the svm classifier
classifier = svm.SVC(C=2, kernel='rbf', gamma=10, decision_function_shape='ovo') # 'ovo': one-vs-one strategy ('ovr' would be one-vs-rest)
classifier.fit(train_data, train_label.ravel()) # ravel() flattens to 1-D (row-major order by default)

#[4] Compute the accuracy of the classifier
print("Training set:", classifier.score(train_data, train_label))
print("Test set:", classifier.score(test_data, test_label))

#[5] Accuracy can also be computed by calling accuracy_score directly
tra_label = classifier.predict(train_data)      # predicted labels for the training set
tes_label = classifier.predict(test_data)       # predicted labels for the test set
print("Training set:", accuracy_score(train_label, tra_label))
print("Test set:", accuracy_score(test_label, tes_label))


#[6] Inspect the decision function
print('train_decision_function:\n', classifier.decision_function(train_data))     # shape (90, 3): one column per class pair under 'ovo'
print('predict_result:\n', classifier.predict(train_data))

For more detail on SVM itself, see the related support vector machine blog posts.
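The decision_function_shape argument changes only the shape of the reported decision values, not the fitted model itself. A small sketch contrasting the two settings, reusing the train_data and train_label arrays from above:

# 'ovo' yields one column per pair of classes, 'ovr' one column per class;
# for the 3-class iris data both happen to be 3 columns wide, matching the
# (90, 3) shape noted above
for shape in ('ovo', 'ovr'):
    clf = svm.SVC(C=2, kernel='rbf', gamma=10, decision_function_shape=shape)
    clf.fit(train_data, train_label)
    print(shape, clf.decision_function(train_data).shape)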

2. KNN classification of the iris dataset:

# File function: classify the iris dataset with knn
from sklearn import datasets                           # sklearn's bundled datasets
from sklearn.model_selection import train_test_split   # split data into training and test sets
from sklearn.neighbors import KNeighborsClassifier     # knn classifier


# [1] Import training data
iris = datasets.load_iris() # load the iris dataset; it has 4 feature variables
iris_X = iris.data          # feature variables
iris_y = iris.target        # target values
# Use train_test_split to separate training and test sets, with the test set accounting for 30%
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=0.3)
print(y_train)              # the training labels fall into 3 classes


# [2] Run training
knn = KNeighborsClassifier()   # instantiate the classifier (the default is k=5)
knn.fit(X_train, y_train)      # fit on the training data


# [3] Predict
print(knn.predict(X_test))      # predicted labels
print(y_test)                   # true labels


# [4] Accuracy can also be computed by calling accuracy_score directly
from sklearn.metrics import accuracy_score
print("Test accuracy:", accuracy_score(y_test, knn.predict(X_test)))

3. Random forest classification of the iris dataset:

# File function: random forest classification of the iris dataset
"""
Random forest is used for both regression and classification, most often
classification. It trains many decision trees on the sample data and combines
their predictions; besides classifying, it also produces an importance score
for each variable, assessing each variable's role in the classification.
"""
"""
Construction of a random forest:
1. Use the bootstrap method to draw n samples from the original training set
   and build n decision trees;
2. At each split, select the best feature from m randomly chosen candidate
   features, and keep splitting until all training samples at a node belong
   to the same class;
3. Grow every decision tree to its maximum size without any pruning;
4. Classify (or regress) new data with the resulting forest: for
   classification the final result is the majority vote of the tree
   classifiers; for regression it is the mean of the trees' predictions.
"""

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris


RF = RandomForestClassifier(n_estimators=100, n_jobs=4, oob_score=True)
iris = load_iris()
x = iris.data[:, :2]   # first two features only, so the decision surface can be plotted in 2-D
y = iris.target
RF.fit(x, y)
# Plot the decision surface learned on the first two features
h = .02  # mesh step size
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
x_min, x_max = x[:, 0].min() - 1, x[:, 0].max() + 1
y_min, y_max = x[:, 1].min() - 1, x[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h)
)
z = RF.predict(np.c_[xx.ravel(), yy.ravel()])
z = z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, z, cmap=cmap_light)
plt.scatter(x[:, 0], x[:, 1], c=y, cmap=cmap_bold, edgecolors='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title('RandomForestClassifier')
plt.show()
print('RandomForestClassifier training accuracy:', RF.score(x, y))

This code is excerpted from the blog "Using random forest algorithm to realize iris case".
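The code above sets oob_score=True but never reads the result, and the introductory notes promise per-variable importance scores. A short follow-up sketch that surfaces both, this time fitting on all four features:

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

iris = load_iris()
rf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=0)
rf.fit(iris.data, iris.target)

# Out-of-bag accuracy: each tree is evaluated on the samples its bootstrap
# draw left out, giving a built-in estimate of generalization accuracy
print("OOB accuracy:", rf.oob_score_)

# Importance score of each variable, as described in the notes above
for name, imp in zip(iris.feature_names, rf.feature_importances_):
    print(name, round(imp, 3))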

4. Decision tree classification of the iris dataset:

(1) Calling the sklearn library to classify the iris dataset:

from sklearn import datasets                         # import sklearn's datasets module
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# [1] Load data set
iris = datasets.load_iris()                         # load the iris dataset
iris_feature = iris.data                            # feature data
iris_target = iris.target                           # label data


# [2] Dataset partition
feature_train, feature_test, target_train, target_test = train_test_split(iris_feature, iris_target, test_size=0.33, random_state=42)


# [3] Train the model
dt_model = DecisionTreeClassifier()                 # all parameters left at their defaults
dt_model.fit(feature_train, target_train)           # train the model on the training set
predict_results = dt_model.predict(feature_test)    # predict the test set with the trained model


# [4] Result evaluation
scores = dt_model.score(feature_test, target_test)
print(scores)
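With the model trained, the learned split rules can also be printed as text, which makes the tree's mechanics visible; a small sketch using sklearn's export_text:

from sklearn.tree import export_text

# Print the learned decision rules with readable feature names
print(export_text(dt_model, feature_names=iris.feature_names))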

(2) Classifying the iris dataset with a user-defined decision tree:

This code is excerpted from the blog "Iris data classified by decision tree". The implementation is laid out very clearly, and read alongside principle-oriented decision tree material such as the watermelon book, it makes the mechanics of a decision tree easy to follow.

# File function: decision tree classification of the iris dataset
# Overall idea of the code:
# 1. Preprocess the data; np.random.shuffle randomly selects 80% of the samples as the training set.
# 2. Discretize the feature values.
# 3. Recursively build the tree using information entropy.
# 4. Classify the remaining 20% (the test set) with the constructed tree and compute the accuracy.


from sklearn import datasets
import math
import numpy as np


# [1] Information entropy of a 3-class count vector arr over leng samples
def getInformationEntropy(arr, leng):
    # the conditional expressions guard against log(0)
    return -(arr[0] / leng * math.log(arr[0] / leng if arr[0] > 0 else 1)
             + arr[1] / leng * math.log(arr[1] / leng if arr[1] > 0 else 1)
             + arr[2] / leng * math.log(arr[2] / leng if arr[2] > 0 else 1))



# [2] Discretize the values of one feature
def discretization(index):
    feature1 = np.array([iris.data[:, index], iris.target]).T
    feature1 = feature1[feature1[:, 0].argsort()]

    counter1 = np.array([0, 0, 0])
    counter2 = np.array([0, 0, 0])

    resEntropy = 100000
    for i in range(len(feature1[:, 0])):
        counter1[int(feature1[i, 1])] = counter1[int(feature1[i, 1])] + 1
        counter2 = np.copy(counter1)
        for j in range(i + 1, len(feature1[:, 0])):
            counter2[int(feature1[j, 1])] = counter2[int(feature1[j, 1])] + 1
            # print(i,j,counter1,counter2)
            # Greedy search for the optimal pair of cut points
            if i != j and j != len(feature1[:, 0]) - 1:
                entropySum = ((i + 1) * getInformationEntropy(counter1, i + 1)
                              + (j - i) * getInformationEntropy(counter2 - counter1, j - i)
                              + (length - j - 1) * getInformationEntropy(np.array(num) - counter2, length - j - 1))
                if entropySum < resEntropy:
                    resEntropy = entropySum
                    res = np.array([i, j])
    res_value = [feature1[res[0], 0], feature1[res[1], 0]]
    print(res, resEntropy, res_value)
    return res_value
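# Note on discretization(index): it returns two cut values that divide feature
# `index` into three intervals, chosen greedily so that the total weighted
# entropy of the three resulting class-count vectors is minimized -- a
# two-threshold analogue of the usual single-threshold information-gain split.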


# [3] Calculate the appropriate split value
def getRazors():
    a = []
    for i in range(len(iris.feature_names)):
        print(i)
        a.append(discretization(i))
    return np.array(a)


# [4] Randomly split the data: 80% training set, 20% test set
def divideData():
    completeData = np.c_[iris.data, iris.target]   # append the labels as the last column
    np.random.shuffle(completeData)
    trainData = completeData[range(int(length * 0.8)), :]
    testData = completeData[range(int(length * 0.8), length), :]
    return [trainData, testData]


# [5] General entropy of a class-count vector
def getEntropy(counter):
    res = 0
    denominator = np.sum(counter)
    if denominator == 0:
        return 0
    for value in counter:
        if value == 0:
            continue
        res += value / denominator * math.log(value / denominator if value > 0 and denominator > 0 else 1)
    return -res
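# Quick sanity check of getEntropy (an added illustration): a pure node has
# entropy 0, and a uniform three-class node has entropy ln(3) ~= 1.0986
assert getEntropy([30, 0, 0]) == 0
assert abs(getEntropy([10, 10, 10]) - math.log(3)) < 1e-12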

# [6] Find maximum index
def findMaxIndex(dataSet):
    maxIndex = 0
    maxValue = -1
    for index, value in enumerate(dataSet):
        if value > maxValue:
            maxIndex = index
            maxValue = value
    return maxIndex



# [7] Recursively build the decision tree
def recursion(featureSet, dataSet, counterSet):
    if (counterSet[0] == 0 and counterSet[1] == 0 and counterSet[2] != 0):
        return iris.target_names[2]
    if (counterSet[0] != 0 and counterSet[1] == 0 and counterSet[2] == 0):
        return iris.target_names[0]
    if (counterSet[0] == 0 and counterSet[1] != 0 and counterSet[2] == 0):
        return iris.target_names[1]
    if len(featureSet) == 0:
        return iris.target_names[findMaxIndex(counterSet)]
    if len(dataSet) == 0:
        return []
    res = 1000
    final = 0
    # print("number of remaining features", len(featureSet))
    for feature in featureSet:
        i = razors[feature][0]
        j = razors[feature][1]
        # print("i = ",i," j = ",j)
        set1 = []
        set2 = []
        set3 = []
        counter1 = [0, 0, 0]
        counter2 = [0, 0, 0]
        counter3 = [0, 0, 0]
        for data in dataSet:
            index = int(data[-1])
            # print("data ",data," index ",index)

            if data[feature] < i:
                set1.append(data)
                counter1[index] = counter1[index] + 1
            elif data[feature] >= i and data[feature] <= j:
                set2.append(data)
                counter2[index] = counter2[index] + 1
            else:
                set3.append(data)
                counter3[index] = counter3[index] + 1

        a = (len(set1) * getEntropy(counter1) + len(set2) * getEntropy(counter2) + len(set3) * getEntropy(
            counter3)) / len(dataSet)
        # print("feature No.:, feature," entropy of information obtained by selecting this feature: ", a)
        if a < res:
            res = a
            final = feature
    # Returns the subscript of the selected feature
    # sequence.append(final)
    # print("the final feature number selected on this node is:", final)
    featureSet.remove(final)
    child = [0, 0, 0, 0]
    child[0] = final
    child[1] = recursion(featureSet, set1, counter1)
    child[2] = recursion(featureSet, set2, counter2)
    child[3] = recursion(featureSet, set3, counter3)
    return child
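# The returned tree is a nested list [feature_index, subtree_low, subtree_mid,
# subtree_high]; the three subtrees cover data[feature] < razors[feature][0],
# razors[feature][0] <= data[feature] <= razors[feature][1], and
# data[feature] > razors[feature][1]. Leaves are class-name strings such as
# 'setosa', and judge() below walks this structure.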


# [8] Decision making
def judge(data, tree):
    root = "unknow"
    while (len(tree) > 0):
        if isinstance(tree, str) and tree in iris.target_names:
            return tree
        root = tree[0]
        if (isinstance(root, str)):
            return root
        if isinstance(root, int):
            if data[root] < razors[root][0] and tree[1] != []:
                tree = tree[1]
            elif tree[2] != [] and (tree[1] == [] or (data[root] >= razors[root][0] and data[root] <= razors[root][1])):
                tree = tree[2]
            else:
                tree = tree[3]
    return root



# [9] Call
if __name__ == '__main__':
    iris = datasets.load_iris()
    num = [0, 0, 0]                  # per-class sample counts for the full dataset
    for t in iris.target:
        num[t] = num[t] + 1
    length = len(iris.target)
    [trainData, testData] = divideData()
    razors = getRazors()
    tree = recursion(list(range(len(iris.feature_names))), trainData,
                     [np.sum(trainData[:, -1] == 0), np.sum(trainData[:, -1] == 1), np.sum(trainData[:, -1] == 2)])
    print("The tree constructed from the selected training set: ", tree)
    index = 0
    right = 0
    for data in testData:
        result = judge(data, tree)
        truth = iris.target_names[int(data[-1])]
        print("result is ", result, "  truth is ", truth)
        index = index + 1
        if result == truth:
            right = right + 1
    print("Accuracy: ", right / index)

5. References

(1) Using random forest algorithm to realize iris case
(2) Iris data classified by decision tree
(3) sklearn tutorial of python

 
