# python implementation of iris data set for decision tree classification

The implementation is about 230 lines of code. Because the training set is sampled randomly on each run, the accuracy differs between runs; in the best case it reaches 83%.

##### The idea of eigenvalue discretization:

Since the final classification has three categories, I assume the value of each feature can also be divided into three intervals, i.e. we need to find two cut values per feature. To find them, a double `for` loop searches for the pair of indices `i` and `j` that minimizes the information entropy.

##### Code overall idea:

1. Process the data first, using the shuffle function to randomly select 80% of the samples as the training set.
2. Eigenvalue discretization
3. Use information entropy to construct tree recursively
4. Use the constructed tree to judge the remaining 20% of the test set, and find the accuracy of the algorithm for classification

```python
# coding: utf-8

# In:

from sklearn import datasets
import math
import numpy as np

# In:

def getInformationEntropy(arr, leng):
    """Return the Shannon entropy of the class-count vector `arr` over
    `leng` samples.

    arr  -- per-class sample counts (e.g. a length-3 numpy array)
    leng -- total number of samples in the partition

    BUG FIX: the pasted source lost the `[0]`/`[1]`/`[2]` subscripts and
    summed the same `arr/leng` term three times; this restores the sum of
    -p*log(p) over the individual class counts (zero counts contribute 0).
    """
    res = 0.0
    for count in arr:
        p = count / leng
        if p > 0:
            res -= p * math.log(p)
    return res

#informationEntropy = getInformationEntropy(num,length)
#print(informationEntropy)

# In:

#The value of discrete characteristic one
def discretization(index):
    """Find two cut points for feature `index` that minimize the weighted
    information entropy of the resulting three intervals (exhaustive scan
    over all index pairs i < j).

    Reads module globals `iris`, `num` (overall per-class counts) and
    `length` (total sample count). Returns [low_value, high_value], the
    feature values at the best pair of cut indices.
    """
    # Pair the feature column with the labels, then sort by feature value.
    feature1 = np.array([iris.data[:, index], iris.target]).T
    feature1 = feature1[feature1[:, 0].argsort()]

    n = len(feature1[:, 0])
    resEntropy = 100000
    res = np.array([0, 0])  # initialized so `res` is bound even if no pair qualifies
    counter1 = np.array([0, 0, 0])
    for i in range(n):
        # counter1 holds class counts of samples 0..i (the first interval).
        counter1[int(feature1[i, 1])] += 1
        counter2 = np.copy(counter1)
        for j in range(i + 1, n):
            # counter2 holds class counts of samples 0..j.
            counter2[int(feature1[j, 1])] += 1
            if i != j and j != n - 1:
                # Entropy of the three intervals, weighted by their sizes.
                # (renamed from `sum` to avoid shadowing the builtin)
                total = ((i + 1) * getInformationEntropy(counter1, i + 1)
                         + (j - i) * getInformationEntropy(counter2 - counter1, j - i)
                         + (length - j - 1) * getInformationEntropy(np.array(num) - counter2,
                                                                    length - j - 1))
                if total < resEntropy:
                    resEntropy = total
                    res = np.array([i, j])

    # BUG FIX: the pasted source lost the subscripts here; the two cut
    # values are the feature values at the best indices res[0] and res[1].
    res_value = [feature1[res[0], 0], feature1[res[1], 0]]
    print(res, resEntropy, res_value)
    return res_value

# In:

#Find the appropriate segmentation value
def getRazors():
    """Compute the [low, high] cut-value pair for every iris feature.

    Returns a numpy array with one row of cut values per feature.
    """
    razor_list = []
    for feature_idx in range(len(iris.feature_names)):
        print(feature_idx)
        razor_list.append(discretization(feature_idx))
    return np.array(razor_list)

# In:

#Randomly select 80% of training sets and 20% of test sets
def divideData():
    """Shuffle the data set and split it into 80% training / 20% test.

    Returns [trainData, testData], each with the label appended as the
    last column. Reads module globals `iris` and `length`.
    """
    completeData = np.c_[iris.data, iris.target.T]
    np.random.shuffle(completeData)
    cut = int(length * 0.8)
    return [completeData[:cut, :], completeData[cut:, :]]

# In:

def getEntropy(counter):
    """Return the Shannon entropy of an arbitrary-length count vector.

    An all-zero vector (empty partition) contributes zero entropy.
    """
    total = np.sum(counter)
    if total == 0:
        return 0
    acc = 0.0
    for value in counter:
        if value:
            p = value / total
            acc += p * math.log(p)
    return -acc

# In:

def findMaxIndex(dataSet):
    """Return the index of the first strictly largest element.

    Returns 0 when the input is empty or every element is <= -1,
    matching the original's -1 sentinel behavior.
    """
    best_index, best_value = 0, -1
    for position, value in enumerate(dataSet):
        if value > best_value:
            best_index, best_value = position, value
    return best_index

# In:

def recursion(featureSet, dataSet, counterSet):
    """Recursively build a decision tree over the discretized features.

    featureSet -- list of feature indices still available for splitting
                  (mutated in place: the chosen feature is removed)
    dataSet    -- rows of [feature values..., label]
    counterSet -- per-class sample counts [n0, n1, n2] for dataSet

    Returns a class-name leaf (str), [] for an empty branch, or a node
    [feature, child_lt, child_mid, child_gt].

    BUG FIX: the pasted source lost every `[...]` subscript — the leaf
    tests compared the whole counter list to 0 and `child` was repeatedly
    overwritten; the indexing below restores the intended structure.
    """
    # Pure leaves: all remaining samples belong to a single class.
    if counterSet[0] == 0 and counterSet[1] == 0 and counterSet[2] != 0:
        return iris.target_names[2]
    if counterSet[0] != 0 and counterSet[1] == 0 and counterSet[2] == 0:
        return iris.target_names[0]
    if counterSet[0] == 0 and counterSet[1] != 0 and counterSet[2] == 0:
        return iris.target_names[1]

    # No features left: majority vote.
    if len(featureSet) == 0:
        return iris.target_names[findMaxIndex(counterSet)]
    if len(dataSet) == 0:
        return []

    res = 1000
    final = 0
    bestSets = None
    bestCounters = None
    for feature in featureSet:
        # The two cut points found during discretization.
        low = razors[feature][0]
        high = razors[feature][1]
        sets = [[], [], []]
        counters = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
        for data in dataSet:
            label = int(data[-1])
            if data[feature] < low:
                branch = 0
            elif data[feature] <= high:
                branch = 1
            else:
                branch = 2
            sets[branch].append(data)
            counters[branch][label] += 1

        # Weighted entropy of the three-way split.
        a = (len(sets[0]) * getEntropy(counters[0])
             + len(sets[1]) * getEntropy(counters[1])
             + len(sets[2]) * getEntropy(counters[2])) / len(dataSet)
        if a < res:
            res = a
            final = feature
            bestSets, bestCounters = sets, counters

    # BUG FIX: the original recursed on the subsets left over from the
    # *last* feature iterated; recurse on the best feature's split instead.
    featureSet.remove(final)
    child = [0, 0, 0, 0]
    child[0] = final
    child[1] = recursion(featureSet, bestSets[0], bestCounters[0])
    child[2] = recursion(featureSet, bestSets[1], bestCounters[1])
    child[3] = recursion(featureSet, bestSets[2], bestCounters[2])
    return child

# In:

def judge(data, tree):
    """Classify one sample by walking the tree built by recursion().

    data -- a row of [feature values..., label]
    tree -- a leaf string, or a node [feature, child_lt, child_mid, child_gt]

    BUG FIX: the pasted source lost every `[...]` subscript, leaving all
    branches as `tree = tree`; the child selection below restores the
    intended descent through the node list.
    """
    root = "unknow"
    while len(tree) > 0:
        # A leaf is a bare class-name string.
        if isinstance(tree, str) and tree in iris.target_names:
            return tree
        root = tree[0]
        if isinstance(root, str):
            return root

        if isinstance(root, int):
            # razors[root] holds the [low, high] cut values for feature `root`.
            if data[root] < razors[root][0] and tree[1] != []:
                tree = tree[1]
            elif tree[2] != [] and (tree[1] == []
                                    or (data[root] >= razors[root][0]
                                        and data[root] <= razors[root][1])):
                tree = tree[2]
            else:
                tree = tree[3]
    return root

# In:

if __name__ == '__main__':

    # BUG FIX: the pasted source never assigned `iris`, although every
    # function reads it as a module global.
    iris = datasets.load_iris()

    # Overall per-class sample counts, used by discretization().
    # BUG FIX: the original indexed `num` with row[-1] of iris.data
    # (a petal-width float), not with the class label.
    num = [0, 0, 0]
    for label in iris.target:
        num[int(label)] = num[int(label)] + 1

    length = len(iris.target)
    [trainData, testData] = divideData()

    razors = getRazors()

    tree = recursion(list(range(len(iris.feature_names))), trainData,
                     [np.sum(trainData[:, -1] == 0),
                      np.sum(trainData[:, -1] == 1),
                      np.sum(trainData[:, -1] == 2)])
    print("The tree constructed from the selected training set: ", tree)

    # Score the held-out 20%.
    index = 0
    right = 0
    for data in testData:
        result = judge(data, tree)
        truth = iris.target_names[int(data[-1])]

        print("result is ", result, "  truth is ", truth)
        index = index + 1
        if result == truth:
            right = right + 1
    print("Accuracy: ", right / index)
```

Posted by duall on Sun, 01 Dec 2019 19:36:18 -0800