# python implementation of iris data set for decision tree classification

The implementation is about 230 lines of code. Because the training set is sampled randomly on each run, the accuracy differs between runs; in the best case it reaches 83%.

##### The idea of eigenvalue discretization:

Since the final classification has three categories, I assume the value of each feature can also be divided into three intervals, i.e. we need to find two cut values per feature. To find them, a double `for` loop searches for the pair of indices `i` and `j` that minimizes the information entropy.

##### Code overall idea:

1. Process the data first, using the shuffle function to randomly select 80% of the samples as the training set.
2. Eigenvalue discretization
3. Use information entropy to construct tree recursively
4. Use the constructed tree to judge the remaining 20% of the test set, and find the accuracy of the algorithm for classification

```python
# coding: utf-8

# In:

from sklearn import datasets
import math
import numpy as np

# In:

def getInformationEntropy(arr, leng):
    """Return the Shannon entropy of the class-count vector `arr` over
    `leng` samples.

    arr  -- per-class sample counts (e.g. a length-3 numpy array)
    leng -- total number of samples in the partition

    BUG FIX: the pasted source lost the `[0]`/`[1]`/`[2]` subscripts and
    summed the same `arr/leng` term three times; this restores the sum of
    -p*log(p) over the individual class counts (zero counts contribute 0).
    """
    res = 0.0
    for count in arr:
        p = count / leng
        if p > 0:
            res -= p * math.log(p)
    return res

#informationEntropy = getInformationEntropy(num,length)
#print(informationEntropy)

# In:

#The value of discrete characteristic one
def discretization(index):
    """Find two cut points for feature `index` that minimize the weighted
    information entropy of the resulting three intervals (exhaustive scan
    over all index pairs i < j).

    Reads module globals `iris`, `num` (overall per-class counts) and
    `length` (total sample count). Returns [low_value, high_value], the
    feature values at the best pair of cut indices.
    """
    # Pair the feature column with the labels, then sort by feature value.
    feature1 = np.array([iris.data[:, index], iris.target]).T
    feature1 = feature1[feature1[:, 0].argsort()]

    n = len(feature1[:, 0])
    resEntropy = 100000
    res = np.array([0, 0])  # initialized so `res` is bound even if no pair qualifies
    counter1 = np.array([0, 0, 0])
    for i in range(n):
        # counter1 holds class counts of samples 0..i (the first interval).
        counter1[int(feature1[i, 1])] += 1
        counter2 = np.copy(counter1)
        for j in range(i + 1, n):
            # counter2 holds class counts of samples 0..j.
            counter2[int(feature1[j, 1])] += 1
            if i != j and j != n - 1:
                # Entropy of the three intervals, weighted by their sizes.
                # (renamed from `sum` to avoid shadowing the builtin)
                total = ((i + 1) * getInformationEntropy(counter1, i + 1)
                         + (j - i) * getInformationEntropy(counter2 - counter1, j - i)
                         + (length - j - 1) * getInformationEntropy(np.array(num) - counter2,
                                                                    length - j - 1))
                if total < resEntropy:
                    resEntropy = total
                    res = np.array([i, j])

    # BUG FIX: the pasted source lost the subscripts here; the two cut
    # values are the feature values at the best indices res[0] and res[1].
    res_value = [feature1[res[0], 0], feature1[res[1], 0]]
    print(res, resEntropy, res_value)
    return res_value

# In:

#Find the appropriate segmentation value
def getRazors():
    """Compute the [low, high] cut-value pair for every iris feature.

    Returns a numpy array with one row of cut values per feature.
    """
    razor_list = []
    for feature_idx in range(len(iris.feature_names)):
        print(feature_idx)
        razor_list.append(discretization(feature_idx))
    return np.array(razor_list)

# In:

#Randomly select 80% of training sets and 20% of test sets
def divideData():
    """Shuffle the data set and split it into 80% training / 20% test.

    Returns [trainData, testData], each with the label appended as the
    last column. Reads module globals `iris` and `length`.
    """
    completeData = np.c_[iris.data, iris.target.T]
    np.random.shuffle(completeData)
    cut = int(length * 0.8)
    return [completeData[:cut, :], completeData[cut:, :]]

# In:

def getEntropy(counter):
    """Return the Shannon entropy of an arbitrary-length count vector.

    An all-zero vector (empty partition) contributes zero entropy.
    """
    total = np.sum(counter)
    if total == 0:
        return 0
    acc = 0.0
    for value in counter:
        if value:
            p = value / total
            acc += p * math.log(p)
    return -acc

# In:

def findMaxIndex(dataSet):
    """Return the index of the first strictly largest element.

    Returns 0 when the input is empty or every element is <= -1,
    matching the original's -1 sentinel behavior.
    """
    best_index, best_value = 0, -1
    for position, value in enumerate(dataSet):
        if value > best_value:
            best_index, best_value = position, value
    return best_index

# In:

def recursion(featureSet, dataSet, counterSet):
    """Recursively build a decision tree over the discretized features.

    featureSet -- list of feature indices still available for splitting
                  (mutated in place: the chosen feature is removed)
    dataSet    -- rows of [feature values..., label]
    counterSet -- per-class sample counts [n0, n1, n2] for dataSet

    Returns a class-name leaf (str), [] for an empty branch, or a node
    [feature, child_lt, child_mid, child_gt].

    BUG FIX: the pasted source lost every `[...]` subscript — the leaf
    tests compared the whole counter list to 0 and `child` was repeatedly
    overwritten; the indexing below restores the intended structure.
    """
    # Pure leaves: all remaining samples belong to a single class.
    if counterSet[0] == 0 and counterSet[1] == 0 and counterSet[2] != 0:
        return iris.target_names[2]
    if counterSet[0] != 0 and counterSet[1] == 0 and counterSet[2] == 0:
        return iris.target_names[0]
    if counterSet[0] == 0 and counterSet[1] != 0 and counterSet[2] == 0:
        return iris.target_names[1]

    # No features left: majority vote.
    if len(featureSet) == 0:
        return iris.target_names[findMaxIndex(counterSet)]
    if len(dataSet) == 0:
        return []

    res = 1000
    final = 0
    bestSets = None
    bestCounters = None
    for feature in featureSet:
        # The two cut points found during discretization.
        low = razors[feature][0]
        high = razors[feature][1]
        sets = [[], [], []]
        counters = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
        for data in dataSet:
            label = int(data[-1])
            if data[feature] < low:
                branch = 0
            elif data[feature] <= high:
                branch = 1
            else:
                branch = 2
            sets[branch].append(data)
            counters[branch][label] += 1

        # Weighted entropy of the three-way split.
        a = (len(sets[0]) * getEntropy(counters[0])
             + len(sets[1]) * getEntropy(counters[1])
             + len(sets[2]) * getEntropy(counters[2])) / len(dataSet)
        if a < res:
            res = a
            final = feature
            bestSets, bestCounters = sets, counters

    # BUG FIX: the original recursed on the subsets left over from the
    # *last* feature iterated; recurse on the best feature's split instead.
    featureSet.remove(final)
    child = [0, 0, 0, 0]
    child[0] = final
    child[1] = recursion(featureSet, bestSets[0], bestCounters[0])
    child[2] = recursion(featureSet, bestSets[1], bestCounters[1])
    child[3] = recursion(featureSet, bestSets[2], bestCounters[2])
    return child

# In:

def judge(data, tree):
    """Classify one sample by walking the tree built by recursion().

    data -- a row of [feature values..., label]
    tree -- a leaf string, or a node [feature, child_lt, child_mid, child_gt]

    BUG FIX: the pasted source lost every `[...]` subscript, leaving all
    branches as `tree = tree`; the child selection below restores the
    intended descent through the node list.
    """
    root = "unknow"
    while len(tree) > 0:
        # A leaf is a bare class-name string.
        if isinstance(tree, str) and tree in iris.target_names:
            return tree
        root = tree[0]
        if isinstance(root, str):
            return root

        if isinstance(root, int):
            # razors[root] holds the [low, high] cut values for feature `root`.
            if data[root] < razors[root][0] and tree[1] != []:
                tree = tree[1]
            elif tree[2] != [] and (tree[1] == []
                                    or (data[root] >= razors[root][0]
                                        and data[root] <= razors[root][1])):
                tree = tree[2]
            else:
                tree = tree[3]
    return root

# In:

if __name__ == '__main__':

    # BUG FIX: the pasted source never assigned `iris`, although every
    # function reads it as a module global.
    iris = datasets.load_iris()

    # Overall per-class sample counts, used by discretization().
    # BUG FIX: the original indexed `num` with row[-1] of iris.data
    # (a petal-width float), not with the class label.
    num = [0, 0, 0]
    for label in iris.target:
        num[int(label)] = num[int(label)] + 1

    length = len(iris.target)
    [trainData, testData] = divideData()

    razors = getRazors()

    tree = recursion(list(range(len(iris.feature_names))), trainData,
                     [np.sum(trainData[:, -1] == 0),
                      np.sum(trainData[:, -1] == 1),
                      np.sum(trainData[:, -1] == 2)])
    print("The tree constructed from the selected training set: ", tree)

    # Score the held-out 20%.
    index = 0
    right = 0
    for data in testData:
        result = judge(data, tree)
        truth = iris.target_names[int(data[-1])]

        print("result is ", result, "  truth is ", truth)
        index = index + 1
        if result == truth:
            right = right + 1
    print("Accuracy: ", right / index)
```

Posted by duall on Sun, 01 Dec 2019 19:36:18 -0800