The decision tree picks the watermelon

Keywords: Python Algorithm Decision Tree

1, ID3 algorithm

1. Ladle

import numpy as np
import pandas as pd
import sklearn.tree as st
import math
import matplotlib
import os
import matplotlib.pyplot as plt

2. Read data

data = pd.read_csv('C:/Watermelon dataset.csv',header=None)

3. Coding


def calcEntropy(dataSet):
    mD = len(dataSet)
    dataLabelList = [x[-1] for x in dataSet]
    dataLabelSet = set(dataLabelList)
    ent = 0
    for label in dataLabelSet:
        mDv = dataLabelList.count(label)
        prop = float(mDv) / mD
        ent = ent - prop * np.math.log(prop, 2)

    return ent

Split dataset

def splitDataSet(dataSet, index, feature):
    splitedDataSet = []
    mD = len(dataSet)
    for data in dataSet:
        if(data[index] == feature):
            sliceTmp = data[:index]
            sliceTmp.extend(data[index + 1:])
    return splitedDataSet

Optimal feature

def chooseBestFeature(dataSet):
    entD = calcEntropy(dataSet)
    mD = len(dataSet)
    featureNumber = len(dataSet[0]) - 1
    maxGain = -100
    maxIndex = -1
    for i in range(featureNumber):
        entDCopy = entD
        featureI = [x[i] for x in dataSet]
        featureSet = set(featureI)
        for feature in featureSet:
            splitedDataSet = splitDataSet(dataSet, i, feature)  # Split dataset
            mDv = len(splitedDataSet)
            entDCopy = entDCopy - float(mDv) / mD * calcEntropy(splitedDataSet)
        if(maxIndex == -1):
            maxGain = entDCopy
            maxIndex = i
        elif(maxGain < entDCopy):
            maxGain = entDCopy
            maxIndex = i

    return maxIndex

Find up to tags

def mainLabel(labelList):
    labelRec = labelList[0]
    maxLabelCount = -1
    labelSet = set(labelList)
    for label in labelSet:
        if(labelList.count(label) > maxLabelCount):
            maxLabelCount = labelList.count(label)
            labelRec = label
    return labelRec


def createFullDecisionTree(dataSet, featureNames, featureNamesSet, labelListParent):
    labelList = [x[-1] for x in dataSet]
    if(len(dataSet) == 0):
        return mainLabel(labelListParent)
    elif(len(dataSet[0]) == 1): #There are no separable properties
        return mainLabel(labelList)  #Select the most label as the label of the dataset
    elif(labelList.count(labelList[0]) == len(labelList)): # All belong to the same Label
        return labelList[0]

    bestFeatureIndex = chooseBestFeature(dataSet)
    bestFeatureName = featureNames.pop(bestFeatureIndex)
    myTree = {bestFeatureName: {}}
    featureList = featureNamesSet.pop(bestFeatureIndex)
    featureSet = set(featureList)
    for feature in featureSet:
        featureNamesNext = featureNames[:]
        featureNamesSetNext = featureNamesSet[:][:]
        splitedDataSet = splitDataSet(dataSet, bestFeatureIndex, feature)
        myTree[bestFeatureName][feature] = createFullDecisionTree(splitedDataSet, featureNamesNext, featureNamesSetNext, labelList)
    return myTree


def readWatermelonDataSet():
    dataSet = data.values.tolist()
    featureNames =['color and lustre', 'Root', 'Knock', 'texture', 'Umbilicus', 'Tactile sensation']
    #Get featureNamesSet
    featureNamesSet = []
    for i in range(len(dataSet[0]) - 1):
        col = [x[i] for x in dataSet]
        colSet = set(col)
    return dataSet, featureNames, featureNamesSet

matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.serif'] = ['SimHei']

decisionNode = dict(boxstyle="sawtooth", fc="0.8")

leafNode = dict(boxstyle="round4", fc="0.8")

arrow_args = dict(arrowstyle="<-")

def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)

def getNumLeafs(myTree):
    numLeafs = 0

    firstStr = list(myTree.keys())[0]

    secondDict = myTree[firstStr]

    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            numLeafs += getNumLeafs(secondDict[key])
            numLeafs += 1
    return numLeafs

def getTreeDepth(myTree):
    maxDepth = 0

    firstStr = list(myTree.keys())[0]

    secondDic = myTree[firstStr]

    for key in secondDic.keys():
        if type(secondDic[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(secondDic[key])

            thisDepth = 1

        if thisDepth > maxDepth:
            maxDepth = thisDepth

    return maxDepth

def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0]
    yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)

def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree=myTree)

    depth = getTreeDepth(myTree=myTree)

    firstStr = list(myTree.keys())[0]

    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff)

    plotMidText(cntrPt, parentPt, nodeTxt)

    plotNode(firstStr, cntrPt, parentPt, decisionNode)

    secondDict = myTree[firstStr]

    plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD

    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):
            plotTree(secondDict[key], cntrPt, str(key))
            plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))

    plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD

def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5/plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')

dataSet, featureNames, featureNamesSet=readWatermelonDataSet()
testTree= createFullDecisionTree(dataSet, featureNames, featureNamesSet,featureNames)


2, Implementation of sklearn ID3 and CART algorithm


import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

data = pd.read_csv('C:/Watermelon dataset.csv',header=None)

label = LabelEncoder()    

for col in data[data.columns[:-1]]:
    data[col] = label.fit_transform(data[col])

# Fit with ID3
dtc = DecisionTreeClassifier(criterion='entropy')
# Fit[:,:-1].values.tolist(),data.iloc[:,-1].values) 
# Tag corresponding code
result = dtc.predict([[0,0,0,0,0,0]])
#Fitting results


# Fit with CART
dtc = DecisionTreeClassifier()
# Fit[:,:-1].values.tolist(),data.iloc[:,-1].values) 
# Tag corresponding code
result = dtc.predict([[0,0,0,1,0,0]])
#Fitting results

3, Summary

1.ID3 algorithm

By calculating the information gain of each attribute, ID3 algorithm considers that the attribute with high information gain is a good attribute. Each division selects the attribute with the highest information gain as the division standard, and repeats this process until a decision tree that can perfectly classify training samples is generated.
Decision tree is used to classify data to achieve the purpose of prediction. The decision tree method first forms a decision tree according to the training set data. If the tree can not give correct classification to all objects, select some exceptions to the training set data and repeat the process until a correct decision set is formed. Decision tree represents the tree structure of decision set.
The decision tree consists of decision nodes, branches and leaves. The top node in the decision tree is the root node, and each branch is a new decision node or a leaf of the tree. Each decision node represents a problem or decision, which usually corresponds to the attributes of the object to be classified. Each leaf node represents a possible classification result. In the process of traversing from top to bottom along the decision tree, a test will be encountered at each node. Different test outputs of problems on each node will lead to different branches, and finally reach a leaf node. This process is the process of classification using the decision tree and using several variables to judge the category.

2.CART algorithm

(1) Select an independent variable, and then select a value to divide the dimensional space into two parts. All points of one part are satisfied, and all points of the other part are satisfied. For discontinuous variables, there are only two values of attribute value, that is, equal to or not equal to the value.

(2) Recursive processing, re select an attribute from the above two parts according to step (1) and continue to divide until the whole dimensional space is divided.

Posted by CWebguy on Sat, 06 Nov 2021 10:26:57 -0700