data processing


In [1]:
import pandas as pd

In [2]:
def loadData(datapath):
    # load data並進行 filter 只保留下面兩種格式的資料,其他有缺項的一律不考慮
    # 第一欄皆為 'M' 是個沒必要的feature故拿掉
    # {0 M,2 Gold,3 85,4 80000}
    # {0 M,       3 49,4 40000}
    with open (datapath,'r') as f:
        lines = f.readlines()
        filterL = []
        for line in lines:
            tfL = line.split()
            try:
                # feature:0
                tf0 = tfL[1].split(',')
                # feature:1
                tf1 = tfL[2].split(',')
                # feature:2
                tf2 = tfL[3].split(',')
                # feature:3
                tf3 = tfL[4].split(',')
                # feature:4
                tf4 = tfL[5].replace('}', '')
                if tf0[1]=='1' and tf1[1]=='2' and tf2[1]=='3' and tf3[1]=='4':
                    filterL.append([float(tf1[0]),tf2[0],float(tf3[0]),float(tf4)])
            except :
                pass

            try:
                # feature:0
                tf0 = tfL[1].split(',')
                # feature:1
                tf1 = tfL[2].split(',')
                # feature:3
                tf2 = tfL[3].split(',')
                # feature:4
                tf3 = tfL[4].replace('}', '')
                if tf0[1]=='1' and tf1[1]=='3' and tf2[1]=='4':
                    filterL.append([float(tf1[0]),'Basic',float(tf2[0]),float(tf3)])
            except :
                pass
    return filterL

In [3]:
datapath = '/Users/wy/Desktop/data mining/hw3-Q36034188/training.txt'
filterL =  loadData(datapath)

In [4]:
# rowname = ['marital_status','num_children_at_home','member_card','age','year_income']
rowname = ['num_children_at_home','member_card','age','year_income']
df = pd.DataFrame(filterL, columns=rowname)

In [5]:
# 簡單看一下data
df.head()


Out[5]:
num_children_at_home member_card age year_income
0 1 Gold 90 20000
1 2 Silver 58 40000
2 1 Basic 51 60000
3 2 Basic 21 80000
4 3 Gold 51 100000

In [6]:
# 看一下data的大概狀況
df.describe()


Out[6]:
num_children_at_home age year_income
count 171.000000 171.000000 171.000000
mean 1.830409 53.853801 65847.953216
std 1.023494 19.935683 37049.414283
min 1.000000 20.000000 20000.000000
25% 1.000000 36.000000 40000.000000
50% 1.000000 54.000000 60000.000000
75% 2.000000 69.500000 80000.000000
max 5.000000 90.000000 160000.000000

In [7]:
df.max()


Out[7]:
num_children_at_home         5
member_card             Silver
age                         90
year_income             160000
dtype: object

In [8]:
df.mean()


Out[8]:
num_children_at_home        1.830409
age                        53.853801
year_income             65847.953216
dtype: float64

In [9]:
def numericTra(feature,a,b,c):
    if feature > a:
        return 4
    elif a >= feature > b:
        return 3
    elif b>= feature > c:
        return 2
    elif c>= feature:
        return 1
# 離散化 把numeric分成四個等級 4 > (max+mean/2) > 3 > mean > 2 >(mean+min/2) > 1
def discretization(df,filterL):
#     import copy
#     new_list = copy.deepcopy(filterL)
    # mean
    num_children_at_homeMean = df.mean()[0]
    ageMean = df.mean()[1]
    year_incomeMean = df.mean()[2]
    # (mean+max)/2
    maxMean = (df.max()+df.mean())/2.
    num_children_at_homeMaxMean = maxMean[2]
    ageMaxMean = maxMean[0]
    year_incomeMaxMean = maxMean[3]
    # (mean+min)/2
    minMean = (df.mean()+df.min())/2.
    num_children_at_homeMinMean = minMean[2]
    ageMinMean = minMean[0]
    year_incomeMinMean = minMean[3]
    for ind in range(len(filterL)):
        filterL[ind][0] = numericTra(filterL[ind][0],num_children_at_homeMaxMean,num_children_at_homeMean,num_children_at_homeMinMean)
        filterL[ind][2] = numericTra(filterL[ind][2],ageMaxMean,year_incomeMean,ageMinMean)
        filterL[ind][3] = numericTra(filterL[ind][3],year_incomeMaxMean,year_incomeMean,year_incomeMinMean)
discretization(df,filterL)

In [10]:
# 把 label 移到feature最後
def createDataLabel(filterL):
    dataSet=[]
    for ind in range(len(filterL)):
        dataSet.append([filterL[ind][0],filterL[ind][2],filterL[ind][3],filterL[ind][1]])
    return dataSet

In [11]:
# 把data前處理包成一個function
def preprocessing(datapath):
    filterL = loadData(datapath)
    rowname = ['num_children_at_home','member_card','age','year_income']
    df = pd.DataFrame(filterL, columns=rowname)
    discretization(df,filterL)
    dataSet = createDataLabel(filterL)
    return dataSet

In [12]:
datapath = '/Users/wy/Desktop/data mining/hw3-Q36034188/training.txt'
dataSetHw3 = preprocessing(datapath)

In [13]:
dataSetHw3[:5]


Out[13]:
[[1, 4, 1, 'Gold'],
 [3, 2, 1, 'Silver'],
 [1, 2, 2, 'Basic'],
 [3, 1, 3, 'Basic'],
 [3, 2, 3, 'Gold']]

decision tree


In [14]:
import math
import operator

In [15]:
def createDataSet():
    dataSet = [['a', 1, 'yes'],
               ['a', 1, 'yes'],
               ['a', 0, 'no'],
               ['b', 1, 'no'],
               ['b', 1, 'no']]
    labels = ['no surfacing','flippers']
    #change to discrete values
    return dataSet, labels

In [16]:
dataSet, labels = createDataSet()

In [17]:
# 計算entropy
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet: 
        # 選擇 label
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys(): 
            labelCounts[currentLabel] = 0.
        labelCounts[currentLabel] += 1.
    shannonEnt = 0.
    for key in labelCounts:
        prob = labelCounts[key]/numEntries
        # -P(V1)*LOG2P(V)
        shannonEnt -= prob * math.log(prob,2) #log base 2
    return shannonEnt

In [18]:
calcShannonEnt(dataSet)


Out[18]:
0.9709505944546686

In [19]:
def splitDataSet(dataSet, axis, value):
    # axis 第幾個feature需要被切割 , value一樣才保留下來
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # pass the axis
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

In [20]:
splitDataSet(dataSet,1,1)


Out[20]:
[['a', 'yes'], ['a', 'yes'], ['b', 'no'], ['b', 'no']]

In [21]:
splitDataSet(dataSet,1,0)


Out[21]:
[['a', 'no']]

In [22]:
dataSet


Out[22]:
[['a', 1, 'yes'],
 ['a', 1, 'yes'],
 ['a', 0, 'no'],
 ['b', 1, 'no'],
 ['b', 1, 'no']]

In [23]:
def chooseBestFeatureToSplit(dataSet):
    # 最後一欄為label
    numFeatures = len(dataSet[0]) - 1
    # 目前的baseEntropy
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):        #iterate over all the features
        # 同欄的feature併到同list
        featList = [example[i] for example in dataSet]
        # 取得該同list的feature集合
        uniqueVals = set(featList)       #get a set of unique values
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)     
        infoGain = baseEntropy - newEntropy     #calculate the info gain; ie reduction in entropy
        if (infoGain > bestInfoGain):       #compare this to the best gain so far
            bestInfoGain = infoGain         #if better than current best, set to best
            bestFeature = i
    return bestFeature                      #returns an integer

In [24]:
bestFeature = chooseBestFeatureToSplit(dataSet)

In [25]:
bestFeature


Out[25]:
0

In [26]:
def majorityCnt(classList):
    classCount={}
    for vote in classList:
        if vote not in classCount.keys(): classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def createTree(dataSet,labels):
    # 把dataSet的labels 存進 classList
    classList = [example[-1] for example in dataSet]
    # 當labels一樣時停止
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    
    # 當沒有feature可以劃分時停止
    if len(dataSet[0]) == 1: 
        # 挑選次數最多的label當作return value
        return majorityCnt(classList)
    # 選擇最佳切割的feature
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel:{}}
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]       #copy all of labels, so trees don't mess up existing labels
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels)
    return myTree

In [27]:
myTree = createTree(dataSet,labels)

In [28]:
myTree


Out[28]:
{'no surfacing': {'a': {'flippers': {0: 'no', 1: 'yes'}}, 'b': 'no'}}

In [29]:
def classify(inputTree,featLabels,testVec):
    firstStr = inputTree.keys()[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict): 
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else: classLabel = valueOfFeat
    return classLabel

In [30]:
dataSet, labels = createDataSet()

In [31]:
classify(myTree,labels,['a',0])


Out[31]:
'no'

load hw3 data


In [32]:
dataSetHw3[:5]


Out[32]:
[[1, 4, 1, 'Gold'],
 [3, 2, 1, 'Silver'],
 [1, 2, 2, 'Basic'],
 [3, 1, 3, 'Basic'],
 [3, 2, 3, 'Gold']]

In [33]:
labelsHw3 = ['num_children_at_home','age','year_income']

In [34]:
myTreeHw3 = createTree(dataSetHw3,labelsHw3)

In [35]:
# tree model 建立完成
myTreeHw3


Out[35]:
{'num_children_at_home': {1: {'year_income': {1: {'age': {1: 'Basic',
      2: 'Basic',
      4: 'Basic'}},
    2: {'age': {1: 'Normal', 2: 'Basic', 4: 'Basic'}},
    3: {'age': {1: 'Gold', 2: 'Basic', 4: 'Basic'}},
    4: {'age': {1: 'Basic', 2: 'Basic', 4: 'Silver'}}}},
  3: {'year_income': {1: {'age': {1: 'Gold', 2: 'Basic', 4: 'Gold'}},
    2: {'age': {1: 'Basic', 2: 'Gold', 4: 'Gold'}},
    3: {'age': {1: 'Gold', 2: 'Gold', 4: 'Basic'}},
    4: {'age': {1: 'Basic', 2: 'Basic', 4: 'Basic'}}}},
  4: {'year_income': {1: 'Gold',
    2: {'age': {'Basic': 'Basic', 'Silver': 'Silver'}},
    3: 'Basic',
    4: {'age': {2: 'Gold', 4: 'Gold'}}}}}}

In [36]:
# 載入 test data
testdatapath = '/Users/wy/Desktop/data mining/hw3-Q36034188/test.txt'
dataSetTestHw3 = preprocessing(datapath)

In [37]:
testFeature=[]
testAnswer=[]
for line in range(len(dataSetTestHw3)):
    testFeature.append(dataSetTestHw3[line][:3])
    testAnswer.append(dataSetTestHw3[line][-1])

In [38]:
testFeature[:5]


Out[38]:
[[1, 4, 1], [3, 2, 1], [1, 2, 2], [3, 1, 3], [3, 2, 3]]

In [39]:
testAnswer[:5]


Out[39]:
['Gold', 'Silver', 'Basic', 'Basic', 'Gold']

In [40]:
dataSetTestHw3[:5]


Out[40]:
[[1, 4, 1, 'Gold'],
 [3, 2, 1, 'Silver'],
 [1, 2, 2, 'Basic'],
 [3, 1, 3, 'Basic'],
 [3, 2, 3, 'Gold']]

In [41]:
labelsHw3 = ['num_children_at_home','age','year_income']

In [42]:
classificationsAnswer=[]
for f in testFeature:
    try:
        classificationsAnswer.append(classify(myTreeHw3,labelsHw3,f))
    except:
        classificationsAnswer.append('Basic')

In [43]:
num = len(testAnswer)
ans = 0
for a,b in zip(classificationsAnswer,testAnswer):
    if a==b:
        ans+=1
correctness = float(ans)/float(num)

In [44]:
# 準確率 0.56%
correctness


Out[44]:
0.5672514619883041

In [45]:
# 建立真實和分類器結果的table
GGold=0
GSilver=0
GNormal=0
GBasic=0

SGold=0
SSilver=0
SNormal=0
SBasic=0

NGold=0
NSilver=0
NNormal=0
NBasic=0

BGold=0
BSilver=0
BNormal=0
BBasic=0
for a,b in zip(classificationsAnswer,testAnswer):
    if b=='Gold':
        if a=='Gold':
            GGold+=1
        elif a=='Silver':
            GSilver+=1
        elif a=='Normal':
            GNormal+=1
        elif a=='Basic':
            GBasic+=1
    elif b=='Silver':
        if a=='Gold':
            SGold+=1
        elif a=='Silver':
            SSilver+=1
        elif a=='Normal':
            SNormal+=1
        elif a=='Basic':
            SBasic+=1       
    elif b=='Normal':
        if a=='Gold':
            NGold+=1
        elif a=='Silver':
            NSilver+=1
        elif a=='Normal':
            NNormal+=1
        elif a=='Basic':
            NBasic+=1         
    elif b=='Basic':
        if a=='Gold':
            BGold+=1
        elif a=='Silver':
            BSilver+=1
        elif a=='Normal':
            BNormal+=1
        elif a=='Basic':
            BBasic+=1  
corMatrix = [[GGold,GSilver,GNormal,GBasic],[SGold,SSilver,SNormal,SBasic],[NGold,NSilver,NNormal,NBasic],[BGold,BSilver,BNormal,BBasic]]
rowname = ['Gold','Silver','Normal','Basic']
indexname = ['True-Gold','True-Silver','True-Normal','True-Basic']
corMatrixdf = pd.DataFrame(corMatrix, index=indexname,columns=rowname)

In [46]:
corMatrixdf


Out[46]:
Gold Silver Normal Basic
True-Gold 31 1 0 17
True-Silver 3 1 0 14
True-Normal 6 0 1 19
True-Basic 13 1 0 64

In [ ]: