In [1]:
import math
import numpy as np

In [2]:
# Keyword category                   P    P           S         S      T    T
# Article           Category  Mercedes  BMW  Basketball  Road run  Phone  App
# ---------------------------------------------------------------------------
# C63 launch event  P               15   25           0         5      8    3
# BMW i8            P               35   40           1         3      3    2
# Jeremy Lin        S                5    0          35        50      0    0
# Lakers            S                1    5          32        15      0    0
# Android 5.0       T               10    5           7         0      2   30
# iPhone6           T                5    5           5        15      8   32

In [3]:
# Training samples: each row is [category label, term-frequency counts of the six keywords].
lines = [['p', 15, 25, 0, 5, 8, 3], ['p', 35, 40, 1, 3, 3, 2], ['s', 5, 0, 35, 50, 0, 0],
         ['s', 1, 5, 32, 15, 0, 0], ['t', 10, 5, 7, 0, 2, 30], ['t', 5, 5, 5, 15, 8, 32]]

# Category label of every training row, in row order.
label = [line[0] for line in lines]

# Term-frequency matrix (rows = samples, columns = keywords).
# Built from plain list slices with an explicit float dtype: appending
# `map(float, ...)` would store lazy map objects on Python 3 and np.array
# would then produce a 1-D object array instead of a 2-D float matrix.
dataSet = np.array([line[1:] for line in lines], dtype=float)

# Number of keyword features per sample (read by Bayestrain).
dimension = dataSet.shape[1]

In [4]:
label


Out[4]:
['p', 'p', 's', 's', 't', 't']

In [5]:
dataSet


Out[5]:
array([[ 15.,  25.,   0.,   5.,   8.,   3.],
       [ 35.,  40.,   1.,   3.,   3.,   2.],
       [  5.,   0.,  35.,  50.,   0.,   0.],
       [  1.,   5.,  32.,  15.,   0.,   0.],
       [ 10.,   5.,   7.,   0.,   2.,  30.],
       [  5.,   5.,   5.,  15.,   8.,  32.]])

In [6]:
def Bayestrain(classifyRange, data=None):
    """Estimate class priors and Laplace-smoothed keyword likelihoods.

    Parameters
    ----------
    classifyRange : dict
        Maps each class label to a ``[start, stop]`` index pair.
    data : 2-D array-like, optional
        Term-frequency matrix (rows = samples, columns = keywords).
        When omitted, the notebook-level ``dataSet`` is used.

    Returns
    -------
    tuple(list, list)
        ``classifyProbabilityList[i]`` is the prior of the i-th class and
        ``featureProbabilityList[i][a]`` is P(keyword a | i-th class).
        Both lists follow the iteration order of ``classifyRange``.
    """
    if data is None:
        data = dataSet
    # Float conversion keeps the divisions below true divisions even if an
    # integer matrix (or a nested list) is passed in.
    data = np.asarray(data, dtype=float)
    featureCount = data.shape[1]
    alltrainSample = data.sum()

    classifyProbabilityList = []
    featureProbabilityList = []
    # .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
    for key, value in classifyRange.items():
        start, stop = value[0], value[1]

        # P(class) = sum of term frequencies under the class
        #            / sum of term frequencies in the whole training set
        # NOTE(review): the range selects COLUMNS here but ROWS in the
        # likelihood below. The two only coincide because this dataset has
        # two samples and two keywords per class -- confirm which is intended.
        classifyProbabilityList.append(data[:, start:stop].sum() / alltrainSample)

        # P(keyword | class) = (term frequency of the keyword under the class + 1)
        #                      / (total term frequency under the class + number of keywords)
        classRows = data[start:stop]
        smoothed = (classRows.sum(axis=0) + 1) / (classRows.sum() + featureCount)
        featureProbabilityList.append(list(smoothed))
    return classifyProbabilityList, featureProbabilityList

In [7]:
def Bayespredict(classifyProbabilityList,featureProbabilityList,classifyRange,predictList):
    """Return the class label with the highest log-score for ``predictList``.

    Parameters
    ----------
    classifyProbabilityList : list of float
        Class priors, in the iteration order of ``classifyRange``.
    featureProbabilityList : list of list of float
        Per-class keyword likelihoods, in the same class order.
    classifyRange : dict
        Only its keys (the class labels) and their order are used here.
    predictList : sequence of numbers
        Keyword counts of the sample to classify.

    Returns
    -------
    The key of ``classifyRange`` whose score is largest; on ties the first
    class in iteration order wins.
    """
    # Iterating a dict yields its keys in the same order as .items(), which is
    # the order the two probability lists were built in.
    classNames = list(classifyRange)

    compareList = []
    for prior, fList in zip(classifyProbabilityList, featureProbabilityList):
        # Work in log space to avoid floating-point underflow.
        # A zero prior has log = -inf; math.log10(0) would raise ValueError.
        score = math.log10(prior) if prior > 0 else float('-inf')
        for count, likelihood in zip(predictList, fList):
            score += count * math.log10(likelihood)
        compareList.append(score)

    return classNames[compareList.index(max(compareList))]

In [8]:
# 2 and 4 are the thresholds: keyword category p is [:2], keyword category s is [2:4], keyword category t is [4:dimension]
classifyRange = {'p':[0,2],'s':[2,4],'t':[4,6]}
classifyProbabilityList , featureProbabilityList = Bayestrain(classifyRange)

In [9]:
# classifyProbabilityList, featureProbabilityList: both are results of training
# classifyRange: the thresholds that were set
# predictList: the features of the sample to predict
predictList=[10,2,50,56,8,5]
predict = Bayespredict(classifyProbabilityList,featureProbabilityList,classifyRange,predictList)

In [10]:
predict


Out[10]:
's'