In [10]:
# import caffe
# normal distribution
from scipy.stats import norm

In [1]:
# set up Python environment: numpy for numerical routines, and matplotlib for plotting
import numpy as np
import matplotlib.pyplot as plt
# display plots in this notebook
%matplotlib inline

# set display defaults
plt.rcParams['figure.figsize'] = (10, 10)        # large images
plt.rcParams['image.interpolation'] = 'nearest'  # don't interpolate: show square pixels
plt.rcParams['image.cmap'] = 'gray'  # use grayscale output rather than a (potentially misleading) color heatmap
import pandas as pd

In [57]:
names = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','label']
trainData = pd.read_table('dataset/adult.data',names=names,sep=',',index_col=None)
testData = pd.read_table('dataset/adult.test',names=names,sep=',',index_col=None,skiprows=1)

In [55]:
testData.head()


Out[55]:
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country label
0 25 Private 226802 11th 7 Never-married Machine-op-inspct Own-child Black Male 0 0 40 United-States <=50K.
1 38 Private 89814 HS-grad 9 Married-civ-spouse Farming-fishing Husband White Male 0 0 50 United-States <=50K.
2 28 Local-gov 336951 Assoc-acdm 12 Married-civ-spouse Protective-serv Husband White Male 0 0 40 United-States >50K.
3 44 Private 160323 Some-college 10 Married-civ-spouse Machine-op-inspct Husband Black Male 7688 0 40 United-States >50K.
4 18 ? 103497 Some-college 10 Never-married ? Own-child White Female 0 0 30 United-States <=50K.

In [14]:
for i in trainData.iloc[0]:
    print i


39
 State-gov
77516
 Bachelors
13
 Never-married
 Adm-clerical
 Not-in-family
 White
 Male
2174
0
40
 United-States
 <=50K

In [206]:
class Data:
    
    def __init__(self,data,testData):
#         data should be a table
        self.trainData = data
        self.testData = testData
#     store all discrete attr (names)
        self.discreteAttr = []
#     the same as before(names)
        self.continuousAttr = []
    #         positive trainData
        self.positiveTrainData = data[data['label']==' >50K']
#         negative trainData
        self.negativeTrainData = data[data['label']==' <=50K']
#     length
        self.length = len(data)
#     negative samples
        self.nLength = len(self.negativeTrainData)
#     positive samples
        self.pLength = self.length - self.nLength
#     positive probability
        self.positiveProbability = (float(self.pLength)+1)/(self.length + 2) #Laplacian correction
#         negativeProbability
        self.negativeProbability = 1 - self.positiveProbability

    
#         probability[attrValue][label]
        self.probability = {}
#     stores something like {'attrName':[[positivemean,positivestd],[negativemean,negativestd]]} for continuous attributions
        self.continuousMessage={}
    
#     all attributions value
# attr={'attrName':['all attr values(string)'],}
        self.attrValues = {}


    def initAttr(self):
#         get one sample from data
        one = self.trainData.loc[0]

#      names of all attributions
        names = one.index
#         search for all continuous attributions and discretization
#         for discrete attributions convert string to index
        for i in range(names.size-1):
#             check if it is descrete
            if(type(one[i])==str):
        
                self.discreteAttr.append(names[i])
            else:
                self.continuousAttr.append(names[i])

# for continuous attr and unnecesary to calculate its probability(lazy calculate)
# So just store attrName , mean and std into  continueMessage
    def discretization(self):
#         for positive trainData
#         and for negative trainData
        for attr in self.continuousAttr:
            _pdata = self.positiveTrainData[attr]
            
            p_u = _pdata.mean()
            p_d = _pdata.std()
            
            _ndata = self.negativeTrainData[attr]
            
            n_u = _ndata.mean()
            n_d = _ndata.std()
            
            self.continuousMessage[attr]=[[p_u,p_d],[n_u,n_d]]
            

            
            
# for one sample owns continuous atr
#     def discretizationOne(self,continuousAttr):
        
            
    
#     convert discrete attr to number
#     def discreteToNumer(self):
#         pass
    
#     get self.attrValues
    def discreteHandle(self):
        for attr in self.discreteAttr:
            self.attrValues[attr] = self.trainData[attr].unique()
    
#     def continuousHandle(self):
        
#         discretization()
#         for attr in continuousAttr:
#             self.attrValues[attr] = self.trainData[attr].unique()
    
#     just for discrete attribution
    def calculateProbabilityAndSave(self):
        attrValues = self.attrValues
        for key in self.discreteAttr:
            for attrValue in attrValues[key]:
#                     0 for positive and 1 for negative
#                 Laplacian correction
                size = attrValues[key].size
#                 nsize = self.negativeTrainData[key].unique().size
#                 psize = size - nsize
                p2 = (self.negativeTrainData[self.negativeTrainData[key]==attrValue].size+1)/(float(self.nLength)+size)                 
                p1 = (self.positiveTrainData[self.positiveTrainData[key]==attrValue].size+1)/(float(self.pLength)+size)
                self.probability[attrValue] = [p1,p2]
    
    def trainDataHandle(self):
#         fisrt get continueAttr and discreteAttr
        self.initAttr()
    
#     then handle discreteAttr and calculate probability of them

        self.discreteHandle()#first get all attrValues in each attribution for discreteAttr
    
        self.calculateProbabilityAndSave()#then get probability[attrValues][label] = pro
        
#     And then handle continuousAttr(just calculate their mean and std)
        self.discretization()
    
#     sample is a series
    def predict(self,sample):
    #     proMax = 0
        positiveProbability = self.positiveProbability
        p1 = positiveProbability
        p2 = 1 - positiveProbability
#        for discreteAttr

#         for attrValue in sample:
# #         check if it is discreteValue and label should not be calculated(exclude label)
#             if(type(attrValue)!=str or attrValue==sample['label']):
#                 continue
        for attr in self.discreteAttr:
            attrValue = sample[attr]
            p1 *=self.probability[attrValue][0]
            p2 *=self.probability[attrValue][1]
#       for continuousAttr         
        for attr in self.continuousAttr:
            p1 *=norm.pdf(sample[attr],self.continuousMessage[attr][0][0],self.continuousMessage[attr][0][1])
            p2 *=norm.pdf(sample[attr],self.continuousMessage[attr][1][0],self.continuousMessage[attr][1][1])
        if(p1>p2):
            return ' >50K',p1
        else:
            return ' <=50K',p1
    def testDataHandle(self,iterator=None):
        if(iterator==None):
            iterator=len(self.testData)
        error = 0
        for i in range(iterator):
            pred,p1 = self.predict(self.testData.iloc[i])
#             print pred
#             print(' >50K: ',p1)
            pred+='.'
            if(pred!=self.testData.iloc[i]['label']):
                error+=1
        errorRate = float(error)/iterator
        print('error rate: ',errorRate)
        return errorRate
    
    def debug(self):
        print('positive sample size: ',self.pLength)
        print('negative sample size: ',self.nLength)
        
#         print continuousAttr
        print('continueAttr: ')
        for attr in self.continuousAttr:
            print(attr+' ')
        print('\n')
#         print dsicreteAttr
        print('discreteAttr: ')
        for attr in self.discreteAttr:
            print(attr+' ')

        print('positiveProbability: ',self.positiveProbability)
        print('negativeProbability: ',self.negativeProbability)
        
#         print continuousMessage
        print('continuousMessage: ')
        for attr in self.continuousAttr:
        
            print(self.continuousMessage[attr])
        print('\n')
#         just calculate the probability of one sample(for example the first)
        sample = trainData.iloc[0]

        p2 = self.positiveProbability
        print p2
#         for discreteAttr
#         for key in self.attrValues:
            
#             for attrValue in sample:
#                 if(type(attrValue)!=str or attrValue==sample['label']):
#                     continue
#                 p2*=self.negativeTrainData[self.negativeTrainData[key]==attrValue].size/self.nLength
#                 print(key,' ',p2)
        for attr in self.attrValues:
            p2*=len(self.negativeTrainData[self.negativeTrainData[attr]==sample[attr]])/float(self.nLength)

#         for continuousAttr
        for attr in self.continuousAttr:
            p2 *=norm.pdf(sample[attr],self.continuousMessage[attr][1][0],self.continuousMessage[attr][1][1])
        print('p2: ',p2)

In [207]:
data = Data(trainData,testData)
data.trainDataHandle()

In [208]:
data.debug()


('positive sample size: ', 7841)
('negative sample size: ', 24720)
continueAttr: 
age 
fnlwgt 
education-num 
capital-gain 
capital-loss 
hours-per-week 


discreteAttr: 
workclass 
education 
marital-status 
occupation 
relationship 
race 
sex 
native-country 
('positiveProbability: ', 0.24082547676811106)
('negativeProbability: ', 0.7591745232318889)
continuousMessage: 
[[44.24984058155847, 10.51902771985177], [36.78373786407767, 14.020088490824813]]
[[188005.0, 102541.77547230694], [190340.8651699029, 106482.27119468113]]
[[11.611656676444332, 2.3851286326651047], [9.595064724919094, 2.4361467923086515]]
[[4006.142456319347, 14570.378951280984], [148.75246763754046, 963.1393073648252]]
[[195.00153041703865, 595.4875739786504], [53.14292071197411, 310.7557690957039]]
[[45.473026399693914, 11.012970930209358], [38.840210355987054, 12.318994641854317]]


0.240825476768
('p2: ', 5.9866207016993444e-23)

In [209]:
# trainData.head(1)
data.testDataHandle()


('error rate: ', 0.16872427983539096)
Out[209]:
0.16872427983539096

In [99]:
t = (1,2)
print(t)


(1, 2)

In [153]:
negativeTrainData = trainData[trainData['label']==' <=50K']
sample = trainData.iloc[0]
float(negativeTrainData[negativeTrainData['workclass']==sample['workclass']].size)/negativeTrainData.size


Out[153]:
0.03822815533980582

In [135]:
print negativeTrainData.size,negativeTrainData[negativeTrainData['workclass']==sample['workclass']].size


370800 14175

In [157]:
type(sample.index)


Out[157]:
pandas.indexes.base.Index

In [164]:
def d(a=1):
    print a
d()


1

In [201]:
# testData.size
a = testData.head()
len(a)


Out[201]:
5

In [ ]: