In [10]:
# import caffe
# normal distribution
from scipy.stats import norm
In [1]:
# set up Python environment: numpy for numerical routines, and matplotlib for plotting
import numpy as np
import matplotlib.pyplot as plt
# display plots in this notebook
%matplotlib inline
# Display defaults, applied in a single call.
plt.rcParams.update({
    'figure.figsize': (10, 10),        # large images
    'image.interpolation': 'nearest',  # no interpolation: show square pixels
    'image.cmap': 'gray',              # grayscale rather than a (potentially misleading) color heatmap
})
import pandas as pd
In [57]:
# Column names are supplied explicitly and applied to both files.
names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'label',
]
# Both files are comma-separated; the first line of the test file is skipped.
trainData = pd.read_csv('dataset/adult.data', names=names, index_col=None)
testData = pd.read_csv('dataset/adult.test', names=names, index_col=None, skiprows=1)
In [55]:
# Preview the first rows of the test table.
testData.head()
Out[55]:
In [14]:
# Print every field of the first training row, one value per line.
# print() with a single argument behaves the same on Python 2 and Python 3.
for value in trainData.iloc[0]:
    print(value)
In [206]:
class Data:
    """Naive Bayes classifier over a pandas table with a ``label`` column.

    Attributes whose value in the first training row is a string are treated
    as discrete and modelled with Laplace-smoothed frequency estimates; all
    other attributes are treated as continuous and modelled with one normal
    distribution per class.  The two classes are the label strings ' >50K'
    (positive) and ' <=50K' (negative), leading space included.

    Usage::

        model = Data(trainData, testData)
        model.trainDataHandle()
        model.testDataHandle()
    """

    def __init__(self,data,testData):
        """Store both tables and derive the class sizes and priors.

        data     -- training table (DataFrame) containing a 'label' column
        testData -- table to evaluate on; see testDataHandle for the label
                    format it is compared against
        """
        self.trainData = data
        self.testData = testData
        # names of the discrete (string-valued) attributes
        self.discreteAttr = []
        # names of the continuous attributes
        self.continuousAttr = []
        # training rows of each class
        self.positiveTrainData = data[data['label']==' >50K']
        self.negativeTrainData = data[data['label']==' <=50K']
        self.length = len(data)
        self.nLength = len(self.negativeTrainData)
        # every row that is not negative is counted as positive
        self.pLength = self.length - self.nLength
        # class priors with Laplacian correction (two classes)
        self.positiveProbability = (float(self.pLength)+1)/(self.length + 2)
        self.negativeProbability = 1 - self.positiveProbability
        # probability[attrName][attrValue] = [P(value|positive), P(value|negative)]
        # Keyed by attribute first so that a value shared by several columns
        # (for example a missing-value marker) keeps one entry per column.
        self.probability = {}
        # continuousMessage[attrName] = [[positive mean, positive std],
        #                                [negative mean, negative std]]
        self.continuousMessage = {}
        # attrValues[attrName] = distinct training values of a discrete attribute
        self.attrValues = {}

    def initAttr(self):
        """Split the non-label columns into discrete and continuous attributes.

        The decision is made from the first training row: a string value marks
        the column as discrete, anything else as continuous.
        """
        # Start from empty lists so calling this twice does not register every
        # attribute twice (which would square its factor in predict).
        self.discreteAttr = []
        self.continuousAttr = []
        # iloc: first row by position, independent of the index labels
        first = self.trainData.iloc[0]
        for name in first.index:
            if name == 'label':
                continue
            if isinstance(first[name], str):
                self.discreteAttr.append(name)
            else:
                self.continuousAttr.append(name)

    def discretization(self):
        """Record per-class mean and standard deviation of continuous attributes.

        Nothing is binned: the statistics are stored in continuousMessage and
        turned into densities lazily inside predict.
        """
        for attr in self.continuousAttr:
            positive = self.positiveTrainData[attr]
            negative = self.negativeTrainData[attr]
            self.continuousMessage[attr] = [
                [positive.mean(), positive.std()],
                [negative.mean(), negative.std()],
            ]

    def discreteHandle(self):
        """Collect the distinct training values of every discrete attribute."""
        for attr in self.discreteAttr:
            self.attrValues[attr] = self.trainData[attr].unique()

    def calculateProbabilityAndSave(self):
        """Estimate P(value | class) for every discrete attribute value.

        Uses Laplacian correction: (row count + 1) / (class size + number of
        distinct values of the attribute).  Requires discreteHandle to have
        filled attrValues.
        """
        self.probability = {}
        for attr in self.discreteAttr:
            values = self.attrValues[attr]
            size = len(values)
            pColumn = self.positiveTrainData[attr]
            nColumn = self.negativeTrainData[attr]
            table = {}
            for attrValue in values:
                # Count matching ROWS.  DataFrame.size would count cells
                # (rows x columns) and inflate the estimate.
                pCount = int((pColumn == attrValue).sum())
                nCount = int((nColumn == attrValue).sum())
                table[attrValue] = [
                    (pCount + 1) / (float(self.pLength) + size),
                    (nCount + 1) / (float(self.nLength) + size),
                ]
            self.probability[attr] = table

    def trainDataHandle(self):
        """Fit the model: classify attributes, then estimate their statistics."""
        # first split the columns into continuous and discrete attributes
        self.initAttr()
        # discrete attributes: distinct values, then smoothed likelihoods
        self.discreteHandle()
        self.calculateProbabilityAndSave()
        # continuous attributes: per-class mean and std only
        self.discretization()

    def predict(self,sample):
        """Classify one row.

        sample -- a Series holding a value for every attribute

        Returns (label, p1) where label is ' >50K' or ' <=50K' and p1 is the
        unnormalised positive-class score.  p1 is returned for both outcomes.
        """
        p1 = self.positiveProbability
        p2 = 1 - self.positiveProbability
        # discrete attributes
        for attr in self.discreteAttr:
            size = len(self.attrValues[attr])
            # A value never seen in training has a count of zero in both
            # classes; fall back to its smoothed estimate instead of failing
            # with a KeyError.
            unseen = [
                1 / (float(self.pLength) + size),
                1 / (float(self.nLength) + size),
            ]
            likelihood = self.probability[attr].get(sample[attr], unseen)
            p1 *= likelihood[0]
            p2 *= likelihood[1]
        # continuous attributes: normal density under each class
        for attr in self.continuousAttr:
            positive, negative = self.continuousMessage[attr]
            p1 *= norm.pdf(sample[attr], positive[0], positive[1])
            p2 *= norm.pdf(sample[attr], negative[0], negative[1])
        if p1 > p2:
            return ' >50K',p1
        return ' <=50K',p1

    def testDataHandle(self,iterator=None):
        """Predict the first ``iterator`` test rows and report the error rate.

        iterator -- number of leading test rows to evaluate; None means all

        A '.' is appended to each prediction before it is compared with the
        test label, so the test labels are expected to end with '.'.
        Prints and returns the fraction of mismatches.
        """
        if iterator is None:
            iterator = len(self.testData)
        error = 0
        for i in range(iterator):
            row = self.testData.iloc[i]
            pred,p1 = self.predict(row)
            pred += '.'
            if pred != row['label']:
                error += 1
        errorRate = float(error)/iterator
        print('error rate: ',errorRate)
        return errorRate

    def debug(self):
        """Print the fitted state and a hand-computed negative score.

        The score is computed for the first training row from raw (unsmoothed)
        frequencies, as a cross-check against predict.
        """
        print('positive sample size: ',self.pLength)
        print('negative sample size: ',self.nLength)
        print('continueAttr: ')
        for attr in self.continuousAttr:
            print(attr+' ')
        print('\n')
        print('discreteAttr: ')
        for attr in self.discreteAttr:
            print(attr+' ')
        print('positiveProbability: ',self.positiveProbability)
        print('negativeProbability: ',self.negativeProbability)
        print('continuousMessage: ')
        for attr in self.continuousAttr:
            print(self.continuousMessage[attr])
        print('\n')
        # Use the instance's own training table, not a notebook global.
        sample = self.trainData.iloc[0]
        # Every factor below is a negative-class likelihood, so the product
        # starts from the negative prior.
        p2 = self.negativeProbability
        print(p2)
        # discrete attributes: raw relative frequency among negative rows
        for attr in self.discreteAttr:
            matches = self.negativeTrainData[attr] == sample[attr]
            p2 *= int(matches.sum())/float(self.nLength)
        # continuous attributes
        for attr in self.continuousAttr:
            p2 *= norm.pdf(sample[attr],self.continuousMessage[attr][1][0],self.continuousMessage[attr][1][1])
        print('p2: ',p2)
In [207]:
# Build the classifier from the training and test tables, then fit it
# (attribute typing, discrete likelihoods, continuous mean/std).
data = Data(trainData,testData)
data.trainDataHandle()
In [208]:
# Print the fitted class sizes, priors, attribute lists and continuous statistics.
data.debug()
In [209]:
# Evaluate on every test row (iterator defaults to None); prints and returns the error rate.
data.testDataHandle()
Out[209]:
In [99]:
# Scratch check: build a two-element tuple and print it.
t = 1, 2
print(t)
In [153]:
# Share of negative-class rows whose workclass equals that of the first training row.
# DataFrame.size counts cells (rows x columns); the column factor cancels in this ratio.
sample = trainData.iloc[0]
negativeTrainData = trainData.loc[trainData['label'] == ' <=50K']
float(negativeTrainData.loc[negativeTrainData['workclass'] == sample['workclass']].size) / negativeTrainData.size
Out[153]:
In [135]:
# Cell counts (rows x columns) of the negative subset and of its rows sharing
# the sample's workclass.  Formatting into one string keeps the "a b" output
# identical on Python 2 and Python 3.
print('{0} {1}'.format(
    negativeTrainData.size,
    negativeTrainData[negativeTrainData['workclass']==sample['workclass']].size,
))
In [157]:
# Show the class of the index object carried by a single-row Series.
type(sample.index)
Out[157]:
In [164]:
def d(a=1):
    """Print ``a``, which defaults to 1 when omitted."""
    # print() with a single argument behaves the same on Python 2 and Python 3.
    print(a)


d()
In [201]:
# testData.size
# Take the default five-row preview and show how many rows it holds.
a = testData.head(n=5)
len(a)
Out[201]:
In [ ]: