In [ ]:
# Try HiCS and compare it with LOF and PCA
In [49]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
from scipy import stats
import itertools
import random
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler, label_binarize
from sklearn import metrics
In [32]:
# Load the dataset (relative path -- run the notebook from the directory
# containing PCA_test.csv).
df = pd.read_csv('PCA_test.csv')
In [33]:
# Preview the first rows via the rich DataFrame display.
df.head()
Out[33]:
In [34]:
# Find columns with NaN: a boolean per column, True where any value is missing.
df.isnull().sum() > 0
Out[34]:
In [35]:
# Replace NaN with specific values.
# Assign the result back instead of `df[col].fillna(..., inplace=True)`:
# the chained-inplace pattern operates on what may be a copy, emits a
# FutureWarning, and does nothing under pandas copy-on-write semantics.
df['Item_Weight'] = df['Item_Weight'].fillna(0)
df['Outlet_Size'] = df['Outlet_Size'].fillna('MISSING')
In [36]:
# Re-check the frame after imputation.
df.head()
Out[36]:
In [37]:
# Inspect column dtypes to find the object columns that need encoding.
df.dtypes
Out[37]:
In [38]:
# Remove identifier
df = df.drop('Item_Identifier', axis=1)
# Convert Categorical to Numerical: label-encode each non-numeric column
# by taking the integer codes of its pandas Categorical representation.
categorical_cols = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Outlet_Establishment_Year',
]
for col in categorical_cols:
    df[col] = pd.Categorical(df[col]).codes
In [39]:
df.head()
Out[39]:
In [40]:
# Calculate LOF
def knn(df, k):
    """Return (distances, indices) of each row's k nearest neighbours.

    Fix: the original hard-coded ``n_neighbors=3`` and ignored ``k``.
    The parameter is now honoured; every existing caller passes k=3,
    so their behaviour is unchanged.
    """
    nbrs = NearestNeighbors(n_neighbors=k)
    nbrs.fit(df)
    distances, indices = nbrs.kneighbors(df)
    return distances, indices
def reachDist(df, MinPts, knnDist):
    """MinPts-NN distances with the first three columns clamped to each
    row's maximum neighbour distance (reachability-distance surrogate).

    Returns (distancesMinPts, indicesMinPts) from NearestNeighbors.
    """
    nbrs = NearestNeighbors(n_neighbors=MinPts)
    nbrs.fit(df)
    distancesMinPts, indicesMinPts = nbrs.kneighbors(df)
    # The original recomputed np.amax after each column write; since the
    # columns are only ever replaced by the row max, the max is invariant,
    # so computing it once is equivalent.
    row_max = np.amax(distancesMinPts, axis=1)
    for col in range(3):
        distancesMinPts[:, col] = row_max
    return distancesMinPts, indicesMinPts
def ird(MinPts, knnDistMinPts):
    """Inverse reachability density: MinPts divided by the summed
    reachability distances of each point's neighbourhood (one value
    per row of ``knnDistMinPts``)."""
    summed = np.sum(knnDistMinPts, axis=1)
    return MinPts / summed
def lof(Ird, MinPts, dsts):
    """Local outlier factor per point: the mean ratio of each
    neighbour's inverse reachability density to the point's own
    density.  ``dsts`` holds index rows whose first entry is the point
    itself and the rest its neighbours.  Returns a plain list."""
    return [np.divide(Ird[nbrhood[1:]], Ird[nbrhood[0]]).sum() / MinPts
            for nbrhood in dsts]
# Create possible subspaces in each step
def comboGenerator(startPoint, space, n):
    """All size-n supersets of ``startPoint`` drawn from ``space``,
    each returned as a sorted list.  Output order follows set iteration
    order, exactly as in the original implementation."""
    candidates = list(set(space) - set(startPoint))
    extra_needed = n - len(startPoint)
    return [sorted(startPoint + list(extra))
            for extra in itertools.combinations(candidates, extra_needed)]
In [41]:
# Calculate the index, we use this for selecting random sections in subspace
## and scaling
# Rank-transform every feature column to (0, 1]; the last column (the label)
# is excluded.
index_df = (df.rank()/df.rank().max()).iloc[:,:-1]
# Single-argument parenthesized print is valid on both Python 2 and
# Python 3 (the original bare `print` statement fails on Python 3).
print(index_df.head())
In [28]:
# NOTE(review): this cell's execution count (In[28]) is lower than the
# surrounding cells, i.e. it was executed out of order in the saved session.
# Under Restart & Run All it converts the label column to the 'category'
# dtype at this point -- confirm this is the intended position.
df['Outlet_Type'] = df['Outlet_Type'].astype('category')
In [42]:
# Verify all feature columns are now numeric (int codes).
df.dtypes
Out[42]:
In [43]:
# Start with 2-D subspaces
listOfCombos = comboGenerator([],df.columns[:-1],2)
testedCombos=[]
selection=[]
# Calculate the contrast score for each subspace
# For each subspace that satisfies the cut_off point criteria, add additional dimensions
# NOTE(review): indentation below was reconstructed from the flattened
# notebook export -- verify against the original .ipynb.  This is the only
# nesting under which the loop terminates.
while(len(listOfCombos)>0):
    if listOfCombos[0] not in testedCombos:
        # Slice width per conditioned dimension, chosen so the expected
        # selected fraction of rows is 0.2 for the whole subspace.
        alpha1 = pow(0.2,(float(1)/float(len(listOfCombos[0]))))
        pvalue_Total =0
        pvalue_cnt = 0
        avg_pvalue=0
        # 50 Monte-Carlo draws of a random slice for this subspace.
        for i in range(0,50):
            lband = random.random()
            uband = lband+alpha1
            # Pick one dimension to compare; condition on the remaining ones.
            v = random.randint(0,(len(listOfCombos[0])-1))
            rest = list(set(listOfCombos[0])-set([listOfCombos[0][v]]))
            # Two-sample KS test: marginal distribution of the chosen
            # dimension vs its distribution restricted to the random slice
            # (rows whose rank-index in every `rest` column falls in
            # (lband, uband)).  A small p-value means high contrast.
            k=stats.ks_2samp(df[listOfCombos[0][v]].values, df[((index_df[rest]<uband) & (index_df[rest]>lband)).all(axis=1)][listOfCombos[0][v]].values)
            if not(np.isnan(k.pvalue)):
                pvalue_Total = pvalue_Total+k.pvalue
                pvalue_cnt = pvalue_cnt+1
        if pvalue_cnt>0:
            avg_pvalue = pvalue_Total/pvalue_cnt
        # High contrast (average p-value < 0.25): keep the subspace and
        # enqueue its one-dimension-larger extensions.
        if (1.0-avg_pvalue)>0.75:
            selection.append(listOfCombos[0])
            listOfCombos = listOfCombos + comboGenerator(listOfCombos[0],df.columns[:-1],(len(listOfCombos[0])+1))
        testedCombos.append(listOfCombos[0])
        listOfCombos.pop(0)
        # De-duplicate the queue.  NOTE(review): round-tripping through
        # set() also reorders the queue arbitrarily.
        listOfCombos = [list(t) for t in set(map(tuple,listOfCombos))]
    else:
        listOfCombos.pop(0)
In [24]:
# selected feature combinations
# Single-argument parenthesized print is valid on both Python 2 and 3
# (the original bare `print` statement fails on Python 3).
# NOTE(review): index 7 is a hard-coded spot check; it raises IndexError
# when fewer than 8 subspaces were selected.
print(selection[7])
In [47]:
# check unique values in a column
# (Outlet_Type is used as the label for the AUC evaluation, so its codes
# must be a subset of the classes passed to label_binarize.)
df.Outlet_Type.unique()
Out[47]:
In [50]:
# Calculate the contrast score 50 times for each subspace, average the contrast scores from iterations
scoresList=[]
for item in selection:
    m=50
    knndist, knnindices = knn(df[item],3)
    reachdist, reachindices = reachDist(df[item],m,knndist)
    irdMatrix = ird(m,reachdist)
    lofScores = lof(irdMatrix,m,reachindices)
    scoresList.append(lofScores)
# Calculate average LOF score for each data point from each subspace
# (masked_invalid excludes inf/NaN scores caused by zero densities)
avgs = np.nanmean(np.ma.masked_invalid(np.array(scoresList)),axis=0)
# Scale the results to 0,1 range
scaled_avgs = MinMaxScaler().fit_transform(avgs.reshape(-1,1))
# Here is the AUC score from HiCS
# Fixes: "HCiS" typo in the printed label -> "HiCS"; parenthesized
# single-argument print works on both Python 2 and Python 3.
print("HiCS AUC Score")
print(metrics.roc_auc_score(label_binarize(df['Outlet_Type'], classes=[0, 1, 2, 3]),scaled_avgs)) # CHANGE to your label name
# Got
# HiCS AUC Score: 0.605753326865
In [52]:
# Calculate LOF AUC on the full feature space (all columns except the label),
# for comparison against the HiCS-subspace score above.
m=50
knndist, knnindices = knn(df.iloc[:,:-1],3)
reachdist, reachindices = reachDist(df.iloc[:,:-1],m,knndist)
irdMatrix = ird(m,reachdist)
lofScores = lof(irdMatrix,m,reachindices)
ss=MinMaxScaler().fit_transform(np.array(lofScores).reshape(-1,1))
# Parenthesized single-argument print works on both Python 2 and Python 3
# (the original bare `print` statement fails on Python 3).
print("LOF AUC Score")
print(metrics.roc_auc_score(label_binarize(df['Outlet_Type'], classes=[0, 1, 2, 3]),ss)) # CHANGE to your label name