In [ ]:
# Try HiCS and compare it with LOF and PCA

In [49]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
from scipy import stats
import itertools
import random
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler, label_binarize
from sklearn import metrics

In [32]:
# Load the dataset (relative path: expects PCA_test.csv next to the notebook).
df = pd.read_csv('PCA_test.csv')

In [33]:
# Preview the raw data; note the NaNs in Item_Weight and Outlet_Size.
df.head()


Out[33]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type
0 FDW58 20.750 Low Fat 0.007565 Snack Foods 107.8622 OUT049 1999 Medium Tier 1 Supermarket Type1
1 FDW14 8.300 reg 0.038428 Dairy 87.3198 OUT017 2007 NaN Tier 2 Supermarket Type1
2 NCN55 14.600 Low Fat 0.099575 Others 241.7538 OUT010 1998 NaN Tier 3 Grocery Store
3 FDQ58 7.315 Low Fat 0.015388 Snack Foods 155.0340 OUT017 2007 NaN Tier 2 Supermarket Type1
4 FDY38 NaN Regular 0.118599 Dairy 234.2300 OUT027 1985 Medium Tier 3 Supermarket Type3

In [34]:
# Find columns with NaN
# df.isnull().any() is the idiomatic equivalent of pd.isnull(df).sum() > 0
# and produces the identical boolean Series (True = column has missing values).
df.isnull().any()


Out[34]:
Item_Identifier              False
Item_Weight                   True
Item_Fat_Content             False
Item_Visibility              False
Item_Type                    False
Item_MRP                     False
Outlet_Identifier            False
Outlet_Establishment_Year    False
Outlet_Size                   True
Outlet_Location_Type         False
Outlet_Type                  False
dtype: bool

In [35]:
# replace NaN with specific values
# Assignment instead of Series.fillna(..., inplace=True): chained inplace
# fillna on a column selection is deprecated in modern pandas and can
# silently operate on a copy. Replacement values are unchanged.
# NOTE(review): imputing Item_Weight with 0 skews distance-based scores;
# the column mean/median may be more appropriate — confirm intent.
df['Item_Weight'] = df['Item_Weight'].fillna(0)
df['Outlet_Size'] = df['Outlet_Size'].fillna('MISSING')

In [36]:
# Verify the NaN replacements took effect (0 and 'MISSING' now appear).
df.head()


Out[36]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type
0 FDW58 20.750 Low Fat 0.007565 Snack Foods 107.8622 OUT049 1999 Medium Tier 1 Supermarket Type1
1 FDW14 8.300 reg 0.038428 Dairy 87.3198 OUT017 2007 MISSING Tier 2 Supermarket Type1
2 NCN55 14.600 Low Fat 0.099575 Others 241.7538 OUT010 1998 MISSING Tier 3 Grocery Store
3 FDQ58 7.315 Low Fat 0.015388 Snack Foods 155.0340 OUT017 2007 MISSING Tier 2 Supermarket Type1
4 FDY38 0.000 Regular 0.118599 Dairy 234.2300 OUT027 1985 Medium Tier 3 Supermarket Type3

In [37]:
# Inspect dtypes: several object columns still need numeric encoding.
df.dtypes


Out[37]:
Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
dtype: object

In [38]:
# Remove identifier
df = df.drop('Item_Identifier', axis=1)

# Convert Categorical to Numerical.
# The seven columns below all receive the same treatment, so encode them in
# one loop instead of seven copy-pasted stanzas; pd.Categorical(col).codes
# yields the same int8 codes the original produced column by column.
categorical_columns = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Outlet_Establishment_Year',
]
for column in categorical_columns:
    df[column] = pd.Categorical(df[column]).codes

In [39]:
# Confirm every column is now an integer code or float.
df.head()


Out[39]:
Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type
0 20.750 1 0.007565 13 107.8622 9 4 2 0 1
1 8.300 4 0.038428 4 87.3198 2 7 1 1 1
2 14.600 1 0.099575 11 241.7538 0 3 1 2 0
3 7.315 1 0.015388 13 155.0340 2 7 1 1 1
4 0.000 2 0.118599 4 234.2300 5 0 2 2 3

In [40]:
# Calculate LOF
def knn(df, k):
    """Return the k-nearest-neighbour distances and indices for each row of df.

    Bug fix: the original hard-coded n_neighbors=3 and silently ignored the
    ``k`` argument. The parameter is now honoured; every caller in this
    notebook passes k=3, so existing results are unchanged.
    """
    nbrs = NearestNeighbors(n_neighbors=k)
    nbrs.fit(df)
    distances, indices = nbrs.kneighbors(df)
    return distances, indices

def reachDist(df, MinPts, knnDist):
    """Fit a MinPts-NN model and return reachability-style distances/indices.

    The first three distance columns are overwritten with each row's maximum
    neighbour distance (its k-distance), which is how the downstream ird()
    consumes them. The ``knnDist`` argument is accepted but unused, exactly
    as in the original. NOTE(review): assumes MinPts >= 3 (this notebook
    always uses m=50) — smaller values raise IndexError, as before.
    """
    model = NearestNeighbors(n_neighbors=MinPts)
    model.fit(df)
    distancesMinPts, indicesMinPts = model.kneighbors(df)
    # Replacing a column with the row max cannot change that max, so one
    # precomputed row max assigned to columns 0..2 reproduces the original
    # triple amax/assign sequence exactly.
    peak = np.amax(distancesMinPts, axis=1)
    for col in range(3):
        distancesMinPts[:, col] = peak
    return distancesMinPts, indicesMinPts

def ird(MinPts, knnDistMinPts):
    """Inverse reachability density: MinPts over each row's summed distances.

    Rows whose distances sum to zero (e.g. exact duplicate points) produce
    inf via numpy's divide-by-zero warning; downstream code masks those.
    """
    summed = np.sum(knnDistMinPts, axis=1)
    return MinPts / summed

def lof(Ird, MinPts, dsts):
    """Local Outlier Factor score for every point.

    Each row of ``dsts`` lists a point's own index first and its neighbours
    after it; the score is the mean ratio of the neighbours' densities to
    the point's own density. Returns a plain Python list of scores.
    """
    scores = []
    for neighborhood in dsts:
        ratios = np.divide(Ird[neighborhood[1:]], Ird[neighborhood[0]])
        scores.append(ratios.sum() / MinPts)
    return scores


# Create possible subspaces in each step
def comboGenerator(startPoint, space, n):
    """Extend the dimension subset ``startPoint`` to every sorted n-element
    subset of ``space`` that contains it.

    NOTE(review): iterating over set(space) makes the output order depend on
    hash order — the original behaved the same way.
    """
    remaining = list(set(space) - set(startPoint))
    return [sorted(startPoint + list(extra))
            for extra in itertools.combinations(remaining, n - len(startPoint))]

In [41]:
# Calculate the index, we use this for selecting random sections in subspace
## and scaling

# Rank-transform every feature (dropping the last column, the label) into
# the (0, 1] range; the HiCS slicing below conditions on these ranks.
index_df = (df.rank()/df.rank().max()).iloc[:,:-1]
print index_df.head()


   Item_Weight  Item_Fat_Content  Item_Visibility  Item_Type  Item_MRP  \
0     0.985908          0.337528         0.073755   0.869062  0.332160   
1     0.361723          1.000000         0.364725   0.238659  0.210790   
2     0.682667          0.337528         0.779792   0.784558  0.931526   
3     0.296547          0.337528         0.130611   0.869062  0.580708   
4     0.086049          0.809925         0.847914   0.238659  0.908819   

   Outlet_Identifier  Outlet_Establishment_Year  Outlet_Size  \
0           1.000000                   0.539321     0.646505   
1           0.242018                   0.885063     0.291577   
2           0.034534                   0.447185     0.291577   
3           0.242018                   0.885063     0.291577   
4           0.538118                   0.090926     0.646505   

   Outlet_Location_Type  
0              0.174480  
1              0.552136  
2              1.000000  
3              0.552136  
4              1.000000  

In [28]:
# NOTE(review): execution count In[28] is out of order — Out[42] below shows
# Outlet_Type as int8, so this cast appears to have been overwritten by a
# later re-run of the encoding cell. Stale under Restart & Run All; confirm
# whether the category cast is still wanted.
df['Outlet_Type'] = df['Outlet_Type'].astype('category')

In [42]:
# Inspect final dtypes before running the distance-based detectors.
df.dtypes


Out[42]:
Item_Weight                  float64
Item_Fat_Content                int8
Item_Visibility              float64
Item_Type                       int8
Item_MRP                     float64
Outlet_Identifier               int8
Outlet_Establishment_Year       int8
Outlet_Size                     int8
Outlet_Location_Type            int8
Outlet_Type                     int8
dtype: object

In [43]:
# Start with 2-D subspaces
# HiCS-style contrast search: breadth-first over candidate subspaces, keeping
# a subspace when its average deviation (1 - mean KS p-value) over 50 random
# slices exceeds 0.75, then extending kept subspaces by one dimension.
listOfCombos = comboGenerator([],df.columns[:-1],2)
testedCombos=[]
selection=[]
# Calculate the contrast score for each subspace
# For each subspace that satisfies the cut_off point criteria, add additional dimensions
while(len(listOfCombos)>0):
    if listOfCombos[0] not in testedCombos:
        # Slice side-length chosen so the expected slice holds ~20% of the
        # data in a d-dimensional subspace: alpha1 = 0.2 ** (1/d).
        alpha1 = pow(0.2,(float(1)/float(len(listOfCombos[0]))))
        pvalue_Total =0
        pvalue_cnt = 0
        avg_pvalue=0
        # 50 Monte-Carlo slices per candidate. NOTE(review): no random seed
        # is set anywhere, so the selected subspaces vary between runs.
        for i in range(0,50):
            # Random window in rank space. NOTE(review): uband = lband + alpha1
            # can exceed 1.0, shrinking the effective slice — confirm intent.
            lband = random.random()
            uband = lband+alpha1
            # Hold out one randomly chosen dimension; condition on the rest.
            v = random.randint(0,(len(listOfCombos[0])-1))
            rest = list(set(listOfCombos[0])-set([listOfCombos[0][v]]))
            # Two-sample KS test: the held-out dimension's conditional
            # marginal (inside the slice) vs. its full marginal.
            k=stats.ks_2samp(df[listOfCombos[0][v]].values, df[((index_df[rest]<uband) & (index_df[rest]>lband)).all(axis=1)][listOfCombos[0][v]].values)
            if not(np.isnan(k.pvalue)):
                pvalue_Total = pvalue_Total+k.pvalue
                pvalue_cnt = pvalue_cnt+1
        if pvalue_cnt>0:
            avg_pvalue = pvalue_Total/pvalue_cnt
        # High contrast (low average p-value) => keep the subspace and
        # enqueue its (d+1)-dimensional extensions.
        if (1.0-avg_pvalue)>0.75:
            selection.append(listOfCombos[0])
            listOfCombos = listOfCombos + comboGenerator(listOfCombos[0],df.columns[:-1],(len(listOfCombos[0])+1))
        testedCombos.append(listOfCombos[0])
        listOfCombos.pop(0)
        # Deduplicate the pending queue (round-trip through a set of tuples;
        # this also scrambles the queue order).
        listOfCombos = [list(t) for t in set(map(tuple,listOfCombos))]
    else:
        listOfCombos.pop(0)

In [24]:
# selected feature combinations

# Show one of the retained high-contrast subspaces.
# NOTE(review): the hard-coded index 7 assumes at least 8 subspaces were
# kept — the unseeded search may return fewer on a re-run.
print selection[7]


['Item_MRP', 'Item_Visibility', 'Outlet_Establishment_Year', 'Outlet_Identifier', 'Outlet_Location_Type', 'Outlet_Size']

In [47]:
# check unique values in a column

# Confirms the label takes exactly the four codes 0-3 used by
# label_binarize(classes=[0, 1, 2, 3]) in the AUC cells below.
df.Outlet_Type.unique()


Out[47]:
array([1, 0, 3, 2])

In [50]:
# Calculate the contrast score 50 times for each subspace, average the contrast scores from iterations
scoresList=[]
for item in selection:
    m=50
    knndist, knnindices = knn(df[item],3)
    reachdist, reachindices = reachDist(df[item],m,knndist)
    irdMatrix = ird(m,reachdist)
    lofScores = lof(irdMatrix,m,reachindices)
    scoresList.append(lofScores)

# Calculate average LOF score for each data point from each subspace
avgs = np.nanmean(np.ma.masked_invalid(np.array(scoresList)),axis=0)

# Scale the results to 0,1 range
scaled_avgs = MinMaxScaler().fit_transform(avgs.reshape(-1,1))

# Here is the AUC score from HiCS
print "HCiS AUC Score"
print metrics.roc_auc_score(label_binarize(df['Outlet_Type'], classes=[0, 1, 2, 3]),scaled_avgs)  # CHANGE to your label name

# Got 
# HCiS AUC Score: 0.605753326865


HCiS AUC Score
0.605753326865
/Library/Python/2.7/site-packages/ipykernel-4.2.1-py2.7.egg/ipykernel/__main__.py:18: RuntimeWarning: divide by zero encountered in divide
/Library/Python/2.7/site-packages/ipykernel-4.2.1-py2.7.egg/ipykernel/__main__.py:23: RuntimeWarning: invalid value encountered in divide

In [52]:
# Calculate LOF AUC
# Baseline: plain LOF over the full feature space (all columns except the
# Outlet_Type label), for comparison against the HiCS-guided score above.

m=50
knndist, knnindices = knn(df.iloc[:,:-1],3)
reachdist, reachindices = reachDist(df.iloc[:,:-1],m,knndist)
irdMatrix = ird(m,reachdist)
lofScores = lof(irdMatrix,m,reachindices)
# Scale scores to [0, 1] before computing the AUC, matching the HiCS cell.
ss=MinMaxScaler().fit_transform(np.array(lofScores).reshape(-1,1))
print "LOF AUC Score"
print metrics.roc_auc_score(label_binarize(df['Outlet_Type'], classes=[0, 1, 2, 3]),ss)   # CHANGE to your label name


LOF AUC Score
0.555114263833