In [ ]:
# Try HiCS and compare it with LOF and PCA
In [49]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
from scipy import stats
import itertools
import random
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler, label_binarize
from sklearn import metrics
In [32]:
# Load the dataset (relative path -- run the notebook from the directory
# containing PCA_test.csv).
df = pd.read_csv('PCA_test.csv')
In [33]:
# Preview the first rows via the rich DataFrame display.
df.head()
Out[33]:
In [34]:
# Find columns with NaN: a boolean per column, True where any value is missing.
df.isnull().sum() > 0
Out[34]:
In [35]:
# Replace NaN with specific values.
# Assign the result back instead of `df[col].fillna(..., inplace=True)`:
# the chained-inplace pattern operates on what may be a copy, emits a
# FutureWarning, and does nothing under pandas copy-on-write semantics.
df['Item_Weight'] = df['Item_Weight'].fillna(0)
df['Outlet_Size'] = df['Outlet_Size'].fillna('MISSING')
In [36]:
# Re-check the frame after imputation.
df.head()
Out[36]:
In [37]:
# Inspect column dtypes to find the object columns that need encoding.
df.dtypes
Out[37]:
In [38]:
# Remove identifier
df = df.drop('Item_Identifier', axis=1)
# Convert Categorical to Numerical: label-encode each non-numeric column
# by taking the integer codes of its pandas Categorical representation.
categorical_cols = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Outlet_Establishment_Year',
]
for col in categorical_cols:
    df[col] = pd.Categorical(df[col]).codes
In [39]:
df.head()
Out[39]:
In [40]:
# Calculate LOF
def knn(df, k):
    """Return (distances, indices) of each row's k nearest neighbours.

    Fix: the original hard-coded ``n_neighbors=3`` and ignored ``k``.
    The parameter is now honoured; every existing caller passes k=3,
    so their behaviour is unchanged.
    """
    nbrs = NearestNeighbors(n_neighbors=k)
    nbrs.fit(df)
    distances, indices = nbrs.kneighbors(df)
    return distances, indices
def reachDist(df, MinPts, knnDist):
    """MinPts-NN distances with the first three columns clamped to each
    row's maximum neighbour distance (reachability-distance surrogate).

    Returns (distancesMinPts, indicesMinPts) from NearestNeighbors.
    """
    nbrs = NearestNeighbors(n_neighbors=MinPts)
    nbrs.fit(df)
    distancesMinPts, indicesMinPts = nbrs.kneighbors(df)
    # The original recomputed np.amax after each column write; since the
    # columns are only ever replaced by the row max, the max is invariant,
    # so computing it once is equivalent.
    row_max = np.amax(distancesMinPts, axis=1)
    for col in range(3):
        distancesMinPts[:, col] = row_max
    return distancesMinPts, indicesMinPts
def ird(MinPts, knnDistMinPts):
    """Inverse reachability density: MinPts divided by the summed
    reachability distances of each point's neighbourhood (one value
    per row of ``knnDistMinPts``)."""
    summed = np.sum(knnDistMinPts, axis=1)
    return MinPts / summed
def lof(Ird, MinPts, dsts):
    """Local outlier factor per point: the mean ratio of each
    neighbour's inverse reachability density to the point's own
    density.  ``dsts`` holds index rows whose first entry is the point
    itself and the rest its neighbours.  Returns a plain list."""
    return [np.divide(Ird[nbrhood[1:]], Ird[nbrhood[0]]).sum() / MinPts
            for nbrhood in dsts]
# Create possible subspaces in each step
def comboGenerator(startPoint, space, n):
    """All size-n supersets of ``startPoint`` drawn from ``space``,
    each returned as a sorted list.  Output order follows set iteration
    order, exactly as in the original implementation."""
    candidates = list(set(space) - set(startPoint))
    extra_needed = n - len(startPoint)
    return [sorted(startPoint + list(extra))
            for extra in itertools.combinations(candidates, extra_needed)]
In [41]:
# Calculate the index, we use this for selecting random sections in subspace
## and scaling
# Rank-transform every feature column to (0, 1]; the last column (the label)
# is excluded.
index_df = (df.rank()/df.rank().max()).iloc[:,:-1]
# Single-argument parenthesized print is valid on both Python 2 and
# Python 3 (the original bare `print` statement fails on Python 3).
print(index_df.head())
In [28]:
# NOTE(review): this cell's execution count (In[28]) is lower than the
# surrounding cells, i.e. it was executed out of order in the saved session.
# Under Restart & Run All it converts the label column to the 'category'
# dtype at this point -- confirm this is the intended position.
df['Outlet_Type'] = df['Outlet_Type'].astype('category')
In [42]:
# Verify all feature columns are now numeric (int codes).
df.dtypes
Out[42]:
In [43]:
# Start with 2-D subspaces
listOfCombos = comboGenerator([],df.columns[:-1],2)
testedCombos=[]
selection=[]
# Calculate the contrast score for each subspace
# For each subspace that satisfies the cut_off point criteria, add additional dimensions
# NOTE(review): indentation below was reconstructed from the flattened
# notebook export -- verify against the original .ipynb.  This is the only
# nesting under which the loop terminates.
while(len(listOfCombos)>0):
    if listOfCombos[0] not in testedCombos:
        # Slice width per conditioned dimension, chosen so the expected
        # selected fraction of rows is 0.2 for the whole subspace.
        alpha1 = pow(0.2,(float(1)/float(len(listOfCombos[0]))))
        pvalue_Total =0
        pvalue_cnt = 0
        avg_pvalue=0
        # 50 Monte-Carlo draws of a random slice for this subspace.
        for i in range(0,50):
            lband = random.random()
            uband = lband+alpha1
            # Pick one dimension to compare; condition on the remaining ones.
            v = random.randint(0,(len(listOfCombos[0])-1))
            rest = list(set(listOfCombos[0])-set([listOfCombos[0][v]]))
            # Two-sample KS test: marginal distribution of the chosen
            # dimension vs its distribution restricted to the random slice
            # (rows whose rank-index in every `rest` column falls in
            # (lband, uband)).  A small p-value means high contrast.
            k=stats.ks_2samp(df[listOfCombos[0][v]].values, df[((index_df[rest]<uband) & (index_df[rest]>lband)).all(axis=1)][listOfCombos[0][v]].values)
            if not(np.isnan(k.pvalue)):
                pvalue_Total = pvalue_Total+k.pvalue
                pvalue_cnt = pvalue_cnt+1
        if pvalue_cnt>0:
            avg_pvalue = pvalue_Total/pvalue_cnt
        # High contrast (average p-value < 0.25): keep the subspace and
        # enqueue its one-dimension-larger extensions.
        if (1.0-avg_pvalue)>0.75:
            selection.append(listOfCombos[0])
            listOfCombos = listOfCombos + comboGenerator(listOfCombos[0],df.columns[:-1],(len(listOfCombos[0])+1))
        testedCombos.append(listOfCombos[0])
        listOfCombos.pop(0)
        # De-duplicate the queue.  NOTE(review): round-tripping through
        # set() also reorders the queue arbitrarily.
        listOfCombos = [list(t) for t in set(map(tuple,listOfCombos))]
    else:
        listOfCombos.pop(0)
In [24]:
# selected feature combinations
# Single-argument parenthesized print is valid on both Python 2 and 3
# (the original bare `print` statement fails on Python 3).
# NOTE(review): index 7 is a hard-coded spot check; it raises IndexError
# when fewer than 8 subspaces were selected.
print(selection[7])
In [47]:
# check unique values in a column
# (Outlet_Type is used as the label for the AUC evaluation, so its codes
# must be a subset of the classes passed to label_binarize.)
df.Outlet_Type.unique()
Out[47]:
In [50]:
# Calculate the contrast score 50 times for each subspace, average the contrast scores from iterations
scoresList=[]
for item in selection:
    m=50
    knndist, knnindices = knn(df[item],3)
    reachdist, reachindices = reachDist(df[item],m,knndist)
    irdMatrix = ird(m,reachdist)
    lofScores = lof(irdMatrix,m,reachindices)
    scoresList.append(lofScores)
# Calculate average LOF score for each data point from each subspace
# (masked_invalid excludes inf/NaN scores caused by zero densities)
avgs = np.nanmean(np.ma.masked_invalid(np.array(scoresList)),axis=0)
# Scale the results to 0,1 range
scaled_avgs = MinMaxScaler().fit_transform(avgs.reshape(-1,1))
# Here is the AUC score from HiCS
# Fixes: "HCiS" typo in the printed label -> "HiCS"; parenthesized
# single-argument print works on both Python 2 and Python 3.
print("HiCS AUC Score")
print(metrics.roc_auc_score(label_binarize(df['Outlet_Type'], classes=[0, 1, 2, 3]),scaled_avgs)) # CHANGE to your label name
# Got
# HiCS AUC Score: 0.605753326865
In [52]:
# Calculate LOF AUC on the full feature space (all columns except the label),
# for comparison against the HiCS-subspace score above.
m=50
knndist, knnindices = knn(df.iloc[:,:-1],3)
reachdist, reachindices = reachDist(df.iloc[:,:-1],m,knndist)
irdMatrix = ird(m,reachdist)
lofScores = lof(irdMatrix,m,reachindices)
ss=MinMaxScaler().fit_transform(np.array(lofScores).reshape(-1,1))
# Parenthesized single-argument print works on both Python 2 and Python 3
# (the original bare `print` statement fails on Python 3).
print("LOF AUC Score")
print(metrics.roc_auc_score(label_binarize(df['Outlet_Type'], classes=[0, 1, 2, 3]),ss)) # CHANGE to your label name