Obidroid Learning Notebook

Loading App Data

I shall be using the previously exported exports/appFeatures.csv data for this notebook



In [1]:

    
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

## Load the appFeatures file
main_appData = pd.read_csv('exports/appFeatures.csv')



In [2]:

    
# This cell creates a dataframe with the same amount of unfair and fair apps, where the fair apps are randomly selected
# TODO - Create a method that implements k-fold validation for app selection
import random

## create unfair apps dataframe and count
df_unfair = main_appData[main_appData.appLabel == 'unfair']
unfair_count = df_unfair.appLabel.count()

## get same number of random fair apps
df_randomly_fair = main_appData[main_appData.appLabel == 'fair'].ix[random.sample(main_appData[main_appData.appLabel == 'fair'].index, unfair_count)]
#print df_randomly_fair.appLabel.count()


# append the newly created dataframe of unfair * 2 rows
appData = df_randomly_fair.append(df_unfair)

# shuffle the dataframe
appData = appData.ix[np.random.permutation(appData.index)]

appData.columns









    Out[2]:





Index([u'appName', u'adjectiveCount', u'avgRating', u'countCapital', u'exclamationCount', u'hasDeveloperEmail', u'hasDeveloperWebsite', u'Unnamed: 7', u'hasPrivacy', u'installs', u'price', u'revSent', u'revLength', u'appLabel'], dtype='object')



In [3]:

    
def trimDataFrame(df):
    """
    Lets create a new dataframe for appFeatures and appLabels
    """

    ## for App Features
    appCols = set(df.columns)
    appCols.remove('appName') # remove app Names column
    appCols.remove('Unnamed: 7') # removing a weird unnamed column
    appCols.remove('appLabel') # removing the label column
    appCols.remove('price') # removing price since most of the apps are free
    appCols.remove('exclamationCount') # remove exclamation count from features, as all values seemed to be 0
    
    df_trim = df[list(appCols)]
    
    # -- boolean
    df_trim['hasPrivacy'].astype(bool)
    df_trim['hasDeveloperEmail'].astype(bool)
    df_trim['hasDeveloperWebsite'].astype(bool)
    
    # -- integer
    df_trim['adjectiveCount'].astype(int)
    df_trim['countCapital'].astype(int)
    df_trim['installs'].astype(int)
    df_trim['revSent'].astype(int)
    df_trim['revLength'].astype(int)
    
    # -- float
    df_trim['avgRating'].astype(float)
    
    return df_trim

Now, I want to explicitly set types to all my columns as a better practice



In [4]:

    
# Explicitly casting column types in appFeatures dataframe
appFeatures = trimDataFrame(main_appData)


# -- boolean
appFeatures['hasPrivacy'].astype(bool)
appFeatures['hasDeveloperEmail'].astype(bool)
appFeatures['hasDeveloperWebsite'].astype(bool)

# -- integer
appFeatures['adjectiveCount'].astype(int)
appFeatures['countCapital'].astype(int)
appFeatures['installs'].astype(int)
appFeatures['revSent'].astype(int)
appFeatures['revLength'].astype(int)

# -- float
appFeatures['avgRating'].astype(float)


appFeatures









    Out[4]:






  
    
      
      adjectiveCount
      hasPrivacy
      revLength
      countCapital
      hasDeveloperWebsite
      installs
      hasDeveloperEmail
      avgRating
      revSent
    
  
  
    
      0 
        4
        True
        601
        1
        True
       30000000
        True
       4.051
       -3
    
    
      1 
       13
        True
       1139
       11
        True
       30000000
        True
       4.351
        2
    
    
      2 
       23
        True
       2223
       20
        True
        3000000
       False
       4.555
       -4
    
    
      3 
       10
       False
        804
        5
        True
       30000000
        True
       4.623
        8
    
    
      4 
       22
        True
       1867
       16
        True
        7500000
       False
       4.046
      -11
    
    
      5 
       18
       False
       1162
        6
        True
       30000000
        True
       4.595
        1
    
    
      6 
       18
        True
       1522
       60
        True
       30000000
        True
       4.526
       -4
    
    
      7 
       13
       False
       1895
       19
        True
       30000000
       False
       4.039
       -5
    
    
      8 
       11
        True
       1195
       10
        True
        3000000
        True
       4.400
       -2
    
    
      9 
       19
        True
       1488
       11
        True
         300000
        True
       3.935
       -4
    
    
      10
       18
       False
       1864
       35
        True
        3000000
        True
       4.075
       -5
    
    
      11
       19
       False
       2049
       14
        True
         750000
       False
       3.983
       -2
    
    
      12
        8
       False
        417
        2
        True
       30000000
        True
       4.238
        1
    
    
      13
       16
       False
       1276
       11
        True
        3000000
        True
       3.915
       -3
    
    
      14
       13
       False
       1210
       12
        True
         750000
        True
       4.050
       -3
    
    
      15
       20
        True
       2038
       24
        True
         750000
        True
       3.795
       -7
    
    
      16
       12
       False
       1044
       10
        True
        7500000
        True
       3.997
        1
    
    
      17
       15
        True
       1245
       13
        True
        3000000
        True
       3.212
       -5
    
    
      18
        2
       False
        225
        2
        True
        3000000
        True
       2.611
       -1
    
    
      19
       15
       False
       1120
       10
        True
       30000000
        True
       4.547
       -3
    
    
      20
       22
        True
       1406
       11
        True
        3000000
        True
       2.671
        4
    
    
      21
       13
        True
       1063
       10
        True
         750000
       False
       4.045
       -3
    
    
      22
        7
       False
        855
       10
        True
         750000
        True
       3.555
       -9
    
    
      23
       17
        True
       2147
       38
        True
        3000000
        True
       4.590
       -5
    
    
      24
       12
        True
       1189
       13
        True
        7500000
        True
       4.258
       -9
    
    
      25
       19
       False
       1804
        8
        True
         750000
        True
       4.428
      -10
    
    
      26
       26
       False
       1514
        5
        True
        3000000
        True
       4.401
        5
    
    
      27
       12
        True
       1272
       12
        True
        3000000
        True
       4.275
       -6
    
    
      28
       17
        True
       1413
       14
        True
         750000
        True
       4.149
       -8
    
    
      29
        7
        True
        610
        2
        True
         300000
        True
       4.396
       -3
    
    
      30
       14
        True
       1145
        5
        True
        3000000
        True
       4.113
        2
    
    
      31
       14
       False
       1413
       14
        True
          30000
        True
       4.240
        2
    
    
      32
        2
        True
        573
       14
        True
         300000
        True
       4.241
       -4
    
    
      33
       12
        True
       1387
       16
        True
        3000000
        True
       3.989
       -6
    
    
      34
       19
        True
       1336
        7
        True
          75000
        True
       4.310
       -7
    
    
      35
        7
        True
        817
        5
        True
        3000000
        True
       4.451
        5
    
    
      36
       29
        True
       2205
       16
        True
         300000
        True
       3.916
       -9
    
    
      37
        7
       False
        541
        3
        True
         300000
        True
       4.761
        1
    
    
      38
        6
       False
        310
        1
        True
         300000
        True
       4.158
        0
    
    
      39
        6
       False
        880
        4
        True
         300000
        True
       2.972
       -4
    
    
      40
        5
       False
        583
        4
        True
         300000
        True
       3.903
       -6
    
    
      41
       19
       False
       1888
       20
        True
        3000000
        True
       3.433
       -9
    
    
      42
       13
       False
       1122
        4
        True
        3000000
        True
       4.412
       -3
    
    
      43
       15
        True
       1613
       21
        True
        3000000
       False
       4.461
       15
    
    
      44
        5
       False
       1014
       11
       False
         300000
        True
       3.564
       -2
    
    
      45
        8
       False
       1115
        6
       False
        3000000
        True
       4.131
        5
    
    
      46
        6
        True
        437
        8
        True
          30000
        True
       3.550
       -1
    
    
      47
        2
        True
        307
        0
        True
        3000000
        True
       4.435
        1
    
    
      48
       13
        True
        991
        4
        True
         300000
        True
       4.233
       -3
    
    
      49
        7
       False
        578
        6
        True
        3000000
        True
       3.975
        1
    
    
      50
       10
        True
        962
        7
        True
         300000
        True
       3.926
        4
    
    
      51
       13
       False
       1267
        6
        True
        3000000
        True
       4.590
        2
    
    
      52
        9
       False
       1300
        8
        True
         300000
        True
       3.601
       -9
    
    
      53
       16
       False
       1051
       12
        True
         300000
        True
       3.701
        3
    
    
      54
       16
        True
       1822
       10
        True
         300000
        True
       2.931
      -13
    
    
      55
        7
       False
        535
        0
        True
        3000000
        True
       4.564
        0
    
    
      56
       11
       False
       1075
        8
        True
        7500000
        True
       4.179
       -7
    
    
      57
        6
        True
        691
        4
        True
        3000000
        True
       4.466
        7
    
    
      58
        7
        True
        991
        5
        True
         750000
       False
       4.340
       -3
    
    
      59
        7
       False
        805
       14
        True
         300000
        True
       4.539
        3
    
    
      
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
  

323 rows × 9 columns

Understanding the Data

Histograms



In [5]:

    
appFeatures['adjectiveCount'].hist(color=['#af9ecb'])









    Out[5]:





<matplotlib.axes.AxesSubplot at 0x109c89750>



In [6]:

    
appFeatures['avgRating'].hist(color=['#af9ecb'])









    Out[6]:





<matplotlib.axes.AxesSubplot at 0x107e46690>



In [7]:

    
appFeatures['countCapital'].hist(color=['#af9ecb'])









    Out[7]:





<matplotlib.axes.AxesSubplot at 0x109f1b3d0>



In [8]:

    
appFeatures['hasDeveloperEmail'].hist(color=['#af9ecb'])









    Out[8]:





<matplotlib.axes.AxesSubplot at 0x10a1286d0>



In [9]:

    
appFeatures['hasDeveloperWebsite'].hist(color=['#af9ecb'])









    Out[9]:





<matplotlib.axes.AxesSubplot at 0x10a138610>



In [10]:

    
appFeatures['hasPrivacy'].hist(color=['#af9ecb'])









    Out[10]:





<matplotlib.axes.AxesSubplot at 0x10a1d6a90>



In [11]:

    
appFeatures['installs'].hist(color=['#af9ecb'])









    Out[11]:





<matplotlib.axes.AxesSubplot at 0x10a729a90>



In [12]:

    
appFeatures['revSent'].hist(color=['#af9ecb'])









    Out[12]:





<matplotlib.axes.AxesSubplot at 0x10a7c5d10>



In [13]:

    
appFeatures['revLength'].hist(color=['#af9ecb'])









    Out[13]:





<matplotlib.axes.AxesSubplot at 0x10a7dcdd0>



In [14]:

    
appFeatures.describe()









    Out[14]:






  
    
      
      adjectiveCount
      hasPrivacy
      revLength
      countCapital
      hasDeveloperWebsite
      installs
      hasDeveloperEmail
      avgRating
      revSent
    
  
  
    
      count
       323.000000
             323
        323.000000
       323.000000
             323
       3.230000e+02
             323
       323.000000
       323.000000
    
    
      mean
        12.653251
       0.5263158
       1114.975232
        11.789474
       0.9071207
       2.634931e+07
       0.9287926
         4.141879
        -2.281734
    
    
      std
         6.520670
       0.5000817
        498.856574
        10.834108
       0.2907135
       8.654108e+07
         0.25757
         0.497807
         4.932892
    
    
      min
         0.000000
           False
         17.000000
         0.000000
           False
       3.000000e+02
           False
         1.000000
       -17.000000
    
    
      25%
         8.000000
               0
        748.500000
         6.000000
               1
       3.000000e+05
               1
         4.003500
        -5.000000
    
    
      50%
        13.000000
               1
       1123.000000
        10.000000
               1
       3.000000e+06
               1
         4.242000
        -2.000000
    
    
      75%
        17.000000
               1
       1458.500000
        15.000000
               1
       7.500000e+06
               1
         4.431000
         1.000000
    
    
      max
        41.000000
            True
       2454.000000
       109.000000
            True
       7.500000e+08
            True
         4.845000
        15.000000
    
  

8 rows × 9 columns

Variable Relations

Plots inspired from Graphical Representations of linear models notebook



In [15]:

    
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt



sns.set(palette="Purples_r")
np.random.seed(9221999)
mpl.rc("figure", figsize=(10, 10))

Review Sentiment vs Avg Rating



In [16]:

    
x = appFeatures['revSent'].astype(float)
y = appFeatures['avgRating'].astype(float)

sns.regplot(x,y)

Review Sentiment vs Review Length



In [17]:

    
x = appFeatures['revSent'].astype(float)
y = appFeatures['revLength'].astype(float)

sns.regplot(x,y)

Review Length vs Avg Rating



In [18]:

    
x = appFeatures['revLength'].astype(float)
y = appFeatures['revSent'].astype(float)

sns.regplot(x,y)

Pairwise correlation heatmaps



In [19]:

    
f, ax = plt.subplots(1, 1, figsize=(10, 10))
cmap = sns.blend_palette(["#00008B", "#6A5ACD", "#F0F8FF",
                          "#FFE6F8", "#C71585", "#8B0000"], as_cmap=True)


sns.corrplot(appFeatures, annot=False, diag_names=False, cmap=cmap, ax=ax);



In [20]:

    
sns.corrplot(appFeatures) #with values









    Out[20]:





<matplotlib.axes.AxesSubplot at 0x10cc9c9d0>



In [21]:

    
%doctest_mode









    



Exception reporting mode: Plain
Doctest mode is: ON

Unsupervised Learning

Scaling the feature vector



In [22]:

    
# get all the rows for unsupervised learning
appData_all = trimDataFrame(main_appData)

appData_all.head()









    Out[22]:





   adjectiveCount hasPrivacy  revLength  countCapital hasDeveloperWebsite  \
0               4       True        601             1                True   
1              13       True       1139            11                True   
2              23       True       2223            20                True   
3              10      False        804             5                True   
4              22       True       1867            16                True   

   installs hasDeveloperEmail  avgRating  revSent  
0  30000000              True      4.051       -3  
1  30000000              True      4.351        2  
2   3000000             False      4.555       -4  
3  30000000              True      4.623        8  
4   7500000             False      4.046      -11  

[5 rows x 9 columns]



In [23]:

    
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()

print(appData_all.columns)

# scale the dataframe
X_scaled = min_max_scaler.fit_transform(appData_all)

print (X_scaled)
print X_scaled.astype('float64')
Y = main_appData['appLabel']









    



Index([u'adjectiveCount', u'hasPrivacy', u'revLength', u'countCapital', u'hasDeveloperWebsite', u'installs', u'hasDeveloperEmail', u'avgRating', u'revSent'], dtype='object')
[[0 1 0 ..., 1 0.7934980494148245 0]
 [0 1 0 ..., 1 0.8715214564369311 0]
 [0 1 0 ..., 0 0.9245773732119635 0]
 ..., 
 [0 1 0 ..., 1 0.8905071521456436 0]
 [0 1 0 ..., 1 0.9193758127438232 0]
 [0 1 0 ..., 1 0.9243172951885565 0]]
[[ 0.          1.          0.         ...,  1.          0.79349805  0.        ]
 [ 0.          1.          0.         ...,  1.          0.87152146  0.        ]
 [ 0.          1.          0.         ...,  0.          0.92457737  0.        ]
 ..., 
 [ 0.          1.          0.         ...,  1.          0.89050715  0.        ]
 [ 0.          1.          0.         ...,  1.          0.91937581  0.        ]
 [ 0.          1.          0.         ...,  1.          0.9243173   0.        ]]






    



/Users/shreyas/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.py:278: UserWarning: MinMaxScaler assumes floating point values as input, got object
  "got %s" % (estimator, X.dtype))



In [24]:

    
# Change plot string for fair and unfair apps\n
def set_plot_symbol(fairness, fair_char = '.', unfair_char = '+'):
    symbol = ''
    if fairness == 'unfair':
        symbol = unfair_char
    elif fairness == 'fair':
        symbol = fair_char
    
    return symbol



In [41]:

    
# Code reference: http://scikit-learn.org/stable/auto_examples/manifold/plot_lle_digits.html#example-manifold-plot-lle-digits-py
#====================
#----------------------------------------------------------------------
# Scale and visualize the embedding vectors
def plot_embedding(X, title=None):
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)

    pl.figure()
    ax = pl.subplot(111)
    for i in range(X.shape[0]):
        pl.text(X[i, 0], X[i, 1], str(set_plot_symbol(Y[i])),
#                 color=pl.cm.Set1(y[i] / 2.),
                fontdict={'weight': 'bold', 'size': 9})


    pl.xticks([]), pl.yticks([])
    if title is not None:
        pl.title(title)



In [42]:

    
from time import time
import pylab as pl
from matplotlib import offsetbox
from sklearn import (manifold, datasets, decomposition, ensemble, lda,
                     random_projection)



In [43]:

    
print("Computing MDS embedding")
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
t0 = time()
X_mds = clf.fit_transform(X_scaled)
print("Done. Stress: %f" % clf.stress_)
plot_embedding(X_mds,
               "MDS embedding of the digits (time %.2fs)" %
               (time() - t0))









    



Computing MDS embedding
Done. Stress: 596.376730






    





<matplotlib.figure.Figure object at 0x10cc9c0d0>



In [44]:

    
#----------------------------------------------------------------------
# Projection on to the first 2 principal components

print("Computing PCA projection")
t0 = time()
X_pca = decomposition.TruncatedSVD(n_components=2).fit_transform(X_scaled)
plot_embedding(X_pca,
               "Principal Components projection of the digits (time %.2fs)" %
               (time() - t0))









    



Computing PCA projection






    





<matplotlib.figure.Figure object at 0x112b328d0>



In [29]:

    
#----------------------------------------------------------------------
# Projection on to the first 2 linear discriminant components


print("Computing LDA projection")
X_converted = X_scaled.astype('float64')
X2 = X_converted.copy()
X2.flat[::X_converted.shape[1] + 1] += 0.01  # Make X invertible
t0 = time()
X_lda = lda.LDA(n_components=2).fit_transform(X2, y)
plot_embedding(X_lda,
               "Linear Discriminant projection of the digits (time %.2fs)" %
               (time() - t0))









    



Computing LDA projection






    





<matplotlib.figure.Figure object at 0x110597590>



In [30]:

    
#----------------------------------------------------------------------
# Isomap projection of the digits dataset
n_neighbors = 10
print("Computing Isomap embedding")
t0 = time()
X_iso = manifold.Isomap(n_neighbors, n_components=2).fit_transform(X_scaled)
print("Done.")
plot_embedding(X_iso,
               "Isomap projection of the digits (time %.2fs)" %
               (time() - t0))









    



Computing Isomap embedding
Done.






    





<matplotlib.figure.Figure object at 0x112060390>



In [31]:

    
#----------------------------------------------------------------------
# Locally linear embedding of the digits dataset
print("Computing LLE embedding")
clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                      method='standard')
t0 = time()
X_lle = clf.fit_transform(X_scaled.astype('float64'))
print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
plot_embedding(X_lle,
               "Locally Linear Embedding of the digits (time %.2fs)" %
               (time() - t0))









    



Computing LLE embedding
Done. Reconstruction error: 1.99238e-17






    





<matplotlib.figure.Figure object at 0x111ee02d0>



In [32]:

    
#----------------------------------------------------------------------
# Modified Locally linear embedding of the digits dataset
print("Computing modified LLE embedding")
clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                      method='modified')
t0 = time()
X_mlle = clf.fit_transform(X_scaled.astype('float64'))
print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
plot_embedding(X_mlle,
               "Modified Locally Linear Embedding of the digits (time %.2fs)" %
               (time() - t0))









    



Computing modified LLE embedding
Done. Reconstruction error: 3.96675e-17






    





<matplotlib.figure.Figure object at 0x110597310>



In [33]:

    
#----------------------------------------------------------------------
# HLLE embedding of the digits dataset
print("Computing Hessian LLE embedding")
clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                      method='hessian')
t0 = time()
X_hlle = clf.fit_transform(X_scaled.astype('float64'))
print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
plot_embedding(X_hlle,
               "Hessian Locally Linear Embedding of the digits (time %.2fs)" %
               (time() - t0))









    



Computing Hessian LLE embedding
Done. Reconstruction error: -2.1499e-18






    





<matplotlib.figure.Figure object at 0x111e61e10>



In [34]:

    
#----------------------------------------------------------------------
# MDS  embedding of the digits dataset
print("Computing MDS embedding")
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
t0 = time()
X_mds = clf.fit_transform(X_scaled.astype('float64'))
print("Done. Stress: %f" % clf.stress_)
plot_embedding(X_mds,
               "MDS embedding of the digits (time %.2fs)" %
               (time() - t0))









    



Computing MDS embedding
Done. Stress: 1101.170364






    





<matplotlib.figure.Figure object at 0x111e66550>



In [35]:

    
#----------------------------------------------------------------------
# Random Trees embedding of the digits dataset
print("Computing Totally Random Trees embedding")
hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0,
                                       max_depth=5)
t0 = time()
X_transformed = hasher.fit_transform(X_scaled.astype('float64'))
pca = decomposition.TruncatedSVD(n_components=2)
X_reduced = pca.fit_transform(X_transformed)

plot_embedding(X_reduced,
               "Random forest embedding of the digits (time %.2fs)" %
               (time() - t0))









    



Computing Totally Random Trees embedding






    





<matplotlib.figure.Figure object at 0x111fe5990>



In [36]:

    
#----------------------------------------------------------------------
# Spectral embedding of the digits dataset
print("Computing Spectral embedding")
embedder = manifold.SpectralEmbedding(n_components=2, random_state=0,
                                      eigen_solver="arpack")
t0 = time()
X_se = embedder.fit_transform(X_scaled.astype('float64'))

plot_embedding(X_se,
               "Spectral embedding of the digits (time %.2fs)" %
               (time() - t0))

pl.show()









    



Computing Spectral embedding






    





<matplotlib.figure.Figure object at 0x111f74f50>



In [37]:

    
#
# ---------------K-means Clustering
#
from sklearn.cluster import KMeans

print("Computing K-means clustering")
kmeans = KMeans(init='k-means++', n_clusters=7, n_init=10)
t0 = time()
X_kmeans = kmeans.fit_transform(X_scaled)
# print("Done. Stress: %f" % clf.stress_)
plot_embedding(X_kmeans,
               "Kmeans clusters (time %.2fs)" %
               (time() - t0))









    



Computing K-means clustering






    





<matplotlib.figure.Figure object at 0x110461fd0>



In [38]:

    
"""
Code reference: http://stackoverflow.com/questions/2982929/plotting-results-of-hierarchical-clustering-ontop-of-a-matrix-of-data-in-python
"""

from __future__ import division


# dendrogram plotting from scipy book
import numpy as np
import matplotlib.pyplot as mpl
from mpl_toolkits.mplot3d import Axes3D
import scipy.cluster.hierarchy as sch
from sklearn.metrics.pairwise import pairwise_distances as pwd
import pylab

# compute the distance matrix
D = pwd(X_converted, metric='euclidean')

# compute and plot the first dendrogram
fig = pylab.figure(figsize=(8,8))
ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
Y_sch = sch.linkage(D, method="centroid")
Z1 = sch.dendrogram(Y_sch, orientation='right')
ax1.set_xticks([])
ax1.set_yticks([])


# compute the second cluster
ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
Y_sch = sch.linkage(D, method='single')
Z2 = sch.dendrogram(Y_sch)
ax2.set_xticks([])
ax2.set_yticks([])

# plot the distance matrix
axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
idx1 = Z1['leaves']
idx2 = Z2['leaves']
D = D[idx1, :]
D = D[:, idx2]

im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
axmatrix.set_xticks([])
axmatrix.set_yticks([])

#plot colorbar
axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
pylab.colorbar(im, cax=axcolor)
fig.show()
fig.savefig('dendrogram.png')









    



/Users/shreyas/anaconda/lib/python2.7/site-packages/matplotlib/figure.py:371: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure
  "matplotlib is currently using a non-GUI backend, "






    





<matplotlib.figure.Figure object at 0x1105b9d50>

Supervised Learning

To be able to assess our classifier better, We have resampled our data to include equal number of fair and unfair apps. We will shuffle the records and do 10-fold validation on it.



In [39]:

    
# first get the feature matrix
# from Luis's random sampling of the dataframe

appData

X_resampled = trimDataFrame(appData)

X_resampled

X_for_classifier =  min_max_scaler.fit_transform(X_resampled).astype('float64')

np.random.shuffle(X_for_classifier)

n_sample, n_features = X_for_classifier.shape



In [40]:

    
# setting up the cross validation
from sklearn import cross_validation
kf = cross_validation.KFold(n_sample, n_folds=3)



In [40]:

	adjectiveCount	hasPrivacy	revLength	countCapital	hasDeveloperWebsite	installs	hasDeveloperEmail	avgRating	revSent
0	4	True	601	1	True	30000000	True	4.051	-3
1	13	True	1139	11	True	30000000	True	4.351	2
2	23	True	2223	20	True	3000000	False	4.555	-4
3	10	False	804	5	True	30000000	True	4.623	8
4	22	True	1867	16	True	7500000	False	4.046	-11
5	18	False	1162	6	True	30000000	True	4.595	1
6	18	True	1522	60	True	30000000	True	4.526	-4
7	13	False	1895	19	True	30000000	False	4.039	-5
8	11	True	1195	10	True	3000000	True	4.400	-2
9	19	True	1488	11	True	300000	True	3.935	-4
10	18	False	1864	35	True	3000000	True	4.075	-5
11	19	False	2049	14	True	750000	False	3.983	-2
12	8	False	417	2	True	30000000	True	4.238	1
13	16	False	1276	11	True	3000000	True	3.915	-3
14	13	False	1210	12	True	750000	True	4.050	-3
15	20	True	2038	24	True	750000	True	3.795	-7
16	12	False	1044	10	True	7500000	True	3.997	1
17	15	True	1245	13	True	3000000	True	3.212	-5
18	2	False	225	2	True	3000000	True	2.611	-1
19	15	False	1120	10	True	30000000	True	4.547	-3
20	22	True	1406	11	True	3000000	True	2.671	4
21	13	True	1063	10	True	750000	False	4.045	-3
22	7	False	855	10	True	750000	True	3.555	-9
23	17	True	2147	38	True	3000000	True	4.590	-5
24	12	True	1189	13	True	7500000	True	4.258	-9
25	19	False	1804	8	True	750000	True	4.428	-10
26	26	False	1514	5	True	3000000	True	4.401	5
27	12	True	1272	12	True	3000000	True	4.275	-6
28	17	True	1413	14	True	750000	True	4.149	-8
29	7	True	610	2	True	300000	True	4.396	-3
30	14	True	1145	5	True	3000000	True	4.113	2
31	14	False	1413	14	True	30000	True	4.240	2
32	2	True	573	14	True	300000	True	4.241	-4
33	12	True	1387	16	True	3000000	True	3.989	-6
34	19	True	1336	7	True	75000	True	4.310	-7
35	7	True	817	5	True	3000000	True	4.451	5
36	29	True	2205	16	True	300000	True	3.916	-9
37	7	False	541	3	True	300000	True	4.761	1
38	6	False	310	1	True	300000	True	4.158	0
39	6	False	880	4	True	300000	True	2.972	-4
40	5	False	583	4	True	300000	True	3.903	-6
41	19	False	1888	20	True	3000000	True	3.433	-9
42	13	False	1122	4	True	3000000	True	4.412	-3
43	15	True	1613	21	True	3000000	False	4.461	15
44	5	False	1014	11	False	300000	True	3.564	-2
45	8	False	1115	6	False	3000000	True	4.131	5
46	6	True	437	8	True	30000	True	3.550	-1
47	2	True	307	0	True	3000000	True	4.435	1
48	13	True	991	4	True	300000	True	4.233	-3
49	7	False	578	6	True	3000000	True	3.975	1
50	10	True	962	7	True	300000	True	3.926	4
51	13	False	1267	6	True	3000000	True	4.590	2
52	9	False	1300	8	True	300000	True	3.601	-9
53	16	False	1051	12	True	300000	True	3.701	3
54	16	True	1822	10	True	300000	True	2.931	-13
55	7	False	535	0	True	3000000	True	4.564	0
56	11	False	1075	8	True	7500000	True	4.179	-7
57	6	True	691	4	True	3000000	True	4.466	7
58	7	True	991	5	True	750000	False	4.340	-3
59	7	False	805	14	True	300000	True	4.539	3
	...	...	...	...	...	...	...	...	...

	adjectiveCount	hasPrivacy	revLength	countCapital	hasDeveloperWebsite	installs	hasDeveloperEmail	avgRating	revSent
count	323.000000	323	323.000000	323.000000	323	3.230000e+02	323	323.000000	323.000000
mean	12.653251	0.5263158	1114.975232	11.789474	0.9071207	2.634931e+07	0.9287926	4.141879	-2.281734
std	6.520670	0.5000817	498.856574	10.834108	0.2907135	8.654108e+07	0.25757	0.497807	4.932892
min	0.000000	False	17.000000	0.000000	False	3.000000e+02	False	1.000000	-17.000000
25%	8.000000	0	748.500000	6.000000	1	3.000000e+05	1	4.003500	-5.000000
50%	13.000000	1	1123.000000	10.000000	1	3.000000e+06	1	4.242000	-2.000000
75%	17.000000	1	1458.500000	15.000000	1	7.500000e+06	1	4.431000	1.000000
max	41.000000	True	2454.000000	109.000000	True	7.500000e+08	True	4.845000	15.000000