Obidroid Learning Notebook

Loading App Data

I shall be using the previously exported exports/appFeatures.csv data for this notebook


In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

## Load the appFeatures file
main_appData = pd.read_csv('exports/appFeatures.csv')

In [2]:
# This cell creates a dataframe with the same amount of unfair and fair apps, where the fair apps are randomly selected
# TODO - Create a method that implements k-fold validation for app selection
import random

## create unfair apps dataframe and count
df_unfair = main_appData[main_appData.appLabel == 'unfair']
unfair_count = df_unfair.appLabel.count()

## get same number of random fair apps
df_randomly_fair = main_appData[main_appData.appLabel == 'fair'].ix[random.sample(main_appData[main_appData.appLabel == 'fair'].index, unfair_count)]
#print df_randomly_fair.appLabel.count()


# append the newly created dataframe of unfair * 2 rows
appData = df_randomly_fair.append(df_unfair)

# shuffle the dataframe
appData = appData.ix[np.random.permutation(appData.index)]

appData.columns


Out[2]:
Index([u'appName', u'adjectiveCount', u'avgRating', u'countCapital', u'exclamationCount', u'hasDeveloperEmail', u'hasDeveloperWebsite', u'Unnamed: 7', u'hasPrivacy', u'installs', u'price', u'revSent', u'revLength', u'appLabel'], dtype='object')

In [3]:
def trimDataFrame(df):
    """
    Lets create a new dataframe for appFeatures and appLabels
    """

    ## for App Features
    appCols = set(df.columns)
    appCols.remove('appName') # remove app Names column
    appCols.remove('Unnamed: 7') # removing a weird unnamed column
    appCols.remove('appLabel') # removing the label column
    appCols.remove('price') # removing price since most of the apps are free
    appCols.remove('exclamationCount') # remove exclamation count from features, as all values seemed to be 0
    
    df_trim = df[list(appCols)]
    
    # -- boolean
    df_trim['hasPrivacy'].astype(bool)
    df_trim['hasDeveloperEmail'].astype(bool)
    df_trim['hasDeveloperWebsite'].astype(bool)
    
    # -- integer
    df_trim['adjectiveCount'].astype(int)
    df_trim['countCapital'].astype(int)
    df_trim['installs'].astype(int)
    df_trim['revSent'].astype(int)
    df_trim['revLength'].astype(int)
    
    # -- float
    df_trim['avgRating'].astype(float)
    
    return df_trim

Now, I want to explicitly set types to all my columns as a better practice


In [4]:
# Explicitly casting column types in appFeatures dataframe
appFeatures = trimDataFrame(main_appData)


# -- boolean
appFeatures['hasPrivacy'].astype(bool)
appFeatures['hasDeveloperEmail'].astype(bool)
appFeatures['hasDeveloperWebsite'].astype(bool)

# -- integer
appFeatures['adjectiveCount'].astype(int)
appFeatures['countCapital'].astype(int)
appFeatures['installs'].astype(int)
appFeatures['revSent'].astype(int)
appFeatures['revLength'].astype(int)

# -- float
appFeatures['avgRating'].astype(float)


appFeatures


Out[4]:
adjectiveCount hasPrivacy revLength countCapital hasDeveloperWebsite installs hasDeveloperEmail avgRating revSent
0 4 True 601 1 True 30000000 True 4.051 -3
1 13 True 1139 11 True 30000000 True 4.351 2
2 23 True 2223 20 True 3000000 False 4.555 -4
3 10 False 804 5 True 30000000 True 4.623 8
4 22 True 1867 16 True 7500000 False 4.046 -11
5 18 False 1162 6 True 30000000 True 4.595 1
6 18 True 1522 60 True 30000000 True 4.526 -4
7 13 False 1895 19 True 30000000 False 4.039 -5
8 11 True 1195 10 True 3000000 True 4.400 -2
9 19 True 1488 11 True 300000 True 3.935 -4
10 18 False 1864 35 True 3000000 True 4.075 -5
11 19 False 2049 14 True 750000 False 3.983 -2
12 8 False 417 2 True 30000000 True 4.238 1
13 16 False 1276 11 True 3000000 True 3.915 -3
14 13 False 1210 12 True 750000 True 4.050 -3
15 20 True 2038 24 True 750000 True 3.795 -7
16 12 False 1044 10 True 7500000 True 3.997 1
17 15 True 1245 13 True 3000000 True 3.212 -5
18 2 False 225 2 True 3000000 True 2.611 -1
19 15 False 1120 10 True 30000000 True 4.547 -3
20 22 True 1406 11 True 3000000 True 2.671 4
21 13 True 1063 10 True 750000 False 4.045 -3
22 7 False 855 10 True 750000 True 3.555 -9
23 17 True 2147 38 True 3000000 True 4.590 -5
24 12 True 1189 13 True 7500000 True 4.258 -9
25 19 False 1804 8 True 750000 True 4.428 -10
26 26 False 1514 5 True 3000000 True 4.401 5
27 12 True 1272 12 True 3000000 True 4.275 -6
28 17 True 1413 14 True 750000 True 4.149 -8
29 7 True 610 2 True 300000 True 4.396 -3
30 14 True 1145 5 True 3000000 True 4.113 2
31 14 False 1413 14 True 30000 True 4.240 2
32 2 True 573 14 True 300000 True 4.241 -4
33 12 True 1387 16 True 3000000 True 3.989 -6
34 19 True 1336 7 True 75000 True 4.310 -7
35 7 True 817 5 True 3000000 True 4.451 5
36 29 True 2205 16 True 300000 True 3.916 -9
37 7 False 541 3 True 300000 True 4.761 1
38 6 False 310 1 True 300000 True 4.158 0
39 6 False 880 4 True 300000 True 2.972 -4
40 5 False 583 4 True 300000 True 3.903 -6
41 19 False 1888 20 True 3000000 True 3.433 -9
42 13 False 1122 4 True 3000000 True 4.412 -3
43 15 True 1613 21 True 3000000 False 4.461 15
44 5 False 1014 11 False 300000 True 3.564 -2
45 8 False 1115 6 False 3000000 True 4.131 5
46 6 True 437 8 True 30000 True 3.550 -1
47 2 True 307 0 True 3000000 True 4.435 1
48 13 True 991 4 True 300000 True 4.233 -3
49 7 False 578 6 True 3000000 True 3.975 1
50 10 True 962 7 True 300000 True 3.926 4
51 13 False 1267 6 True 3000000 True 4.590 2
52 9 False 1300 8 True 300000 True 3.601 -9
53 16 False 1051 12 True 300000 True 3.701 3
54 16 True 1822 10 True 300000 True 2.931 -13
55 7 False 535 0 True 3000000 True 4.564 0
56 11 False 1075 8 True 7500000 True 4.179 -7
57 6 True 691 4 True 3000000 True 4.466 7
58 7 True 991 5 True 750000 False 4.340 -3
59 7 False 805 14 True 300000 True 4.539 3
... ... ... ... ... ... ... ... ...

323 rows × 9 columns

Understanding the Data

Histograms


In [5]:
appFeatures['adjectiveCount'].hist(color=['#af9ecb'])


Out[5]:
<matplotlib.axes.AxesSubplot at 0x109c89750>

In [6]:
appFeatures['avgRating'].hist(color=['#af9ecb'])


Out[6]:
<matplotlib.axes.AxesSubplot at 0x107e46690>

In [7]:
appFeatures['countCapital'].hist(color=['#af9ecb'])


Out[7]:
<matplotlib.axes.AxesSubplot at 0x109f1b3d0>

In [8]:
appFeatures['hasDeveloperEmail'].hist(color=['#af9ecb'])


Out[8]:
<matplotlib.axes.AxesSubplot at 0x10a1286d0>

In [9]:
appFeatures['hasDeveloperWebsite'].hist(color=['#af9ecb'])


Out[9]:
<matplotlib.axes.AxesSubplot at 0x10a138610>

In [10]:
appFeatures['hasPrivacy'].hist(color=['#af9ecb'])


Out[10]:
<matplotlib.axes.AxesSubplot at 0x10a1d6a90>

In [11]:
appFeatures['installs'].hist(color=['#af9ecb'])


Out[11]:
<matplotlib.axes.AxesSubplot at 0x10a729a90>

In [12]:
appFeatures['revSent'].hist(color=['#af9ecb'])


Out[12]:
<matplotlib.axes.AxesSubplot at 0x10a7c5d10>

In [13]:
appFeatures['revLength'].hist(color=['#af9ecb'])


Out[13]:
<matplotlib.axes.AxesSubplot at 0x10a7dcdd0>

In [14]:
appFeatures.describe()


Out[14]:
adjectiveCount hasPrivacy revLength countCapital hasDeveloperWebsite installs hasDeveloperEmail avgRating revSent
count 323.000000 323 323.000000 323.000000 323 3.230000e+02 323 323.000000 323.000000
mean 12.653251 0.5263158 1114.975232 11.789474 0.9071207 2.634931e+07 0.9287926 4.141879 -2.281734
std 6.520670 0.5000817 498.856574 10.834108 0.2907135 8.654108e+07 0.25757 0.497807 4.932892
min 0.000000 False 17.000000 0.000000 False 3.000000e+02 False 1.000000 -17.000000
25% 8.000000 0 748.500000 6.000000 1 3.000000e+05 1 4.003500 -5.000000
50% 13.000000 1 1123.000000 10.000000 1 3.000000e+06 1 4.242000 -2.000000
75% 17.000000 1 1458.500000 15.000000 1 7.500000e+06 1 4.431000 1.000000
max 41.000000 True 2454.000000 109.000000 True 7.500000e+08 True 4.845000 15.000000

8 rows × 9 columns

Variable Relations


In [15]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt



sns.set(palette="Purples_r")
np.random.seed(9221999)
mpl.rc("figure", figsize=(10, 10))

Review Sentiment vs Avg Rating


In [16]:
x = appFeatures['revSent'].astype(float)
y = appFeatures['avgRating'].astype(float)

sns.regplot(x,y)


Review Sentiment vs Review Length


In [17]:
x = appFeatures['revSent'].astype(float)
y = appFeatures['revLength'].astype(float)

sns.regplot(x,y)


Review Length vs Avg Rating


In [18]:
x = appFeatures['revLength'].astype(float)
y = appFeatures['revSent'].astype(float)

sns.regplot(x,y)


Pairwise correlation heatmaps


In [19]:
f, ax = plt.subplots(1, 1, figsize=(10, 10))
cmap = sns.blend_palette(["#00008B", "#6A5ACD", "#F0F8FF",
                          "#FFE6F8", "#C71585", "#8B0000"], as_cmap=True)


sns.corrplot(appFeatures, annot=False, diag_names=False, cmap=cmap, ax=ax);



In [20]:
sns.corrplot(appFeatures) #with values


Out[20]:
<matplotlib.axes.AxesSubplot at 0x10cc9c9d0>


In [21]:
%doctest_mode


Exception reporting mode: Plain
Doctest mode is: ON

Unsupervised Learning

Scaling the feature vector


In [22]:
# get all the rows for unsupervised learning
appData_all = trimDataFrame(main_appData)

appData_all.head()


Out[22]:
   adjectiveCount hasPrivacy  revLength  countCapital hasDeveloperWebsite  \
0               4       True        601             1                True   
1              13       True       1139            11                True   
2              23       True       2223            20                True   
3              10      False        804             5                True   
4              22       True       1867            16                True   

   installs hasDeveloperEmail  avgRating  revSent  
0  30000000              True      4.051       -3  
1  30000000              True      4.351        2  
2   3000000             False      4.555       -4  
3  30000000              True      4.623        8  
4   7500000             False      4.046      -11  

[5 rows x 9 columns]

In [23]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()

print(appData_all.columns)

# scale the dataframe
X_scaled = min_max_scaler.fit_transform(appData_all)

print (X_scaled)
print X_scaled.astype('float64')
Y = main_appData['appLabel']


Index([u'adjectiveCount', u'hasPrivacy', u'revLength', u'countCapital', u'hasDeveloperWebsite', u'installs', u'hasDeveloperEmail', u'avgRating', u'revSent'], dtype='object')
[[0 1 0 ..., 1 0.7934980494148245 0]
 [0 1 0 ..., 1 0.8715214564369311 0]
 [0 1 0 ..., 0 0.9245773732119635 0]
 ..., 
 [0 1 0 ..., 1 0.8905071521456436 0]
 [0 1 0 ..., 1 0.9193758127438232 0]
 [0 1 0 ..., 1 0.9243172951885565 0]]
[[ 0.          1.          0.         ...,  1.          0.79349805  0.        ]
 [ 0.          1.          0.         ...,  1.          0.87152146  0.        ]
 [ 0.          1.          0.         ...,  0.          0.92457737  0.        ]
 ..., 
 [ 0.          1.          0.         ...,  1.          0.89050715  0.        ]
 [ 0.          1.          0.         ...,  1.          0.91937581  0.        ]
 [ 0.          1.          0.         ...,  1.          0.9243173   0.        ]]
/Users/shreyas/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.py:278: UserWarning: MinMaxScaler assumes floating point values as input, got object
  "got %s" % (estimator, X.dtype))

In [24]:
# Change plot string for fair and unfair apps\n
def set_plot_symbol(fairness, fair_char = '.', unfair_char = '+'):
    symbol = ''
    if fairness == 'unfair':
        symbol = unfair_char
    elif fairness == 'fair':
        symbol = fair_char
    
    return symbol

In [41]:
# Code reference: http://scikit-learn.org/stable/auto_examples/manifold/plot_lle_digits.html#example-manifold-plot-lle-digits-py
#====================
#----------------------------------------------------------------------
# Scale and visualize the embedding vectors
def plot_embedding(X, title=None):
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)

    pl.figure()
    ax = pl.subplot(111)
    for i in range(X.shape[0]):
        pl.text(X[i, 0], X[i, 1], str(set_plot_symbol(Y[i])),
#                 color=pl.cm.Set1(y[i] / 2.),
                fontdict={'weight': 'bold', 'size': 9})


    pl.xticks([]), pl.yticks([])
    if title is not None:
        pl.title(title)

In [42]:
from time import time
import pylab as pl
from matplotlib import offsetbox
from sklearn import (manifold, datasets, decomposition, ensemble, lda,
                     random_projection)

In [43]:
print("Computing MDS embedding")
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
t0 = time()
X_mds = clf.fit_transform(X_scaled)
print("Done. Stress: %f" % clf.stress_)
plot_embedding(X_mds,
               "MDS embedding of the digits (time %.2fs)" %
               (time() - t0))


Computing MDS embedding
Done. Stress: 596.376730
<matplotlib.figure.Figure object at 0x10cc9c0d0>

In [44]:
#----------------------------------------------------------------------
# Projection on to the first 2 principal components

print("Computing PCA projection")
t0 = time()
X_pca = decomposition.TruncatedSVD(n_components=2).fit_transform(X_scaled)
plot_embedding(X_pca,
               "Principal Components projection of the digits (time %.2fs)" %
               (time() - t0))


Computing PCA projection
<matplotlib.figure.Figure object at 0x112b328d0>

In [29]:
#----------------------------------------------------------------------
# Projection on to the first 2 linear discriminant components


print("Computing LDA projection")
X_converted = X_scaled.astype('float64')
X2 = X_converted.copy()
X2.flat[::X_converted.shape[1] + 1] += 0.01  # Make X invertible
t0 = time()
X_lda = lda.LDA(n_components=2).fit_transform(X2, y)
plot_embedding(X_lda,
               "Linear Discriminant projection of the digits (time %.2fs)" %
               (time() - t0))


Computing LDA projection
<matplotlib.figure.Figure object at 0x110597590>

In [30]:
#----------------------------------------------------------------------
# Isomap projection of the digits dataset
n_neighbors = 10
print("Computing Isomap embedding")
t0 = time()
X_iso = manifold.Isomap(n_neighbors, n_components=2).fit_transform(X_scaled)
print("Done.")
plot_embedding(X_iso,
               "Isomap projection of the digits (time %.2fs)" %
               (time() - t0))


Computing Isomap embedding
Done.
<matplotlib.figure.Figure object at 0x112060390>

In [31]:
#----------------------------------------------------------------------
# Locally linear embedding of the digits dataset
print("Computing LLE embedding")
clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                      method='standard')
t0 = time()
X_lle = clf.fit_transform(X_scaled.astype('float64'))
print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
plot_embedding(X_lle,
               "Locally Linear Embedding of the digits (time %.2fs)" %
               (time() - t0))


Computing LLE embedding
Done. Reconstruction error: 1.99238e-17
<matplotlib.figure.Figure object at 0x111ee02d0>

In [32]:
#----------------------------------------------------------------------
# Modified Locally linear embedding of the digits dataset
print("Computing modified LLE embedding")
clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                      method='modified')
t0 = time()
X_mlle = clf.fit_transform(X_scaled.astype('float64'))
print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
plot_embedding(X_mlle,
               "Modified Locally Linear Embedding of the digits (time %.2fs)" %
               (time() - t0))


Computing modified LLE embedding
Done. Reconstruction error: 3.96675e-17
<matplotlib.figure.Figure object at 0x110597310>

In [33]:
#----------------------------------------------------------------------
# HLLE embedding of the digits dataset
print("Computing Hessian LLE embedding")
clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                      method='hessian')
t0 = time()
X_hlle = clf.fit_transform(X_scaled.astype('float64'))
print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
plot_embedding(X_hlle,
               "Hessian Locally Linear Embedding of the digits (time %.2fs)" %
               (time() - t0))


Computing Hessian LLE embedding
Done. Reconstruction error: -2.1499e-18
<matplotlib.figure.Figure object at 0x111e61e10>

In [34]:
#----------------------------------------------------------------------
# MDS  embedding of the digits dataset
print("Computing MDS embedding")
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
t0 = time()
X_mds = clf.fit_transform(X_scaled.astype('float64'))
print("Done. Stress: %f" % clf.stress_)
plot_embedding(X_mds,
               "MDS embedding of the digits (time %.2fs)" %
               (time() - t0))


Computing MDS embedding
Done. Stress: 1101.170364
<matplotlib.figure.Figure object at 0x111e66550>

In [35]:
#----------------------------------------------------------------------
# Random Trees embedding of the digits dataset
print("Computing Totally Random Trees embedding")
hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0,
                                       max_depth=5)
t0 = time()
X_transformed = hasher.fit_transform(X_scaled.astype('float64'))
pca = decomposition.TruncatedSVD(n_components=2)
X_reduced = pca.fit_transform(X_transformed)

plot_embedding(X_reduced,
               "Random forest embedding of the digits (time %.2fs)" %
               (time() - t0))


Computing Totally Random Trees embedding
<matplotlib.figure.Figure object at 0x111fe5990>

In [36]:
#----------------------------------------------------------------------
# Spectral embedding of the digits dataset
print("Computing Spectral embedding")
embedder = manifold.SpectralEmbedding(n_components=2, random_state=0,
                                      eigen_solver="arpack")
t0 = time()
X_se = embedder.fit_transform(X_scaled.astype('float64'))

plot_embedding(X_se,
               "Spectral embedding of the digits (time %.2fs)" %
               (time() - t0))

pl.show()


Computing Spectral embedding
<matplotlib.figure.Figure object at 0x111f74f50>

In [37]:
#
# ---------------K-means Clustering
#
from sklearn.cluster import KMeans

print("Computing K-means clustering")
kmeans = KMeans(init='k-means++', n_clusters=7, n_init=10)
t0 = time()
X_kmeans = kmeans.fit_transform(X_scaled)
# print("Done. Stress: %f" % clf.stress_)
plot_embedding(X_kmeans,
               "Kmeans clusters (time %.2fs)" %
               (time() - t0))


Computing K-means clustering
<matplotlib.figure.Figure object at 0x110461fd0>

In [38]:
"""
Code reference: http://stackoverflow.com/questions/2982929/plotting-results-of-hierarchical-clustering-ontop-of-a-matrix-of-data-in-python
"""

from __future__ import division


# dendrogram plotting from scipy book
import numpy as np
import matplotlib.pyplot as mpl
from mpl_toolkits.mplot3d import Axes3D
import scipy.cluster.hierarchy as sch
from sklearn.metrics.pairwise import pairwise_distances as pwd
import pylab

# compute the distance matrix
D = pwd(X_converted, metric='euclidean')

# compute and plot the first dendrogram
fig = pylab.figure(figsize=(8,8))
ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
Y_sch = sch.linkage(D, method="centroid")
Z1 = sch.dendrogram(Y_sch, orientation='right')
ax1.set_xticks([])
ax1.set_yticks([])


# compute the second cluster
ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
Y_sch = sch.linkage(D, method='single')
Z2 = sch.dendrogram(Y_sch)
ax2.set_xticks([])
ax2.set_yticks([])

# plot the distance matrix
axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
idx1 = Z1['leaves']
idx2 = Z2['leaves']
D = D[idx1, :]
D = D[:, idx2]

im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
axmatrix.set_xticks([])
axmatrix.set_yticks([])

#plot colorbar
axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
pylab.colorbar(im, cax=axcolor)
fig.show()
fig.savefig('dendrogram.png')


/Users/shreyas/anaconda/lib/python2.7/site-packages/matplotlib/figure.py:371: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure
  "matplotlib is currently using a non-GUI backend, "
<matplotlib.figure.Figure object at 0x1105b9d50>

Supervised Learning

To be able to assess our classifier better, We have resampled our data to include equal number of fair and unfair apps. We will shuffle the records and do 10-fold validation on it.


In [39]:
# first get the feature matrix
# from Luis's random sampling of the dataframe

appData

X_resampled = trimDataFrame(appData)

X_resampled

X_for_classifier =  min_max_scaler.fit_transform(X_resampled).astype('float64')

np.random.shuffle(X_for_classifier)

n_sample, n_features = X_for_classifier.shape

In [40]:
# setting up the cross validation
from sklearn import cross_validation
kf = cross_validation.KFold(n_sample, n_folds=3)

In [40]: