Obidroid Multi Dimensional Scaling (MDS) Notebook

Loading App Data

I shall be using the previously exported exports/appFeatures.csv data for this notebook


In [1]:
%pylab --no-import-all inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
%doctest_mode


Exception reporting mode: Plain
Doctest mode is: ON

In [3]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

## Load the appFeatures file
main_appData = pd.read_csv('exports/appFeatures.csv')

In [4]:
# This cell creates a dataframe with the same amount of unfair and fair apps, where the fair apps are randomly selected
# TODO - Create a method that implements k-fold validation for app selection
import random

## create unfair apps dataframe and count
df_unfair = main_appData[main_appData.appLabel == 'unfair']
unfair_count = df_unfair.appLabel.count()

## get same number of random fair apps
df_randomly_fair = main_appData[main_appData.appLabel == 'fair'].ix[random.sample(main_appData[main_appData.appLabel == 'fair'].index, unfair_count)]
#print df_randomly_fair.appLabel.count()


# append the newly created dataframe of unfair * 2 rows
appData = df_randomly_fair.append(df_unfair)

# shuffle the dataframe
appData = appData.ix[np.random.permutation(appData.index)]

main_appData.columns


Out[4]:
Index([u'appName', u'adjectiveCount', u'avgRating', u'countCapital', u'exclamationCount', u'hasDeveloperEmail', u'hasDeveloperWebsite', u'Unnamed: 7', u'hasPrivacy', u'installs', u'price', u'revSent', u'revLength', u'appLabel'], dtype='object')

In [5]:
def trimDataFrame(df):
    """
    Lets create a new dataframe for appFeatures and appLabels
    """

    ## for App Features
    appCols = set(df.columns)
    appCols.remove('appName') # remove app Names column
    appCols.remove('Unnamed: 7') # removing a weird unnamed column
    appCols.remove('appLabel') # removing the label column
    appCols.remove('price') # removing price since most of the apps are free
    appCols.remove('exclamationCount') # remove exclamation count from features, as all values seemed to be 0
    
    df_trim = df[list(appCols)]
    
    # -- boolean
    df_trim['hasPrivacy'].astype(bool)
    df_trim['hasDeveloperEmail'].astype(bool)
    df_trim['hasDeveloperWebsite'].astype(bool)
    
    # -- integer
    df_trim['adjectiveCount'].astype(int)
    df_trim['countCapital'].astype(int)
    df_trim['installs'].astype(int)
    df_trim['revSent'].astype(int)
    df_trim['revLength'].astype(int)
    
    # -- float
    df_trim['avgRating'].astype(float)
    
    return df_trim

Now, I want to explicitly set types to all my columns as a better practice


In [6]:
# Explicitly casting column types in appFeatures dataframe
appFeatures = trimDataFrame(main_appData)


# -- boolean
appFeatures['hasPrivacy'].astype(bool)
appFeatures['hasDeveloperEmail'].astype(bool)
appFeatures['hasDeveloperWebsite'].astype(bool)

# -- integer
appFeatures['adjectiveCount'].astype(int)
appFeatures['countCapital'].astype(int)
appFeatures['installs'].astype(int)
appFeatures['revSent'].astype(int)
appFeatures['revLength'].astype(int)

# -- float
appFeatures['avgRating'].astype(float)


appFeatures


Out[6]:
    adjectiveCount hasPrivacy  revLength  countCapital hasDeveloperWebsite  \
0                4       True        601             1                True   
1               13       True       1139            11                True   
2               23       True       2223            20                True   
3               10      False        804             5                True   
4               22       True       1867            16                True   
5               18      False       1162             6                True   
6               18       True       1522            60                True   
7               13      False       1895            19                True   
8               11       True       1195            10                True   
9               19       True       1488            11                True   
10              18      False       1864            35                True   
11              19      False       2049            14                True   
12               8      False        417             2                True   
13              16      False       1276            11                True   
14              13      False       1210            12                True   
15              20       True       2038            24                True   
16              12      False       1044            10                True   
17              15       True       1245            13                True   
18               2      False        225             2                True   
19              15      False       1120            10                True   
20              22       True       1406            11                True   
21              13       True       1063            10                True   
22               7      False        855            10                True   
23              17       True       2147            38                True   
24              12       True       1189            13                True   
25              19      False       1804             8                True   
26              26      False       1514             5                True   
27              12       True       1272            12                True   
28              17       True       1413            14                True   
29               7       True        610             2                True   
30              14       True       1145             5                True   
31              14      False       1413            14                True   
32               2       True        573            14                True   
33              12       True       1387            16                True   
34              19       True       1336             7                True   
35               7       True        817             5                True   
36              29       True       2205            16                True   
37               7      False        541             3                True   
38               6      False        310             1                True   
39               6      False        880             4                True   
40               5      False        583             4                True   
41              19      False       1888            20                True   
42              13      False       1122             4                True   
43              15       True       1613            21                True   
44               5      False       1014            11               False   
45               8      False       1115             6               False   
46               6       True        437             8                True   
47               2       True        307             0                True   
48              13       True        991             4                True   
49               7      False        578             6                True   
50              10       True        962             7                True   
51              13      False       1267             6                True   
52               9      False       1300             8                True   
53              16      False       1051            12                True   
54              16       True       1822            10                True   
55               7      False        535             0                True   
56              11      False       1075             8                True   
57               6       True        691             4                True   
58               7       True        991             5                True   
59               7      False        805            14                True   
               ...        ...        ...           ...                 ...   

    installs hasDeveloperEmail  avgRating  revSent  
0   30000000              True      4.051       -3  
1   30000000              True      4.351        2  
2    3000000             False      4.555       -4  
3   30000000              True      4.623        8  
4    7500000             False      4.046      -11  
5   30000000              True      4.595        1  
6   30000000              True      4.526       -4  
7   30000000             False      4.039       -5  
8    3000000              True      4.400       -2  
9     300000              True      3.935       -4  
10   3000000              True      4.075       -5  
11    750000             False      3.983       -2  
12  30000000              True      4.238        1  
13   3000000              True      3.915       -3  
14    750000              True      4.050       -3  
15    750000              True      3.795       -7  
16   7500000              True      3.997        1  
17   3000000              True      3.212       -5  
18   3000000              True      2.611       -1  
19  30000000              True      4.547       -3  
20   3000000              True      2.671        4  
21    750000             False      4.045       -3  
22    750000              True      3.555       -9  
23   3000000              True      4.590       -5  
24   7500000              True      4.258       -9  
25    750000              True      4.428      -10  
26   3000000              True      4.401        5  
27   3000000              True      4.275       -6  
28    750000              True      4.149       -8  
29    300000              True      4.396       -3  
30   3000000              True      4.113        2  
31     30000              True      4.240        2  
32    300000              True      4.241       -4  
33   3000000              True      3.989       -6  
34     75000              True      4.310       -7  
35   3000000              True      4.451        5  
36    300000              True      3.916       -9  
37    300000              True      4.761        1  
38    300000              True      4.158        0  
39    300000              True      2.972       -4  
40    300000              True      3.903       -6  
41   3000000              True      3.433       -9  
42   3000000              True      4.412       -3  
43   3000000             False      4.461       15  
44    300000              True      3.564       -2  
45   3000000              True      4.131        5  
46     30000              True      3.550       -1  
47   3000000              True      4.435        1  
48    300000              True      4.233       -3  
49   3000000              True      3.975        1  
50    300000              True      3.926        4  
51   3000000              True      4.590        2  
52    300000              True      3.601       -9  
53    300000              True      3.701        3  
54    300000              True      2.931      -13  
55   3000000              True      4.564        0  
56   7500000              True      4.179       -7  
57   3000000              True      4.466        7  
58    750000             False      4.340       -3  
59    300000              True      4.539        3  
         ...               ...        ...      ...  

[323 rows x 9 columns]

Unsupervised Learning

Scaling the feature vector


In [7]:
# get all the rows for unsupervised learning
appData_all = trimDataFrame(main_appData)

appData_all.head()


Out[7]:
   adjectiveCount hasPrivacy  revLength  countCapital hasDeveloperWebsite  \
0               4       True        601             1                True   
1              13       True       1139            11                True   
2              23       True       2223            20                True   
3              10      False        804             5                True   
4              22       True       1867            16                True   

   installs hasDeveloperEmail  avgRating  revSent  
0  30000000              True      4.051       -3  
1  30000000              True      4.351        2  
2   3000000             False      4.555       -4  
3  30000000              True      4.623        8  
4   7500000             False      4.046      -11  

[5 rows x 9 columns]

In [17]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()

print(appData_all.columns)
#print appData_all

# scale the dataframe
X_scaled = min_max_scaler.fit_transform(appData_all)


Index([u'adjectiveCount', u'hasPrivacy', u'revLength', u'countCapital', u'hasDeveloperWebsite', u'installs', u'hasDeveloperEmail', u'avgRating', u'revSent'], dtype='object')

In [18]:
from time import time
import pylab as pl
from matplotlib import offsetbox
from sklearn import (manifold, datasets, decomposition, ensemble, lda,
                     random_projection)
from mpld3 import enable_notebook
from mpld3 import plugins
enable_notebook()

In [19]:
# Plot D3 scatter
def plot_d3(X, title=None):
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)
    
    fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'))
    fig.set_figwidth(12)
    fig.set_figheight(8)

    scatter = ax.scatter(X[:,0],
                         X[:,1],
                         c=['red' if main_appData.iloc[i]['appLabel'] == 'unfair' else 'green' for i in range(X.shape[0])],
                         s=75,
                         alpha=0.2,
                         cmap=plt.cm.jet)
    ax.grid(color='white', linestyle='solid')

    ax.set_title("Scatter Plot of Unfair/Fair Apps", size=24)

    labels = [main_appData.iloc[i]['appName'].decode('ascii', 'replace') for i in range(X.shape[0])]
    tooltip = plugins.PointLabelTooltip(scatter, labels=labels)
    plugins.connect(fig, tooltip)

In [20]:
print("Computing MDS embedding and plotting with D3")
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100, verbose=1)
t0 = time()

X_mds = clf.fit_transform(X_scaled)
print("Done. Stress: %f" % clf.stress_)
plot_d3(X_mds,
               "MDS embedding of the digits (time %.2fs)" %
               (time() - t0))


Computing MDS embedding and plotting with D3
Done. Stress: 683.029781
<matplotlib.figure.Figure object at 0x103801290>

In [ ]: