Obidroid Multi Dimensional Scaling (MDS) Notebook

Loading App Data

I shall be using the previously exported exports/appFeatures.csv data for this notebook


In [1110]:
%pylab --no-import-all inline


Populating the interactive namespace from numpy and matplotlib

In [1111]:
%doctest_mode


Exception reporting mode: Context
Doctest mode is: OFF

In [1112]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

## Load the appFeatures file
main_appData = pd.read_csv('exports/appFeatures.csv')

In [1113]:
# This cell creates a dataframe with the same amount of unfair and fair apps, where the fair apps are randomly selected
# TODO - Create a method that implements k-fold validation for app selection
import random

## create unfair apps dataframe and count
df_unfair = main_appData[main_appData.appLabel == 'unfair']
unfair_count = df_unfair.appLabel.count()

## get same number of random fair apps
df_randomly_fair = main_appData[main_appData.appLabel == 'fair'].ix[random.sample(main_appData[main_appData.appLabel == 'fair'].index, unfair_count)]
#print df_randomly_fair.appLabel.count()


# append the newly created dataframe of unfair * 2 rows
appData = df_randomly_fair.append(df_unfair)

# shuffle the dataframe
appData = appData.ix[np.random.permutation(appData.index)]

main_appData.columns


Out[1113]:
Index([u'appName', u'adjectiveCount', u'avgRating', u'countCapital', u'exclamationCount', u'hasDeveloperEmail', u'hasDeveloperWebsite', u'Unnamed: 7', u'hasPrivacy', u'installs', u'price', u'revSent', u'revLength', u'appLabel'], dtype='object')

In [1114]:
def trimDataFrame(df):
    """
    Lets create a new dataframe for appFeatures and appLabels
    """

    ## for App Features
    appCols = set(df.columns)
    appCols.remove('appName') # remove app Names column
    appCols.remove('Unnamed: 7') # removing a weird unnamed column
    appCols.remove('appLabel') # removing the label column
    appCols.remove('price') # removing price since most of the apps are free
    appCols.remove('exclamationCount') # remove exclamation count from features, as all values seemed to be 0
    
    df_trim = df[list(appCols)]
    
    # -- boolean
    df_trim['hasPrivacy'].astype(bool)
    df_trim['hasDeveloperEmail'].astype(bool)
    df_trim['hasDeveloperWebsite'].astype(bool)
    
    # -- integer
    df_trim['adjectiveCount'].astype(int)
    df_trim['countCapital'].astype(int)
    df_trim['installs'].astype(int)
    df_trim['revSent'].astype(int)
    df_trim['revLength'].astype(int)
    
    # -- float
    df_trim['avgRating'].astype(float)
    
    return df_trim

Now, I want to explicitly set types to all my columns as a better practice


In [1115]:
# Explicitly casting column types in appFeatures dataframe
appFeatures = trimDataFrame(main_appData)


# -- boolean
appFeatures['hasPrivacy'].astype(bool)
appFeatures['hasDeveloperEmail'].astype(bool)
appFeatures['hasDeveloperWebsite'].astype(bool)

# -- integer
appFeatures['adjectiveCount'].astype(int)
appFeatures['countCapital'].astype(int)
appFeatures['installs'].astype(int)
appFeatures['revSent'].astype(int)
appFeatures['revLength'].astype(int)

# -- float
appFeatures['avgRating'].astype(float)


appFeatures


Out[1115]:
adjectiveCount hasPrivacy revLength countCapital hasDeveloperWebsite installs hasDeveloperEmail avgRating revSent
0 4 True 601 1 True 30000000 True 4.051 -3
1 13 True 1139 11 True 30000000 True 4.351 2
2 23 True 2223 20 True 3000000 False 4.555 -4
3 10 False 804 5 True 30000000 True 4.623 8
4 22 True 1867 16 True 7500000 False 4.046 -11
5 18 False 1162 6 True 30000000 True 4.595 1
6 18 True 1522 60 True 30000000 True 4.526 -4
7 13 False 1895 19 True 30000000 False 4.039 -5
8 11 True 1195 10 True 3000000 True 4.400 -2
9 19 True 1488 11 True 300000 True 3.935 -4
10 18 False 1864 35 True 3000000 True 4.075 -5
11 19 False 2049 14 True 750000 False 3.983 -2
12 8 False 417 2 True 30000000 True 4.238 1
13 16 False 1276 11 True 3000000 True 3.915 -3
14 13 False 1210 12 True 750000 True 4.050 -3
15 20 True 2038 24 True 750000 True 3.795 -7
16 12 False 1044 10 True 7500000 True 3.997 1
17 15 True 1245 13 True 3000000 True 3.212 -5
18 2 False 225 2 True 3000000 True 2.611 -1
19 15 False 1120 10 True 30000000 True 4.547 -3
20 22 True 1406 11 True 3000000 True 2.671 4
21 13 True 1063 10 True 750000 False 4.045 -3
22 7 False 855 10 True 750000 True 3.555 -9
23 17 True 2147 38 True 3000000 True 4.590 -5
24 12 True 1189 13 True 7500000 True 4.258 -9
25 19 False 1804 8 True 750000 True 4.428 -10
26 26 False 1514 5 True 3000000 True 4.401 5
27 12 True 1272 12 True 3000000 True 4.275 -6
28 17 True 1413 14 True 750000 True 4.149 -8
29 7 True 610 2 True 300000 True 4.396 -3
30 14 True 1145 5 True 3000000 True 4.113 2
31 14 False 1413 14 True 30000 True 4.240 2
32 2 True 573 14 True 300000 True 4.241 -4
33 12 True 1387 16 True 3000000 True 3.989 -6
34 19 True 1336 7 True 75000 True 4.310 -7
35 7 True 817 5 True 3000000 True 4.451 5
36 29 True 2205 16 True 300000 True 3.916 -9
37 7 False 541 3 True 300000 True 4.761 1
38 6 False 310 1 True 300000 True 4.158 0
39 6 False 880 4 True 300000 True 2.972 -4
40 5 False 583 4 True 300000 True 3.903 -6
41 19 False 1888 20 True 3000000 True 3.433 -9
42 13 False 1122 4 True 3000000 True 4.412 -3
43 15 True 1613 21 True 3000000 False 4.461 15
44 5 False 1014 11 False 300000 True 3.564 -2
45 8 False 1115 6 False 3000000 True 4.131 5
46 6 True 437 8 True 30000 True 3.550 -1
47 2 True 307 0 True 3000000 True 4.435 1
48 13 True 991 4 True 300000 True 4.233 -3
49 7 False 578 6 True 3000000 True 3.975 1
50 10 True 962 7 True 300000 True 3.926 4
51 13 False 1267 6 True 3000000 True 4.590 2
52 9 False 1300 8 True 300000 True 3.601 -9
53 16 False 1051 12 True 300000 True 3.701 3
54 16 True 1822 10 True 300000 True 2.931 -13
55 7 False 535 0 True 3000000 True 4.564 0
56 11 False 1075 8 True 7500000 True 4.179 -7
57 6 True 691 4 True 3000000 True 4.466 7
58 7 True 991 5 True 750000 False 4.340 -3
59 7 False 805 14 True 300000 True 4.539 3
... ... ... ... ... ... ... ... ...

323 rows × 9 columns


Controls


In [1115]:


Unsupervised Learning

Scaling the feature vector


In [1116]:
# get all the rows for unsupervised learning
appData_all = trimDataFrame(main_appData)

appData_all.head()


Out[1116]:
adjectiveCount hasPrivacy revLength countCapital hasDeveloperWebsite installs hasDeveloperEmail avgRating revSent
0 4 True 601 1 True 30000000 True 4.051 -3
1 13 True 1139 11 True 30000000 True 4.351 2
2 23 True 2223 20 True 3000000 False 4.555 -4
3 10 False 804 5 True 30000000 True 4.623 8
4 22 True 1867 16 True 7500000 False 4.046 -11

5 rows × 9 columns


In [1117]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()

print(appData_all.columns)
#print appData_all

# scale the dataframe
X_scaled = min_max_scaler.fit_transform(appData_all)


Index([u'adjectiveCount', u'hasPrivacy', u'revLength', u'countCapital', u'hasDeveloperWebsite', u'installs', u'hasDeveloperEmail', u'avgRating', u'revSent'], dtype='object')

In [1118]:
from time import time
import pylab as pl
from matplotlib import offsetbox
from sklearn import (manifold, datasets, decomposition, ensemble, lda,
                     random_projection)
from mpld3 import enable_notebook
from mpld3 import plugins
enable_notebook()

In [1119]:
# Plot D3 scatter
def plot_d3(X, title=None):
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)
    
    fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'))
    fig.set_figwidth(12)
    fig.set_figheight(8)

    scatter = ax.scatter(X[:,0],
                         X[:,1],
                         c=['red' if main_appData.iloc[i]['appLabel'] == 'unfair' else 'green' for i in range(X.shape[0])],
                         s=75,
                         alpha=0.2,
                         cmap=plt.cm.jet)
    ax.grid(color='white', linestyle='solid')

    ax.set_title("Scatter Plot of Unfair/Fair Apps", size=24)

    labels = [main_appData.iloc[i]['appName'].decode('ascii', 'replace') for i in range(X.shape[0])]
    tooltip = plugins.PointLabelTooltip(scatter, labels=labels)
    plugins.connect(fig, tooltip)

In [1120]:
print("Computing MDS embedding and plotting with D3")
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100, verbose=1)
t0 = time()

X_mds = clf.fit_transform(X_scaled)
print("Done. Stress: %f" % clf.stress_)
plot_d3(X_mds,
               "MDS embedding of the digits (time %.2fs)" %
               (time() - t0))


Computing MDS embedding and plotting with D3
Done. Stress: 546.646566

In [1121]:
# Change plot string for fair and unfair apps using the main_appdata dataframe set in a previous cell
def set_plot_symbol(idx, fair_char = '.', unfair_char = '+'):
    symbol = ''
    if main_appData.iloc[idx]['appLabel'] == 'unfair':
        symbol = str(idx)
    elif main_appData.iloc[idx]['appLabel'] == 'fair':
        symbol = fair_char
    
    return symbol

In [1122]:
# Code reference: http://scikit-learn.org/stable/auto_examples/manifold/plot_lle_digits.html#example-manifold-plot-lle-digits-py
#====================
#----------------------------------------------------------------------
# Scale and visualize the embedding vectors
def plot_embedding(X, title=None):
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)
    
    #pl.figure(figsize=(8, 6))
    pl.figure()
    ax = pl.subplot(1, 1, 1)
    for i in range(X.shape[0]):
        pl.text(X[i, 0], X[i, 1], str(set_plot_symbol(i)),
                #color=pl.cm.Set1(y[i] / 2.),
                fontdict={'weight': 'bold', 'size': 12})


    pl.xticks([]), pl.yticks([])
    if title is not None:
        pl.title(title)

In [1123]:
print("Computing MDS embedding")
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100, verbose=1)
t0 = time()

X_mds = clf.fit_transform(X_scaled)
print("Done. Stress: %f" % clf.stress_)
#plot_embedding(X_mds,
#               "MDS embedding of the digits (time %.2fs)" %
#               (time() - t0))

#print clf.get_params(True)


Computing MDS embedding
breaking at iteration 83 with stress 784.894512328
Done. Stress: 784.894512

In [1124]:
#----------------------------------------------------------------------
# MDS  embedding of the digits dataset
print("Computing MDS embedding")
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
t0 = time()
X_mds = clf.fit_transform(X_scaled.astype('float64'))
print("Done. Stress: %f" % clf.stress_)
#plot_embedding(X_mds,
#               "MDS embedding of the digits (time %.2fs)" %
#               (time() - t0))


Computing MDS embedding
Done. Stress: 814.870262