Obidroid Multi Dimensional Scaling (MDS) Notebook

Loading App Data

I shall be using the previously exported exports/appFeatures.csv data for this notebook



In [1110]:

    
%pylab --no-import-all inline









    



Populating the interactive namespace from numpy and matplotlib



In [1111]:

    
%doctest_mode









    



Exception reporting mode: Context
Doctest mode is: OFF



In [1112]:

    
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

## Load the appFeatures file
main_appData = pd.read_csv('exports/appFeatures.csv')



In [1113]:

    
# This cell creates a dataframe with the same amount of unfair and fair apps, where the fair apps are randomly selected
# TODO - Create a method that implements k-fold validation for app selection
import random

## create unfair apps dataframe and count
df_unfair = main_appData[main_appData.appLabel == 'unfair']
unfair_count = df_unfair.appLabel.count()

## get same number of random fair apps
df_randomly_fair = main_appData[main_appData.appLabel == 'fair'].ix[random.sample(main_appData[main_appData.appLabel == 'fair'].index, unfair_count)]
#print df_randomly_fair.appLabel.count()


# append the newly created dataframe of unfair * 2 rows
appData = df_randomly_fair.append(df_unfair)

# shuffle the dataframe
appData = appData.ix[np.random.permutation(appData.index)]

main_appData.columns









    Out[1113]:





Index([u'appName', u'adjectiveCount', u'avgRating', u'countCapital', u'exclamationCount', u'hasDeveloperEmail', u'hasDeveloperWebsite', u'Unnamed: 7', u'hasPrivacy', u'installs', u'price', u'revSent', u'revLength', u'appLabel'], dtype='object')



In [1114]:

    
def trimDataFrame(df):
    """
    Lets create a new dataframe for appFeatures and appLabels
    """

    ## for App Features
    appCols = set(df.columns)
    appCols.remove('appName') # remove app Names column
    appCols.remove('Unnamed: 7') # removing a weird unnamed column
    appCols.remove('appLabel') # removing the label column
    appCols.remove('price') # removing price since most of the apps are free
    appCols.remove('exclamationCount') # remove exclamation count from features, as all values seemed to be 0
    
    df_trim = df[list(appCols)]
    
    # -- boolean
    df_trim['hasPrivacy'].astype(bool)
    df_trim['hasDeveloperEmail'].astype(bool)
    df_trim['hasDeveloperWebsite'].astype(bool)
    
    # -- integer
    df_trim['adjectiveCount'].astype(int)
    df_trim['countCapital'].astype(int)
    df_trim['installs'].astype(int)
    df_trim['revSent'].astype(int)
    df_trim['revLength'].astype(int)
    
    # -- float
    df_trim['avgRating'].astype(float)
    
    return df_trim

Now, I want to explicitly set types to all my columns as a better practice



In [1115]:

    
# Explicitly casting column types in appFeatures dataframe
appFeatures = trimDataFrame(main_appData)


# -- boolean
appFeatures['hasPrivacy'].astype(bool)
appFeatures['hasDeveloperEmail'].astype(bool)
appFeatures['hasDeveloperWebsite'].astype(bool)

# -- integer
appFeatures['adjectiveCount'].astype(int)
appFeatures['countCapital'].astype(int)
appFeatures['installs'].astype(int)
appFeatures['revSent'].astype(int)
appFeatures['revLength'].astype(int)

# -- float
appFeatures['avgRating'].astype(float)


appFeatures









    Out[1115]:






  
    
      
      adjectiveCount
      hasPrivacy
      revLength
      countCapital
      hasDeveloperWebsite
      installs
      hasDeveloperEmail
      avgRating
      revSent
    
  
  
    
      0 
        4
        True
        601
        1
        True
       30000000
        True
       4.051
       -3
    
    
      1 
       13
        True
       1139
       11
        True
       30000000
        True
       4.351
        2
    
    
      2 
       23
        True
       2223
       20
        True
        3000000
       False
       4.555
       -4
    
    
      3 
       10
       False
        804
        5
        True
       30000000
        True
       4.623
        8
    
    
      4 
       22
        True
       1867
       16
        True
        7500000
       False
       4.046
      -11
    
    
      5 
       18
       False
       1162
        6
        True
       30000000
        True
       4.595
        1
    
    
      6 
       18
        True
       1522
       60
        True
       30000000
        True
       4.526
       -4
    
    
      7 
       13
       False
       1895
       19
        True
       30000000
       False
       4.039
       -5
    
    
      8 
       11
        True
       1195
       10
        True
        3000000
        True
       4.400
       -2
    
    
      9 
       19
        True
       1488
       11
        True
         300000
        True
       3.935
       -4
    
    
      10
       18
       False
       1864
       35
        True
        3000000
        True
       4.075
       -5
    
    
      11
       19
       False
       2049
       14
        True
         750000
       False
       3.983
       -2
    
    
      12
        8
       False
        417
        2
        True
       30000000
        True
       4.238
        1
    
    
      13
       16
       False
       1276
       11
        True
        3000000
        True
       3.915
       -3
    
    
      14
       13
       False
       1210
       12
        True
         750000
        True
       4.050
       -3
    
    
      15
       20
        True
       2038
       24
        True
         750000
        True
       3.795
       -7
    
    
      16
       12
       False
       1044
       10
        True
        7500000
        True
       3.997
        1
    
    
      17
       15
        True
       1245
       13
        True
        3000000
        True
       3.212
       -5
    
    
      18
        2
       False
        225
        2
        True
        3000000
        True
       2.611
       -1
    
    
      19
       15
       False
       1120
       10
        True
       30000000
        True
       4.547
       -3
    
    
      20
       22
        True
       1406
       11
        True
        3000000
        True
       2.671
        4
    
    
      21
       13
        True
       1063
       10
        True
         750000
       False
       4.045
       -3
    
    
      22
        7
       False
        855
       10
        True
         750000
        True
       3.555
       -9
    
    
      23
       17
        True
       2147
       38
        True
        3000000
        True
       4.590
       -5
    
    
      24
       12
        True
       1189
       13
        True
        7500000
        True
       4.258
       -9
    
    
      25
       19
       False
       1804
        8
        True
         750000
        True
       4.428
      -10
    
    
      26
       26
       False
       1514
        5
        True
        3000000
        True
       4.401
        5
    
    
      27
       12
        True
       1272
       12
        True
        3000000
        True
       4.275
       -6
    
    
      28
       17
        True
       1413
       14
        True
         750000
        True
       4.149
       -8
    
    
      29
        7
        True
        610
        2
        True
         300000
        True
       4.396
       -3
    
    
      30
       14
        True
       1145
        5
        True
        3000000
        True
       4.113
        2
    
    
      31
       14
       False
       1413
       14
        True
          30000
        True
       4.240
        2
    
    
      32
        2
        True
        573
       14
        True
         300000
        True
       4.241
       -4
    
    
      33
       12
        True
       1387
       16
        True
        3000000
        True
       3.989
       -6
    
    
      34
       19
        True
       1336
        7
        True
          75000
        True
       4.310
       -7
    
    
      35
        7
        True
        817
        5
        True
        3000000
        True
       4.451
        5
    
    
      36
       29
        True
       2205
       16
        True
         300000
        True
       3.916
       -9
    
    
      37
        7
       False
        541
        3
        True
         300000
        True
       4.761
        1
    
    
      38
        6
       False
        310
        1
        True
         300000
        True
       4.158
        0
    
    
      39
        6
       False
        880
        4
        True
         300000
        True
       2.972
       -4
    
    
      40
        5
       False
        583
        4
        True
         300000
        True
       3.903
       -6
    
    
      41
       19
       False
       1888
       20
        True
        3000000
        True
       3.433
       -9
    
    
      42
       13
       False
       1122
        4
        True
        3000000
        True
       4.412
       -3
    
    
      43
       15
        True
       1613
       21
        True
        3000000
       False
       4.461
       15
    
    
      44
        5
       False
       1014
       11
       False
         300000
        True
       3.564
       -2
    
    
      45
        8
       False
       1115
        6
       False
        3000000
        True
       4.131
        5
    
    
      46
        6
        True
        437
        8
        True
          30000
        True
       3.550
       -1
    
    
      47
        2
        True
        307
        0
        True
        3000000
        True
       4.435
        1
    
    
      48
       13
        True
        991
        4
        True
         300000
        True
       4.233
       -3
    
    
      49
        7
       False
        578
        6
        True
        3000000
        True
       3.975
        1
    
    
      50
       10
        True
        962
        7
        True
         300000
        True
       3.926
        4
    
    
      51
       13
       False
       1267
        6
        True
        3000000
        True
       4.590
        2
    
    
      52
        9
       False
       1300
        8
        True
         300000
        True
       3.601
       -9
    
    
      53
       16
       False
       1051
       12
        True
         300000
        True
       3.701
        3
    
    
      54
       16
        True
       1822
       10
        True
         300000
        True
       2.931
      -13
    
    
      55
        7
       False
        535
        0
        True
        3000000
        True
       4.564
        0
    
    
      56
       11
       False
       1075
        8
        True
        7500000
        True
       4.179
       -7
    
    
      57
        6
        True
        691
        4
        True
        3000000
        True
       4.466
        7
    
    
      58
        7
        True
        991
        5
        True
         750000
       False
       4.340
       -3
    
    
      59
        7
       False
        805
       14
        True
         300000
        True
       4.539
        3
    
    
      
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
  

323 rows × 9 columns

Controls



In [1115]:

Unsupervised Learning

Scaling the feature vector



In [1116]:

    
# get all the rows for unsupervised learning
appData_all = trimDataFrame(main_appData)

appData_all.head()









    Out[1116]:






  
    
      
      adjectiveCount
      hasPrivacy
      revLength
      countCapital
      hasDeveloperWebsite
      installs
      hasDeveloperEmail
      avgRating
      revSent
    
  
  
    
      0
        4
        True
        601
        1
       True
       30000000
        True
       4.051
       -3
    
    
      1
       13
        True
       1139
       11
       True
       30000000
        True
       4.351
        2
    
    
      2
       23
        True
       2223
       20
       True
        3000000
       False
       4.555
       -4
    
    
      3
       10
       False
        804
        5
       True
       30000000
        True
       4.623
        8
    
    
      4
       22
        True
       1867
       16
       True
        7500000
       False
       4.046
      -11
    
  

5 rows × 9 columns



In [1117]:

    
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()

print(appData_all.columns)
#print appData_all

# scale the dataframe
X_scaled = min_max_scaler.fit_transform(appData_all)









    



Index([u'adjectiveCount', u'hasPrivacy', u'revLength', u'countCapital', u'hasDeveloperWebsite', u'installs', u'hasDeveloperEmail', u'avgRating', u'revSent'], dtype='object')



In [1118]:

    
from time import time
import pylab as pl
from matplotlib import offsetbox
from sklearn import (manifold, datasets, decomposition, ensemble, lda,
                     random_projection)
from mpld3 import enable_notebook
from mpld3 import plugins
enable_notebook()



In [1119]:

    
# Plot D3 scatter
def plot_d3(X, title=None):
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)
    
    fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'))
    fig.set_figwidth(12)
    fig.set_figheight(8)

    scatter = ax.scatter(X[:,0],
                         X[:,1],
                         c=['red' if main_appData.iloc[i]['appLabel'] == 'unfair' else 'green' for i in range(X.shape[0])],
                         s=75,
                         alpha=0.2,
                         cmap=plt.cm.jet)
    ax.grid(color='white', linestyle='solid')

    ax.set_title("Scatter Plot of Unfair/Fair Apps", size=24)

    labels = [main_appData.iloc[i]['appName'].decode('ascii', 'replace') for i in range(X.shape[0])]
    tooltip = plugins.PointLabelTooltip(scatter, labels=labels)
    plugins.connect(fig, tooltip)



In [1120]:

    
print("Computing MDS embedding and plotting with D3")
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100, verbose=1)
t0 = time()

X_mds = clf.fit_transform(X_scaled)
print("Done. Stress: %f" % clf.stress_)
plot_d3(X_mds,
               "MDS embedding of the digits (time %.2fs)" %
               (time() - t0))









    



Computing MDS embedding and plotting with D3
Done. Stress: 546.646566



In [1121]:

    
# Change plot string for fair and unfair apps using the main_appdata dataframe set in a previous cell
def set_plot_symbol(idx, fair_char = '.', unfair_char = '+'):
    symbol = ''
    if main_appData.iloc[idx]['appLabel'] == 'unfair':
        symbol = str(idx)
    elif main_appData.iloc[idx]['appLabel'] == 'fair':
        symbol = fair_char
    
    return symbol



In [1122]:

    
# Code reference: http://scikit-learn.org/stable/auto_examples/manifold/plot_lle_digits.html#example-manifold-plot-lle-digits-py
#====================
#----------------------------------------------------------------------
# Scale and visualize the embedding vectors
def plot_embedding(X, title=None):
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)
    
    #pl.figure(figsize=(8, 6))
    pl.figure()
    ax = pl.subplot(1, 1, 1)
    for i in range(X.shape[0]):
        pl.text(X[i, 0], X[i, 1], str(set_plot_symbol(i)),
                #color=pl.cm.Set1(y[i] / 2.),
                fontdict={'weight': 'bold', 'size': 12})


    pl.xticks([]), pl.yticks([])
    if title is not None:
        pl.title(title)



In [1123]:

    
print("Computing MDS embedding")
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100, verbose=1)
t0 = time()

X_mds = clf.fit_transform(X_scaled)
print("Done. Stress: %f" % clf.stress_)
#plot_embedding(X_mds,
#               "MDS embedding of the digits (time %.2fs)" %
#               (time() - t0))

#print clf.get_params(True)









    



Computing MDS embedding
breaking at iteration 83 with stress 784.894512328
Done. Stress: 784.894512



In [1124]:

    
#----------------------------------------------------------------------
# MDS  embedding of the digits dataset
print("Computing MDS embedding")
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
t0 = time()
X_mds = clf.fit_transform(X_scaled.astype('float64'))
print("Done. Stress: %f" % clf.stress_)
#plot_embedding(X_mds,
#               "MDS embedding of the digits (time %.2fs)" %
#               (time() - t0))









    



Computing MDS embedding
Done. Stress: 814.870262

	adjectiveCount	hasPrivacy	revLength	countCapital	hasDeveloperWebsite	installs	hasDeveloperEmail	avgRating	revSent
0	4	True	601	1	True	30000000	True	4.051	-3
1	13	True	1139	11	True	30000000	True	4.351	2
2	23	True	2223	20	True	3000000	False	4.555	-4
3	10	False	804	5	True	30000000	True	4.623	8
4	22	True	1867	16	True	7500000	False	4.046	-11
5	18	False	1162	6	True	30000000	True	4.595	1
6	18	True	1522	60	True	30000000	True	4.526	-4
7	13	False	1895	19	True	30000000	False	4.039	-5
8	11	True	1195	10	True	3000000	True	4.400	-2
9	19	True	1488	11	True	300000	True	3.935	-4
10	18	False	1864	35	True	3000000	True	4.075	-5
11	19	False	2049	14	True	750000	False	3.983	-2
12	8	False	417	2	True	30000000	True	4.238	1
13	16	False	1276	11	True	3000000	True	3.915	-3
14	13	False	1210	12	True	750000	True	4.050	-3
15	20	True	2038	24	True	750000	True	3.795	-7
16	12	False	1044	10	True	7500000	True	3.997	1
17	15	True	1245	13	True	3000000	True	3.212	-5
18	2	False	225	2	True	3000000	True	2.611	-1
19	15	False	1120	10	True	30000000	True	4.547	-3
20	22	True	1406	11	True	3000000	True	2.671	4
21	13	True	1063	10	True	750000	False	4.045	-3
22	7	False	855	10	True	750000	True	3.555	-9
23	17	True	2147	38	True	3000000	True	4.590	-5
24	12	True	1189	13	True	7500000	True	4.258	-9
25	19	False	1804	8	True	750000	True	4.428	-10
26	26	False	1514	5	True	3000000	True	4.401	5
27	12	True	1272	12	True	3000000	True	4.275	-6
28	17	True	1413	14	True	750000	True	4.149	-8
29	7	True	610	2	True	300000	True	4.396	-3
30	14	True	1145	5	True	3000000	True	4.113	2
31	14	False	1413	14	True	30000	True	4.240	2
32	2	True	573	14	True	300000	True	4.241	-4
33	12	True	1387	16	True	3000000	True	3.989	-6
34	19	True	1336	7	True	75000	True	4.310	-7
35	7	True	817	5	True	3000000	True	4.451	5
36	29	True	2205	16	True	300000	True	3.916	-9
37	7	False	541	3	True	300000	True	4.761	1
38	6	False	310	1	True	300000	True	4.158	0
39	6	False	880	4	True	300000	True	2.972	-4
40	5	False	583	4	True	300000	True	3.903	-6
41	19	False	1888	20	True	3000000	True	3.433	-9
42	13	False	1122	4	True	3000000	True	4.412	-3
43	15	True	1613	21	True	3000000	False	4.461	15
44	5	False	1014	11	False	300000	True	3.564	-2
45	8	False	1115	6	False	3000000	True	4.131	5
46	6	True	437	8	True	30000	True	3.550	-1
47	2	True	307	0	True	3000000	True	4.435	1
48	13	True	991	4	True	300000	True	4.233	-3
49	7	False	578	6	True	3000000	True	3.975	1
50	10	True	962	7	True	300000	True	3.926	4
51	13	False	1267	6	True	3000000	True	4.590	2
52	9	False	1300	8	True	300000	True	3.601	-9
53	16	False	1051	12	True	300000	True	3.701	3
54	16	True	1822	10	True	300000	True	2.931	-13
55	7	False	535	0	True	3000000	True	4.564	0
56	11	False	1075	8	True	7500000	True	4.179	-7
57	6	True	691	4	True	3000000	True	4.466	7
58	7	True	991	5	True	750000	False	4.340	-3
59	7	False	805	14	True	300000	True	4.539	3
	...	...	...	...	...	...	...	...	...