This notebook uses the previously exported data file exports/appFeatures.csv
as its input.
In [1110]:
# Set up inline matplotlib figures. `--no-import-all` avoids the star-imports
# into the notebook namespace. NOTE(review): later cells use `plt` without
# importing it, so it is presumably provided by this magic -- confirm.
%pylab --no-import-all inline
In [1111]:
# Toggle IPython's doctest mode (doctest-style prompts and output).
# NOTE(review): this magic is a toggle, so re-running the cell flips it back.
%doctest_mode
In [1112]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
## Read the exported per-app feature table; every later cell starts from this frame
main_appData = pd.read_csv(filepath_or_buffer='exports/appFeatures.csv')
In [1113]:
# Build a class-balanced working set: every unfair app plus an equally sized
# random sample of fair apps, with the rows shuffled.
# TODO - Create a method that implements k-fold validation for app selection
import random

## unfair apps and how many of them there are
df_unfair = main_appData[main_appData.appLabel == 'unfair']
unfair_count = len(df_unfair)

## draw the same number of fair apps at random (never more than actually exist,
## otherwise random.sample would raise ValueError)
df_fair = main_appData[main_appData.appLabel == 'fair']
fair_sample_size = min(unfair_count, len(df_fair))
# .loc selects by index label; the ambiguous .ix indexer no longer exists in pandas
df_randomly_fair = df_fair.loc[random.sample(list(df_fair.index), fair_sample_size)]

# stack the two halves (pd.concat replaces the removed DataFrame.append)
appData = pd.concat([df_randomly_fair, df_unfair])

# shuffle the rows by position, so duplicate index labels cannot multiply rows
appData = appData.iloc[np.random.permutation(len(appData))]

main_appData.columns
Out[1113]:
In [1114]:
def trimDataFrame(df):
    """
    Build the feature frame used for learning from the raw app data.

    Drops the identifier, label and uninformative columns, then casts the
    remaining feature columns to explicit dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw app data. It must contain every feature column that is cast
        below; the columns that get dropped are simply ignored when absent.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the remaining
        columns in their original order, with the casts applied.

    Raises
    ------
    KeyError
        If one of the feature columns cast below is missing from ``df``.
    """
    ## Columns that are not used as app features
    drop_cols = {
        'appName',           # app names are identifiers, not features
        'Unnamed: 7',        # stray unnamed column
        'appLabel',          # the label column
        'price',             # most of the apps are free
        'exclamationCount',  # all values seemed to be 0
    }
    # Keep the source column order: iterating a set yields an arbitrary order,
    # which would make the feature layout differ between runs.
    keep_cols = [col for col in df.columns if col not in drop_cols]
    # Copy so that the casts below never write into the caller's frame.
    df_trim = df[keep_cols].copy()

    # NOTE(review): revSent is cast to int because that is what the original
    # code asked for -- confirm the scores are integral, otherwise this truncates.
    casts = (
        (bool, ('hasPrivacy', 'hasDeveloperEmail', 'hasDeveloperWebsite')),
        (int, ('adjectiveCount', 'countCapital', 'installs', 'revSent', 'revLength')),
        (float, ('avgRating',)),
    )
    for dtype, cols in casts:
        for col in cols:
            # astype returns a new Series; it has to be assigned back to take effect
            df_trim[col] = df_trim[col].astype(dtype)
    return df_trim
Next, I explicitly set the type of every feature column, which is good practice.
In [1115]:
# Explicitly casting column types in appFeatures dataframe
# (.copy() gives an independent frame, so the assignments below are unambiguous)
appFeatures = trimDataFrame(main_appData).copy()
# astype returns a new Series, so each result is assigned back to its column;
# calling astype without assigning leaves the dtype unchanged.
# -- boolean
appFeatures['hasPrivacy'] = appFeatures['hasPrivacy'].astype(bool)
appFeatures['hasDeveloperEmail'] = appFeatures['hasDeveloperEmail'].astype(bool)
appFeatures['hasDeveloperWebsite'] = appFeatures['hasDeveloperWebsite'].astype(bool)
# -- integer
appFeatures['adjectiveCount'] = appFeatures['adjectiveCount'].astype(int)
appFeatures['countCapital'] = appFeatures['countCapital'].astype(int)
appFeatures['installs'] = appFeatures['installs'].astype(int)
# NOTE(review): confirm revSent is integral; an int cast truncates fractions
appFeatures['revSent'] = appFeatures['revSent'].astype(int)
appFeatures['revLength'] = appFeatures['revLength'].astype(int)
# -- float
appFeatures['avgRating'] = appFeatures['avgRating'].astype(float)
appFeatures
Out[1115]:
In [1115]:
In [1116]:
# Feature frame built from every row of main_appData, for the unsupervised steps
appData_all = trimDataFrame(df=main_appData)
appData_all.head(n=5)
Out[1116]:
In [1117]:
from sklearn.preprocessing import MinMaxScaler

# list the feature columns that are about to be scaled
print(appData_all.columns)

# fit the scaler on the full feature frame and rescale it in one step
min_max_scaler = MinMaxScaler()
X_scaled = min_max_scaler.fit_transform(X=appData_all)
In [1118]:
# Dependencies for the embedding and plotting cells that follow.
from time import time
import pylab as pl
from matplotlib import offsetbox
# NOTE(review): of these sklearn submodules only `manifold` is used in the
# cells visible here; confirm whether the others are still needed.
from sklearn import (manifold, datasets, decomposition, ensemble, lda,
random_projection)
from mpld3 import enable_notebook
from mpld3 import plugins
# Turn on mpld3's notebook integration before any figure is drawn.
enable_notebook()
In [1119]:
# Plot D3 scatter
def plot_d3(X, title=None):
    """
    Draw an interactive scatter plot of a 2-D embedding of the apps.

    Points are coloured red for 'unfair' apps and green otherwise, and each
    point gets a tooltip showing the app name. Labels and names are read from
    the notebook-level ``main_appData`` frame by row position, so row ``i`` of
    ``X`` must describe row ``i`` of ``main_appData``. Also relies on the
    notebook-level names ``np``, ``plt`` and ``plugins``.

    Parameters
    ----------
    X : array-like, shape (n_points, >= 2)
        Embedding coordinates; only the first two columns are plotted.
    title : str, optional
        Figure title. Defaults to "Scatter Plot of Unfair/Fair Apps".

    Raises
    ------
    IndexError
        If ``X`` has more rows than ``main_appData``.
    """
    X = np.asarray(X, dtype=float)
    n_points = X.shape[0]
    if n_points > len(main_appData):
        raise IndexError("X has %d rows but main_appData only has %d"
                         % (n_points, len(main_appData)))

    # Rescale each coordinate to [0, 1]; a constant column has zero span and
    # would otherwise divide by zero, so its span is treated as 1.
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    span = x_max - x_min
    span = np.where(span == 0, 1, span)
    X = (X - x_min) / span

    fig, ax = plt.subplots()
    # The `axisbg` keyword was renamed to `facecolor` in matplotlib 2.0 and
    # later removed; use whichever setter this matplotlib version offers.
    if hasattr(ax, 'set_facecolor'):
        ax.set_facecolor('#EEEEEE')
    else:
        ax.set_axis_bgcolor('#EEEEEE')
    fig.set_figwidth(12)
    fig.set_figheight(8)

    # Pull both columns once instead of building one row Series per point.
    app_labels = main_appData['appLabel'].values[:n_points]
    app_names = main_appData['appName'].values[:n_points]
    colors = ['red' if label == 'unfair' else 'green' for label in app_labels]

    # No cmap here: explicit colour names are not colour-mapped.
    scatter = ax.scatter(X[:, 0],
                         X[:, 1],
                         c=colors,
                         s=75,
                         alpha=0.2)
    ax.grid(color='white', linestyle='solid')
    if title is None:
        title = "Scatter Plot of Unfair/Fair Apps"
    ax.set_title(title, size=24)

    # Byte strings are decoded leniently as before; text (and any non-string
    # value) is formatted as-is, because only byte strings have .decode.
    labels = [name.decode('ascii', 'replace') if isinstance(name, bytes)
              else '%s' % (name,)
              for name in app_names]
    tooltip = plugins.PointLabelTooltip(scatter, labels=labels)
    plugins.connect(fig, tooltip)
In [1120]:
# Embed the scaled app features in 2-D with MDS and show them in the D3 scatter.
# No random_state is passed to MDS here. NOTE(review): confirm whether the
# layout should be made reproducible with a fixed seed.
print("Computing MDS embedding and plotting with D3")
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100, verbose=1)
t0 = time()
X_mds = clf.fit_transform(X_scaled)
# stop the clock right after the fit so printing is not included in the timing
mds_seconds = time() - t0
print("Done. Stress: %f" % clf.stress_)
# the data are the app features, not the digits from the scikit-learn example
plot_d3(X_mds,
        "MDS embedding of the apps (time %.2fs)" % mds_seconds)
In [1121]:
# Change plot string for fair and unfair apps using the main_appdata dataframe set in a previous cell
def set_plot_symbol(idx, fair_char = '.', unfair_char = '+'):
    """
    Pick the text marker for the app at row position ``idx``.

    The label is read from the notebook-level ``main_appData`` frame.
    Unfair apps are marked with their row position as a string, fair apps
    with ``fair_char``, and any other label with an empty string.
    ``unfair_char`` is accepted but not used.
    """
    label = main_appData.iloc[idx]['appLabel']
    if label == 'unfair':
        return str(idx)
    if label == 'fair':
        return fair_char
    return ''
In [1122]:
# Code reference: http://scikit-learn.org/stable/auto_examples/manifold/plot_lle_digits.html#example-manifold-plot-lle-digits-py
#====================
#----------------------------------------------------------------------
# Scale and visualize the embedding vectors
def plot_embedding(X, title=None):
    """
    Draw a 2-D embedding as text markers, one per app.

    Each coordinate is rescaled to [0, 1] and the marker for row ``i`` comes
    from ``set_plot_symbol(i)``. Relies on the notebook-level names ``np``
    and ``pl``.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_points, >= 2)
        Embedding coordinates; only the first two columns are plotted.
    title : str, optional
        Figure title; no title is set when omitted.
    """
    # A constant column has zero span and would divide by zero, so its span
    # is treated as 1 (the column then maps to 0).
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    span = x_max - x_min
    span = np.where(span == 0, 1, span)
    X = (X - x_min) / span

    pl.figure()
    pl.subplot(1, 1, 1)
    for i in range(X.shape[0]):
        pl.text(X[i, 0], X[i, 1], str(set_plot_symbol(i)),
                fontdict={'weight': 'bold', 'size': 12})
    # hide the tick marks on both axes
    pl.xticks([])
    pl.yticks([])
    if title is not None:
        pl.title(title)
In [1123]:
# Verbose MDS fit of the scaled features into two dimensions.
print("Computing MDS embedding")
clf = manifold.MDS(max_iter=100, n_init=1, n_components=2, verbose=1)
t0 = time()
X_mds = clf.fit_transform(X=X_scaled)
print("Done. Stress: %f" % (clf.stress_,))
# The static plot for this run is currently switched off:
#plot_embedding(X_mds,
#               "MDS embedding of the digits (time %.2fs)" %
#               (time() - t0))
In [1124]:
#----------------------------------------------------------------------
# MDS embedding of the scaled app features, cast to float64 before fitting
print("Computing MDS embedding")
clf = manifold.MDS(max_iter=100, n_init=1, n_components=2)
t0 = time()
X_mds = clf.fit_transform(X=X_scaled.astype('float64'))
print("Done. Stress: %f" % (clf.stress_,))
# The static plot for this run is currently switched off:
#plot_embedding(X_mds,
#               "MDS embedding of the digits (time %.2fs)" %
#               (time() - t0))