I will be using the previously exported exports/appFeatures.csv data for this notebook.
In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
## Load the appFeatures file
main_appData = pd.read_csv('exports/appFeatures.csv')
In [2]:
# This cell creates a dataframe with the same number of unfair and fair apps,
# where the fair apps are randomly selected
# TODO - Create a method that implements k-fold validation for app selection
import random
## create the unfair apps dataframe and count its rows
df_unfair = main_appData[main_appData.appLabel == 'unfair']
unfair_count = df_unfair.appLabel.count()
## sample the same number of fair apps at random
df_fair = main_appData[main_appData.appLabel == 'fair']
df_randomly_fair = df_fair.ix[random.sample(df_fair.index, unfair_count)]
#print df_randomly_fair.appLabel.count()
# append the unfair apps to the sampled fair apps (2 * unfair_count rows in total)
appData = df_randomly_fair.append(df_unfair)
# shuffle the combined dataframe
appData = appData.ix[np.random.permutation(appData.index)]
appData.columns
Out[2]:
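The TODO above asks for a k-fold selection method instead of a single random undersample. One option, sketched below using StratifiedKFold from the same old sklearn.cross_validation API used later in this notebook, keeps the fair/unfair ratio in every fold; the fold count and variable names are illustrative assumptions, not the notebook's actual method.
# illustrative sketch: stratified folds over the full dataset preserve the
# fair/unfair ratio in each fold instead of discarding most fair apps
from sklearn.cross_validation import StratifiedKFold
labels = main_appData['appLabel'].values
skf = StratifiedKFold(labels, n_folds=5)
for train_idx, test_idx in skf:
    train_apps = main_appData.iloc[train_idx]
    test_apps = main_appData.iloc[test_idx]
    # ... fit and evaluate on each fold ...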
In [3]:
def trimDataFrame(df):
    """
    Create a new dataframe containing only the app feature columns,
    with each column explicitly cast to its expected type.
    """
    ## select the feature columns
    appCols = set(df.columns)
    appCols.remove('appName')           # remove the app names column
    appCols.remove('Unnamed: 7')        # remove a stray unnamed column
    appCols.remove('appLabel')          # remove the label column
    appCols.remove('price')             # remove price, since most of the apps are free
    appCols.remove('exclamationCount')  # remove exclamation count, as all of its values are 0
    df_trim = df[list(appCols)]
    # -- boolean (astype returns a new Series, so assign the result back)
    df_trim['hasPrivacy'] = df_trim['hasPrivacy'].astype(bool)
    df_trim['hasDeveloperEmail'] = df_trim['hasDeveloperEmail'].astype(bool)
    df_trim['hasDeveloperWebsite'] = df_trim['hasDeveloperWebsite'].astype(bool)
    # -- integer
    df_trim['adjectiveCount'] = df_trim['adjectiveCount'].astype(int)
    df_trim['countCapital'] = df_trim['countCapital'].astype(int)
    df_trim['installs'] = df_trim['installs'].astype(int)
    df_trim['revSent'] = df_trim['revSent'].astype(int)
    df_trim['revLength'] = df_trim['revLength'].astype(int)
    # -- float
    df_trim['avgRating'] = df_trim['avgRating'].astype(float)
    return df_trim
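A quick, illustrative check that the casts inside trimDataFrame actually took effect (astype returns a new object, which is why the assignments above matter):
# illustrative: confirm the column dtypes after trimming
print(trimDataFrame(main_appData).dtypes)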
Now I want to explicitly set the type of each column, as a better practice.
In [4]:
# Explicitly casting column types in the appFeatures dataframe
appFeatures = trimDataFrame(main_appData)
# -- boolean (astype returns a new Series, so assign the result back)
appFeatures['hasPrivacy'] = appFeatures['hasPrivacy'].astype(bool)
appFeatures['hasDeveloperEmail'] = appFeatures['hasDeveloperEmail'].astype(bool)
appFeatures['hasDeveloperWebsite'] = appFeatures['hasDeveloperWebsite'].astype(bool)
# -- integer
appFeatures['adjectiveCount'] = appFeatures['adjectiveCount'].astype(int)
appFeatures['countCapital'] = appFeatures['countCapital'].astype(int)
appFeatures['installs'] = appFeatures['installs'].astype(int)
appFeatures['revSent'] = appFeatures['revSent'].astype(int)
appFeatures['revLength'] = appFeatures['revLength'].astype(int)
# -- float
appFeatures['avgRating'] = appFeatures['avgRating'].astype(float)
appFeatures
Out[4]:
In [5]:
appFeatures['adjectiveCount'].hist(color=['#af9ecb'])
Out[5]:
In [6]:
appFeatures['avgRating'].hist(color=['#af9ecb'])
Out[6]:
In [7]:
appFeatures['countCapital'].hist(color=['#af9ecb'])
Out[7]:
In [8]:
appFeatures['hasDeveloperEmail'].hist(color=['#af9ecb'])
Out[8]:
In [9]:
appFeatures['hasDeveloperWebsite'].hist(color=['#af9ecb'])
Out[9]:
In [10]:
appFeatures['hasPrivacy'].hist(color=['#af9ecb'])
Out[10]:
In [11]:
appFeatures['installs'].hist(color=['#af9ecb'])
Out[11]:
In [12]:
appFeatures['revSent'].hist(color=['#af9ecb'])
Out[12]:
In [13]:
appFeatures['revLength'].hist(color=['#af9ecb'])
Out[13]:
In [14]:
appFeatures.describe()
Out[14]:
Plots inspired by the Graphical Representations of Linear Models notebook.
In [15]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
sns.set(palette="Purples_r")
np.random.seed(9221999)
mpl.rc("figure", figsize=(10, 10))
In [16]:
x = appFeatures['revSent'].astype(float)
y = appFeatures['avgRating'].astype(float)
sns.regplot(x, y)
In [17]:
x = appFeatures['revSent'].astype(float)
y = appFeatures['revLength'].astype(float)
sns.regplot(x, y)
In [18]:
x = appFeatures['revLength'].astype(float)
y = appFeatures['revSent'].astype(float)
sns.regplot(x, y)
In [19]:
f, ax = plt.subplots(1, 1, figsize=(10, 10))
cmap = sns.blend_palette(["#00008B", "#6A5ACD", "#F0F8FF",
"#FFE6F8", "#C71585", "#8B0000"], as_cmap=True)
sns.corrplot(appFeatures, annot=False, diag_names=False, cmap=cmap, ax=ax);
In [20]:
sns.corrplot(appFeatures)  # with the correlation values annotated
Out[20]:
In [21]:
%doctest_mode
In [22]:
# get all the rows for unsupervised learning
appData_all = trimDataFrame(main_appData)
appData_all.head()
Out[22]:
In [23]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
print(appData_all.columns)
# scale the dataframe
X_scaled = min_max_scaler.fit_transform(appData_all)
print(X_scaled)
print(X_scaled.astype('float64'))
Y = main_appData['appLabel']
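For reference, MinMaxScaler rescales each column independently to [0, 1] via (x - min) / (max - min). A quick sanity check of that formula, where column 0 is just an illustrative choice:
# illustrative: the first scaled column should match the manual formula
col = appData_all.values[:, 0].astype('float64')
print(np.allclose((col - col.min()) / (col.max() - col.min()), X_scaled[:, 0]))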
In [24]:
# Choose a plot symbol for fair and unfair apps
def set_plot_symbol(fairness, fair_char='.', unfair_char='+'):
    symbol = ''
    if fairness == 'unfair':
        symbol = unfair_char
    elif fairness == 'fair':
        symbol = fair_char
    return symbol
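A quick usage check (illustrative only):
print(set_plot_symbol('unfair'))        # '+'
print(set_plot_symbol('fair'))          # '.'
print(repr(set_plot_symbol('other')))   # '' for any unrecognized label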
In [41]:
# Code reference: http://scikit-learn.org/stable/auto_examples/manifold/plot_lle_digits.html#example-manifold-plot-lle-digits-py
#====================
#----------------------------------------------------------------------
# Scale and visualize the embedding vectors
def plot_embedding(X, title=None):
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)
    pl.figure()
    ax = pl.subplot(111)
    for i in range(X.shape[0]):
        pl.text(X[i, 0], X[i, 1], str(set_plot_symbol(Y[i])),
                # color=pl.cm.Set1(y[i] / 2.),
                fontdict={'weight': 'bold', 'size': 9})
    pl.xticks([]), pl.yticks([])
    if title is not None:
        pl.title(title)
In [42]:
from time import time
import pylab as pl
from matplotlib import offsetbox
from sklearn import (manifold, datasets, decomposition, ensemble, lda,
random_projection)
In [43]:
print("Computing MDS embedding")
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
t0 = time()
X_mds = clf.fit_transform(X_scaled)
print("Done. Stress: %f" % clf.stress_)
plot_embedding(X_mds,
               "MDS embedding of the apps (time %.2fs)" %
               (time() - t0))
In [44]:
#----------------------------------------------------------------------
# Projection onto the first two principal components
print("Computing PCA projection")
t0 = time()
X_pca = decomposition.TruncatedSVD(n_components=2).fit_transform(X_scaled)
plot_embedding(X_pca,
               "Principal Components projection of the apps (time %.2fs)" %
               (time() - t0))
In [29]:
#----------------------------------------------------------------------
# Projection onto the first two linear discriminant components
print("Computing LDA projection")
X_converted = X_scaled.astype('float64')
X2 = X_converted.copy()
X2.flat[::X_converted.shape[1] + 1] += 0.01  # make X invertible
t0 = time()
X_lda = lda.LDA(n_components=2).fit_transform(X2, Y)  # Y holds the app labels defined above
plot_embedding(X_lda,
               "Linear Discriminant projection of the apps (time %.2fs)" %
               (time() - t0))
In [30]:
#----------------------------------------------------------------------
# Isomap projection of the app dataset
n_neighbors = 10
print("Computing Isomap embedding")
t0 = time()
X_iso = manifold.Isomap(n_neighbors, n_components=2).fit_transform(X_scaled)
print("Done.")
plot_embedding(X_iso,
               "Isomap projection of the apps (time %.2fs)" %
               (time() - t0))
In [31]:
#----------------------------------------------------------------------
# Locally linear embedding of the app dataset
print("Computing LLE embedding")
clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                      method='standard')
t0 = time()
X_lle = clf.fit_transform(X_scaled.astype('float64'))
print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
plot_embedding(X_lle,
               "Locally Linear Embedding of the apps (time %.2fs)" %
               (time() - t0))
In [32]:
#----------------------------------------------------------------------
# Modified locally linear embedding of the app dataset
print("Computing modified LLE embedding")
clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                      method='modified')
t0 = time()
X_mlle = clf.fit_transform(X_scaled.astype('float64'))
print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
plot_embedding(X_mlle,
               "Modified Locally Linear Embedding of the apps (time %.2fs)" %
               (time() - t0))
In [33]:
#----------------------------------------------------------------------
# Hessian LLE embedding of the app dataset
print("Computing Hessian LLE embedding")
clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,
                                      method='hessian')
t0 = time()
X_hlle = clf.fit_transform(X_scaled.astype('float64'))
print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
plot_embedding(X_hlle,
               "Hessian Locally Linear Embedding of the apps (time %.2fs)" %
               (time() - t0))
In [34]:
#----------------------------------------------------------------------
# MDS embedding of the app dataset
print("Computing MDS embedding")
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
t0 = time()
X_mds = clf.fit_transform(X_scaled.astype('float64'))
print("Done. Stress: %f" % clf.stress_)
plot_embedding(X_mds,
               "MDS embedding of the apps (time %.2fs)" %
               (time() - t0))
In [35]:
#----------------------------------------------------------------------
# Random trees embedding of the app dataset
print("Computing Totally Random Trees embedding")
hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0,
                                       max_depth=5)
t0 = time()
X_transformed = hasher.fit_transform(X_scaled.astype('float64'))
pca = decomposition.TruncatedSVD(n_components=2)
X_reduced = pca.fit_transform(X_transformed)
plot_embedding(X_reduced,
               "Random forest embedding of the apps (time %.2fs)" %
               (time() - t0))
In [36]:
#----------------------------------------------------------------------
# Spectral embedding of the app dataset
print("Computing Spectral embedding")
embedder = manifold.SpectralEmbedding(n_components=2, random_state=0,
                                      eigen_solver="arpack")
t0 = time()
X_se = embedder.fit_transform(X_scaled.astype('float64'))
plot_embedding(X_se,
               "Spectral embedding of the apps (time %.2fs)" %
               (time() - t0))
pl.show()
In [37]:
#
# ---------------K-means Clustering
#
from sklearn.cluster import KMeans
print("Computing K-means clustering")
kmeans = KMeans(init='k-means++', n_clusters=7, n_init=10)
t0 = time()
X_kmeans = kmeans.fit_transform(X_scaled)
# print("Done. Stress: %f" % clf.stress_)
plot_embedding(X_kmeans,
               "K-means cluster-distance space (time %.2fs)" %
               (time() - t0))
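One way to see how these clusters line up with the fairness labels, sketched here as an aside (kmeans.labels_ holds the cluster index for each row; this comparison was not part of the original analysis):
# illustrative: cross-tabulate cluster assignments against the labels
print(pd.crosstab(Y.values, kmeans.labels_,
                  rownames=['appLabel'], colnames=['cluster']))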
In [38]:
"""
Code reference: http://stackoverflow.com/questions/2982929/plotting-results-of-hierarchical-clustering-ontop-of-a-matrix-of-data-in-python
"""
from __future__ import division
# dendrogram plotting from scipy book
import numpy as np
import matplotlib.pyplot as plt  # aliased as plt so it does not shadow the matplotlib alias above
from mpl_toolkits.mplot3d import Axes3D
import scipy.cluster.hierarchy as sch
from sklearn.metrics.pairwise import pairwise_distances as pwd
import pylab
# compute the distance matrix
D = pwd(X_converted, metric='euclidean')
# compute and plot the first dendrogram
fig = pylab.figure(figsize=(8,8))
ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
Y_sch = sch.linkage(D, method='centroid')
Z1 = sch.dendrogram(Y_sch, orientation='right')
ax1.set_xticks([])
ax1.set_yticks([])
# compute the second cluster
ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
Y_sch = sch.linkage(D, method='single')
Z2 = sch.dendrogram(Y_sch)
ax2.set_xticks([])
ax2.set_yticks([])
# plot the distance matrix
axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
idx1 = Z1['leaves']
idx2 = Z2['leaves']
D = D[idx1, :]
D = D[:, idx2]
im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
axmatrix.set_xticks([])
axmatrix.set_yticks([])
#plot colorbar
axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
pylab.colorbar(im, cax=axcolor)
fig.show()
fig.savefig('dendrogram.png')
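As a possible follow-up (a sketch, not part of the original analysis), scipy can cut the last linkage into a fixed number of flat clusters, e.g. two, to compare against the fair/unfair split:
# illustrative: cut the single-linkage tree into 2 flat clusters
flat_clusters = sch.fcluster(Y_sch, t=2, criterion='maxclust')
print(np.bincount(flat_clusters))  # cluster sizes (labels start at 1, so index 0 is unused)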
To assess our classifier better, we have resampled our data to include an equal number of fair and unfair apps. We will shuffle the records and run 10-fold cross-validation on them.
In [39]:
# first get the feature matrix
# from Luis's random sampling of the dataframe
X_resampled = trimDataFrame(appData)
X_for_classifier = min_max_scaler.fit_transform(X_resampled).astype('float64')
# note: this shuffles the feature rows only; any labels must be permuted
# with the same ordering to stay aligned
np.random.shuffle(X_for_classifier)
n_sample, n_features = X_for_classifier.shape
In [40]:
# set up the cross-validation (10-fold, as stated above)
from sklearn import cross_validation
kf = cross_validation.KFold(n_sample, n_folds=10)
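The folds are not consumed yet; a minimal sketch of how they might drive a classifier follows. LogisticRegression is an illustrative stand-in, not the notebook's chosen model, and the features and labels are rebuilt together here because the in-place shuffle above breaks row/label correspondence:
# illustrative sketch: rebuild X and y together so they stay aligned
from sklearn.linear_model import LogisticRegression
X_cls = min_max_scaler.fit_transform(trimDataFrame(appData)).astype('float64')
y_cls = (appData['appLabel'] == 'unfair').values.astype(int)
scores = []
for train_idx, test_idx in kf:
    model = LogisticRegression()
    model.fit(X_cls[train_idx], y_cls[train_idx])
    scores.append(model.score(X_cls[test_idx], y_cls[test_idx]))
print("mean CV accuracy: %.3f" % np.mean(scores))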