In [97]:
# for data manipulation
import numpy as np
import pandas as pd
import re
import string
import random
import datetime
import sys
import matplotlib.pyplot as plt
%matplotlib inline
# for MongoDB connection
import pymongo
# for statistical hypothesis testing
import scipy.stats
# for NLP & machine learning
from textblob import TextBlob
from textblob import Word
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion, 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [2]:
# for interactive plotting
import plotly.plotly as py
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
cf.set_config_file(offline=True)



In [3]:
def read_mongo(collection, query={}, host='localhost', port=27017, username=None, password=None, no_id=True):
    """ Read from Mongo and Store into DataFrame """
    # Connect to MongoDB and Make a query to the specific DB and Collection
    with pymongo.MongoClient(host, port) as client:
        table = client.appstore[collection]
        df = pd.DataFrame(list(table.find(query)))    
    # Delete the _id
    if no_id:
        del df['_id']

    return df

As we know, new versions & updates are very common and important for developers to improve their apps quality as well as collect feedback. We would like to build a model to predict the app rating when new version is released. This could be helpful for developer to have a better sense about their product. What's more, it could be useful for recommendation system for new apps when the ratings are not available yet. First, we need to collect the data with both current rating and overall rating for our model training and testing.


In [4]:
apps_df = read_mongo('appitems', query = {'$and': [{'current_rating':{"$ne":None}}, {'overall_rating': {"$ne": None}}]})

In [5]:
apps_df.shape


Out[5]:
(4450, 26)

Data Cleaning & Feature Extraction

Next, we need to extract some useful features from the dataset. Note that our goal is to predict the new version rating, thus it doesn't make sense to use features that are relavent to the period after the new version release. We can't use the features like user reviews and number of current ratings. We will use the following features:

App characteristics

  • overall rating(category in the numeric form)
  • App category (category)
  • Is_InAppPurcase(category)
  • Is_multilingual(category)
  • Is_multiplatform(category)
  • Number of overall rating(log transform)
  • AppFileSize(log transform)
  • Time range between release date and extraction date(log transform)

Text Features from descriptions:

  • lengthOfText(character level)
  • NumOfwords(word level)
  • polarity
  • subjectivity

In [6]:
# datetime manipulation
apps_df['update_date'] = pd.to_datetime(apps_df['update_date'])
apps_df['publish_date'] = map(lambda x: datetime.datetime.strptime(x.split(' ')[0], '%Y-%m-%d'), apps_df['publish_date'])
apps_df['time_span'] = map(lambda x,y: np.log((x-y).days+1), apps_df['update_date'], apps_df['publish_date'])

In [7]:
# string object manipulation
rating_cleaned = {'1 star':1, "1 and a half stars": 1.5, '2 stars': 2, '2 and a half stars':2.5, "3 stars":3, "3 and a half stars":3.5, "4 stars": 4,
                 '4 and a half stars': 4.5, "5 stars": 5}
apps_df.overall_rating = apps_df.overall_rating.replace(rating_cleaned)
apps_df['is_free'] = apps_df['price'].apply(lambda x: int(x=='Free'))
apps_df['app_size']= apps_df['size'].str.split(' ').apply(lambda x: np.log(float(x[0])))
apps_df['num_overall_rating'] = apps_df['num_overall_rating'].apply(lambda x: np.log(x))

In [8]:
# helper function for NLP
def initial_clean_text(raw):
    # use the decode method to convert to ascii (textblob prefers ascii)
    raw = raw.decode('ascii', errors="replace")
    #remove link
    raw_no_link = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', raw)
    #remove email
    no_link_email = re.sub(r'([\w\.-]+)?@[\w\.-]+([\.][com|org|ch|uk]{2,3})?', "", raw_no_link)
    #remove whitespace & special characters
    tab_text = '\t\n\r\x0b\x0c*-'
    no_link_email_space = "".join([ch for ch in no_link_email if ch not in set(tab_text)])
    row = no_link_email_space.lower() 
    return row

# characters count
def char_num(text, ignore_spaces=True):
    if ignore_spaces:
        text = text.replace(" ", "")
    text = "".join(x for x in text if x not in list(set(string.punctuation)))
    return np.log(len(text)+1)

# words count
def word_num(text):
    text = "".join(x for x in text if x not in list(set(string.punctuation)))
    text = word_tokenize(text)
    return np.log(len(text)+1)

# subjectivity
def get_sentiment(text):
    sent = TextBlob(text)
    polarity = sent.sentiment.polarity
    subjectivity = sent.sentiment.subjectivity
    return polarity, subjectivity

In [9]:
apps_df['description'] = apps_df['description'].apply(initial_clean_text)
apps_df['new_version_desc'] = apps_df['new_version_desc'].apply(initial_clean_text)
apps_df['description_charlen'] = apps_df['description'].apply(char_num)
apps_df['description_wordlen'] = apps_df['description'].apply(word_num)
apps_df['new_version_desc_charlen'] = apps_df['new_version_desc'].apply(char_num)
apps_df['new_version_desc_wordlen'] = apps_df['new_version_desc'].apply(word_num)
apps_df['description_polarity'], apps_df['description_subjectivity']  = zip(*apps_df['description'].map(get_sentiment))
apps_df['new_version_desc_polarity'], apps_df['new_version_desc_subjectivity']  = zip(*apps_df['new_version_desc'].map(get_sentiment))

In [10]:
apps_df.head().T


Out[10]:
0 1 2 3 4
category Finance Finance Finance Finance Finance
current_rating 1.8 4.55556 4.64497 2.16667 4.60773
description the jpay app lets you send money and email to ... splitwise is the best way to share bills and i... access your interactive experian credit report... use the vanguard app to check your accounts an... conveniently manage your credit card account f...
id 584959322 458023433 1087101090 335186209 1128712763
is_InAppPurcased 0 0 1 0 0
is_multilingual 0 0 0 0 0
is_multiplatform 0 0 0 0 0
name JPay Splitwise - Split bills and expenses the easy way Experian - Free Credit Report Vanguard Credit One Bank Mobile
new_version_desc this update has a big new feature for the new ... starting today users outside the us can invit... bug fixes and other minor updates see what's new! have an ira? watch your progr... ability add an additional account if qualifie...
num_current_rating 20 18 169 30 724
num_overall_rating 6.12687 8.52576 7.27448 7.75919 7.06561
overall_rating 2.5 4.5 4.5 2.5 4.5
price Free Free Free Free Free
publish_date 2013-01-12 00:00:00 2011-08-24 00:00:00 2016-05-16 00:00:00 2009-10-24 00:00:00 2016-09-22 00:00:00
review1 And when you were younger and you got like a b... I have used a few apps for splitting balance b... This is a well thought out app that works real... Viewing general account info with Touch ID is ... Great Job! I've been waiting and I'm so glad ...
review1_star 5 5 5 4 5
review2 What is wrong with this freakin App? It keep s... This app has been great for keeping track of e... This Experian app helps me measure my credit a... The app is very easy to use and has tons of he... I am so fed up with this entire company! I'm g...
review2_star 1 5 5 5 2
review3 Emails are not sent and received in a timely m... One of the best App! We have used it for many ... I don't exactly like credit bureaus my experie... My company used to use Fidelity for our retire... I have no complaints, well one but its not a b...
review3_star 1 5 5 1 5
scrape_date 2017-03-13 2017-03-13 2017-03-13 2017-03-13 2017-03-13
seller JPay, Inc. None CONSUMERINFO.COM, INC. The Vanguard Group, Inc. Credit One Bank, National Association
size 13.0 MB 83.0 MB 58.0 MB 49.0 MB 42.8 MB
update_date 2017-01-20 00:00:00 2017-02-27 00:00:00 2017-02-22 00:00:00 2017-01-22 00:00:00 2017-01-31 00:00:00
url https://itunes.apple.com/us/app/jpay/id5849593... https://itunes.apple.com/us/app/splitwise-spli... https://itunes.apple.com/us/app/experian-free-... https://itunes.apple.com/us/app/vanguard/id335... https://itunes.apple.com/us/app/credit-one-ban...
version 4.7 4.4.6 1.6.1 7.1 1.4
time_span 7.29302 7.60837 5.64545 7.88156 4.8828
is_free 1 1 1 1 1
app_size 2.56495 4.41884 4.06044 3.89182 3.75654
description_charlen 6.19032 6.58479 7.88833 7.22475 6.57368
description_wordlen 4.58497 5.10595 6.23245 5.56834 4.95583
new_version_desc_charlen 5.3845 6.11589 3.3673 5.09375 4.26268
new_version_desc_wordlen 4.02535 4.58497 1.94591 3.46574 2.48491
description_polarity 0.271667 0.232057 0.212641 0.0947917 0.232292
description_subjectivity 0.38 0.618115 0.507532 0.515625 0.439583
new_version_desc_polarity 0.151136 0.177604 -0.0875 -0.0147727 0
new_version_desc_subjectivity 0.322348 0.388542 0.2875 0.427273 0

Modeling

Aftet we clean the data and feature extraction, we are ready for building our machine learning model. Since the target variable current_rating is a continuous variable, it's obviously a regression problem. With the fantastic scikit-learn package, we will use the ElasticNet and RandomForestRegressor models since they are useful for controlling overfitting and have fast computation. First, we will experiment and compare these two models with only the numerical features that we extracted from the last step. Furthermore, we could combine the word representation features into the pipeline with FeatureUnion to test whether we could improve our accuracy.


In [23]:
# one-hot encoder for multi-category variable
appcategory_vars = pd.get_dummies(apps_df['category'])
appcategory_columns = appcategory_vars.columns
apps_df = pd.concat([apps_df, appcategory_vars], axis =1)

In [29]:
appcategory_columns


Out[29]:
Index([u'Books', u'Business', u'Catalogs', u'Education', u'Entertainment',
       u'Finance', u'Food & Drink', u'Games', u'Health & Fitness',
       u'Lifestyle', u'Medical', u'Music', u'Navigation', u'News',
       u'Photo & Video', u'Productivity', u'Reference', u'Shopping',
       u'Social Networking', u'Sports', u'Travel', u'Utilities', u'Weather'],
      dtype='object')

In [76]:
model_columns = ['overall_rating', 'num_overall_rating', 'is_InAppPurcased', 'is_multilingual', 'is_multiplatform','is_free',\
                  'app_size', 'time_span', 'description_charlen', 'description_wordlen', 'new_version_desc_charlen', 'new_version_desc_wordlen',\
                 'description_polarity', 'new_version_desc_polarity','description_subjectivity','new_version_desc_subjectivity'] + list(appcategory_columns)
X_train, X_test, y_train, y_test = train_test_split(apps_df[model_columns], apps_df['current_rating'], test_size = 0.33, random_state = 42)

In [30]:
print X_train.shape, X_test.shape


(2981, 39) (1469, 39)

Generalized linear model

We use the build-in cross validation for training ElasticNet.


In [37]:
glm = ElasticNetCV(l1_ratio= np.array([0.5, 0.7, .9, .95, .98, 1]), cv = 5)
glm.fit(X_train, y_train)


Out[37]:
ElasticNetCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
       l1_ratio=array([ 0.5 ,  0.7 ,  0.9 ,  0.95,  0.98,  1.  ]),
       max_iter=1000, n_alphas=100, n_jobs=1, normalize=False,
       positive=False, precompute='auto', random_state=None,
       selection='cyclic', tol=0.0001, verbose=0)

In [38]:
print "best lasso ratio: ", glm.l1_ratio_, "in pentalty terms:", glm.alpha_


best lasso ratio:  0.5 in pentalty terms: 0.00359257381705

In [50]:
def report_metrics(actuals, preds):
    print "mean absolute error: ", metrics.mean_absolute_error(actuals, preds)
    print "mean square error: ", metrics.mean_squared_error(actuals, preds)
    print "median absolute error: ", metrics.median_absolute_error(actuals, preds)
    print "R square: ", metrics.r2_score(actuals, preds)

In [51]:
glm_preds = glm.predict(X_test)
report_metrics(y_test, glm_preds)


mean absolute error:  0.482777336567
mean square error:  0.436456532731
median absolute error:  0.350115645605
R square:  0.541522061615

RandomForestRegressor


In [64]:
rfr = RandomForestRegressor(n_estimators=200, n_jobs=-1, max_depth= 6)
rfr.fit(X_train, y_train)


Out[64]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [65]:
rfr_preds = rfr.predict(X_test)
report_metrics(y_test, rfr_preds)


mean absolute error:  0.479319609645
mean square error:  0.432291963247
median absolute error:  0.347903340425
R square:  0.54589675437

In [68]:
# grid search
pg = {'max_depth': [4, 6, 8, 10], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 4], 'min_samples_leaf': [1, 3, 5 ]}
grid = GridSearchCV(rfr, param_grid=pg, cv=5)
grid.fit(X_train, y_train)


Out[68]:
GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 4], 'max_depth': [4, 6, 8, 10], 'min_samples_leaf': [1, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [69]:
print "best parameters: ", grid.best_params_, ' with score ', grid.best_score_


best parameters:  {'max_features': 'auto', 'min_samples_split': 2, 'max_depth': 4, 'min_samples_leaf': 5}  with score  0.486118126264

In [72]:
grfr_preds = grid.best_estimator_.predict(X_test)
report_metrics(y_test, grfr_preds) # a little gains through grid search optimization


mean absolute error:  0.4784000569
mean square error:  0.431377974995
median absolute error:  0.345737345399
R square:  0.54685685788

In [79]:
#show to plot importances
importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(grid.best_estimator_.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')
importances.plot.bar()


Out[79]:
<matplotlib.axes._subplots.AxesSubplot at 0x171ff7b8>

Combined word representation features

We know the description usually tells the features of the apps. We can incorporate the raw text data and see whether we can improve our model performance, which could indicate the desciption has some predictive power for user experiences and app quality.


In [89]:
# A custom stoplist
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
verb_exp = ['VB', 'VBZ', 'VBP', 'VBD','VBN','VBG']
def clean_tokenize_text(row):
    #remove numbers
    raw = re.sub('[0-9]+?', ' ', row) 
    # remove punctuation
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    raw = regex.sub(' ', raw)
    #clean out the characters left out after the above step, like we’re, I’m, It’s, i.e.
    raw = re.sub('( s )|( re )|( m )|( i e )',' ',raw) 
    # lementize
    row_t = TextBlob(raw)
    tokens = []
    for word, pos in row_t.tags:
        if pos in verb_exp:
            word = Word(word)
            word = word.lemmatize("v")
        else:
            word = Word(word)
            word = word.lemmatize() |
        if word not in STOPLIST:
            tokens.append(word)# remove stop words     
    return(tokens)

class ColumnSelector(TransformerMixin):
    """
    Class for building sklearn Pipeline step. This class should be used to select a column from a pandas data frame.
    """
    def __init__(self, columns):
        self.columns = columns

    def fit(self, x, y=None):
        return self

    def transform(self, data_frame):
        return data_frame[self.columns]

In [112]:
new_columns = model_columns+['description']
X_train, X_test, y_train, y_test = train_test_split(apps_df[new_columns], apps_df['current_rating'], test_size = 0.33, random_state = 42)

In [121]:
random.seed(27)
vectorizer = TfidfVectorizer(tokenizer=clean_tokenize_text, min_df=0.03, max_features = 20000, ngram_range = (1, 2), )#2 grams
select_columns = ColumnSelector(columns = model_columns)
rfr_model =  RandomForestRegressor(n_estimators=300, n_jobs=-1, max_depth= 6, min_samples_leaf= 5, min_samples_split = 2)

In [125]:
get_wordvec = Pipeline([('select_desc',ColumnSelector(['description'])), ('word_vec', vectorizer)])
ml_pipe = Pipeline([
    ('features', FeatureUnion([('wordvec_features', get_wordvec),
                               ('numerical_features', select_columns)                               
                                ])),
    ('rf_regressor', rfr_model)
 ])

In [123]:
get_wordvec.fit(X_train, y_train)


Out[123]:
Pipeline(steps=[('select_desc', <__main__.ColumnSelector object at 0x000000001D60A518>), ('word_vec', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=20000, min_df...nizer=<function clean_tokenize_text at 0x0000000019FDDCF8>,
        use_idf=True, vocabulary=None))])

In [124]:
get_wordvec.transform(X_test)


Out[124]:
<1x1 sparse matrix of type '<type 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>