In [97]:

    
# for data manipulation
import numpy as np
import pandas as pd
import re
import string
import random
import datetime
import sys
import matplotlib.pyplot as plt
%matplotlib inline
# for MongoDB connection
import pymongo
# for statistical hypothesis testing
import scipy.stats
# for NLP & machine learning
from textblob import TextBlob
from textblob import Word
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion, 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import GridSearchCV



In [2]:

    
# for interactive plotting
import plotly.plotly as py
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
cf.set_config_file(offline=True)



In [3]:

    
def read_mongo(collection, query={}, host='localhost', port=27017, username=None, password=None, no_id=True):
    """ Read from Mongo and Store into DataFrame """
    # Connect to MongoDB and Make a query to the specific DB and Collection
    with pymongo.MongoClient(host, port) as client:
        table = client.appstore[collection]
        df = pd.DataFrame(list(table.find(query)))    
    # Delete the _id
    if no_id:
        del df['_id']

    return df

As we know, new versions & updates are very common and important for developers to improve their apps quality as well as collect feedback. We would like to build a model to predict the app rating when new version is released. This could be helpful for developer to have a better sense about their product. What's more, it could be useful for recommendation system for new apps when the ratings are not available yet. First, we need to collect the data with both current rating and overall rating for our model training and testing.



In [4]:

    
apps_df = read_mongo('appitems', query = {'$and': [{'current_rating':{"$ne":None}}, {'overall_rating': {"$ne": None}}]})



In [5]:

    
apps_df.shape









    Out[5]:





(4450, 26)

Data Cleaning & Feature Extraction

Next, we need to extract some useful features from the dataset. Note that our goal is to predict the new version rating, thus it doesn't make sense to use features that are relavent to the period after the new version release. We can't use the features like user reviews and number of current ratings. We will use the following features:

App characteristics

overall rating(category in the numeric form)
App category (category)
Is_InAppPurcase(category)
Is_multilingual(category)
Is_multiplatform(category)
Number of overall rating(log transform)
AppFileSize(log transform)
Time range between release date and extraction date(log transform)

Text Features from descriptions:

lengthOfText(character level)
NumOfwords(word level)
polarity
subjectivity



In [6]:

    
# datetime manipulation
apps_df['update_date'] = pd.to_datetime(apps_df['update_date'])
apps_df['publish_date'] = map(lambda x: datetime.datetime.strptime(x.split(' ')[0], '%Y-%m-%d'), apps_df['publish_date'])
apps_df['time_span'] = map(lambda x,y: np.log((x-y).days+1), apps_df['update_date'], apps_df['publish_date'])



In [7]:

    
# string object manipulation
rating_cleaned = {'1 star':1, "1 and a half stars": 1.5, '2 stars': 2, '2 and a half stars':2.5, "3 stars":3, "3 and a half stars":3.5, "4 stars": 4,
                 '4 and a half stars': 4.5, "5 stars": 5}
apps_df.overall_rating = apps_df.overall_rating.replace(rating_cleaned)
apps_df['is_free'] = apps_df['price'].apply(lambda x: int(x=='Free'))
apps_df['app_size']= apps_df['size'].str.split(' ').apply(lambda x: np.log(float(x[0])))
apps_df['num_overall_rating'] = apps_df['num_overall_rating'].apply(lambda x: np.log(x))



In [8]:

    
# helper function for NLP
def initial_clean_text(raw):
    # use the decode method to convert to ascii (textblob prefers ascii)
    raw = raw.decode('ascii', errors="replace")
    #remove link
    raw_no_link = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', raw)
    #remove email
    no_link_email = re.sub(r'([\w\.-]+)?@[\w\.-]+([\.][com|org|ch|uk]{2,3})?', "", raw_no_link)
    #remove whitespace & special characters
    tab_text = '\t\n\r\x0b\x0c*-'
    no_link_email_space = "".join([ch for ch in no_link_email if ch not in set(tab_text)])
    row = no_link_email_space.lower() 
    return row

# characters count
def char_num(text, ignore_spaces=True):
    if ignore_spaces:
        text = text.replace(" ", "")
    text = "".join(x for x in text if x not in list(set(string.punctuation)))
    return np.log(len(text)+1)

# words count
def word_num(text):
    text = "".join(x for x in text if x not in list(set(string.punctuation)))
    text = word_tokenize(text)
    return np.log(len(text)+1)

# subjectivity
def get_sentiment(text):
    sent = TextBlob(text)
    polarity = sent.sentiment.polarity
    subjectivity = sent.sentiment.subjectivity
    return polarity, subjectivity



In [9]:

    
apps_df['description'] = apps_df['description'].apply(initial_clean_text)
apps_df['new_version_desc'] = apps_df['new_version_desc'].apply(initial_clean_text)
apps_df['description_charlen'] = apps_df['description'].apply(char_num)
apps_df['description_wordlen'] = apps_df['description'].apply(word_num)
apps_df['new_version_desc_charlen'] = apps_df['new_version_desc'].apply(char_num)
apps_df['new_version_desc_wordlen'] = apps_df['new_version_desc'].apply(word_num)
apps_df['description_polarity'], apps_df['description_subjectivity']  = zip(*apps_df['description'].map(get_sentiment))
apps_df['new_version_desc_polarity'], apps_df['new_version_desc_subjectivity']  = zip(*apps_df['new_version_desc'].map(get_sentiment))



In [10]:

    
apps_df.head().T









    Out[10]:






  
    
      
      0
      1
      2
      3
      4
    
  
  
    
      category
      Finance
      Finance
      Finance
      Finance
      Finance
    
    
      current_rating
      1.8
      4.55556
      4.64497
      2.16667
      4.60773
    
    
      description
      the jpay app lets you send money and email to ...
      splitwise is the best way to share bills and i...
      access your interactive experian credit report...
      use the vanguard app to check your accounts an...
      conveniently manage your credit card account f...
    
    
      id
      584959322
      458023433
      1087101090
      335186209
      1128712763
    
    
      is_InAppPurcased
      0
      0
      1
      0
      0
    
    
      is_multilingual
      0
      0
      0
      0
      0
    
    
      is_multiplatform
      0
      0
      0
      0
      0
    
    
      name
      JPay
      Splitwise - Split bills and expenses the easy way
      Experian - Free Credit Report
      Vanguard
      Credit One Bank Mobile
    
    
      new_version_desc
      this update has a big new feature for the new ...
      starting today users outside the us can invit...
      bug fixes and other minor updates
      see what's new!  have an ira? watch your progr...
      ability add an additional account if qualifie...
    
    
      num_current_rating
      20
      18
      169
      30
      724
    
    
      num_overall_rating
      6.12687
      8.52576
      7.27448
      7.75919
      7.06561
    
    
      overall_rating
      2.5
      4.5
      4.5
      2.5
      4.5
    
    
      price
      Free
      Free
      Free
      Free
      Free
    
    
      publish_date
      2013-01-12 00:00:00
      2011-08-24 00:00:00
      2016-05-16 00:00:00
      2009-10-24 00:00:00
      2016-09-22 00:00:00
    
    
      review1
      And when you were younger and you got like a b...
      I have used a few apps for splitting balance b...
      This is a well thought out app that works real...
      Viewing general account info with Touch ID is ...
      Great Job!  I've been waiting and I'm so glad ...
    
    
      review1_star
      5
      5
      5
      4
      5
    
    
      review2
      What is wrong with this freakin App? It keep s...
      This app has been great for keeping track of e...
      This Experian app helps me measure my credit a...
      The app is very easy to use and has tons of he...
      I am so fed up with this entire company! I'm g...
    
    
      review2_star
      1
      5
      5
      5
      2
    
    
      review3
      Emails are not sent and received in a timely m...
      One of the best App! We have used it for many ...
      I don't exactly like credit bureaus my experie...
      My company used to use Fidelity for our retire...
      I have no complaints, well one but its not a b...
    
    
      review3_star
      1
      5
      5
      1
      5
    
    
      scrape_date
      2017-03-13
      2017-03-13
      2017-03-13
      2017-03-13
      2017-03-13
    
    
      seller
      JPay, Inc.
      None
      CONSUMERINFO.COM, INC.
      The Vanguard Group, Inc.
      Credit One Bank, National Association
    
    
      size
      13.0 MB
      83.0 MB
      58.0 MB
      49.0 MB
      42.8 MB
    
    
      update_date
      2017-01-20 00:00:00
      2017-02-27 00:00:00
      2017-02-22 00:00:00
      2017-01-22 00:00:00
      2017-01-31 00:00:00
    
    
      url
      https://itunes.apple.com/us/app/jpay/id5849593...
      https://itunes.apple.com/us/app/splitwise-spli...
      https://itunes.apple.com/us/app/experian-free-...
      https://itunes.apple.com/us/app/vanguard/id335...
      https://itunes.apple.com/us/app/credit-one-ban...
    
    
      version
      4.7
      4.4.6
      1.6.1
      7.1
      1.4
    
    
      time_span
      7.29302
      7.60837
      5.64545
      7.88156
      4.8828
    
    
      is_free
      1
      1
      1
      1
      1
    
    
      app_size
      2.56495
      4.41884
      4.06044
      3.89182
      3.75654
    
    
      description_charlen
      6.19032
      6.58479
      7.88833
      7.22475
      6.57368
    
    
      description_wordlen
      4.58497
      5.10595
      6.23245
      5.56834
      4.95583
    
    
      new_version_desc_charlen
      5.3845
      6.11589
      3.3673
      5.09375
      4.26268
    
    
      new_version_desc_wordlen
      4.02535
      4.58497
      1.94591
      3.46574
      2.48491
    
    
      description_polarity
      0.271667
      0.232057
      0.212641
      0.0947917
      0.232292
    
    
      description_subjectivity
      0.38
      0.618115
      0.507532
      0.515625
      0.439583
    
    
      new_version_desc_polarity
      0.151136
      0.177604
      -0.0875
      -0.0147727
      0
    
    
      new_version_desc_subjectivity
      0.322348
      0.388542
      0.2875
      0.427273
      0

Modeling

Aftet we clean the data and feature extraction, we are ready for building our machine learning model. Since the target variable current_rating is a continuous variable, it's obviously a regression problem. With the fantastic scikit-learn package, we will use the ElasticNet and RandomForestRegressor models since they are useful for controlling overfitting and have fast computation. First, we will experiment and compare these two models with only the numerical features that we extracted from the last step. Furthermore, we could combine the word representation features into the pipeline with FeatureUnion to test whether we could improve our accuracy.



In [23]:

    
# one-hot encoder for multi-category variable
appcategory_vars = pd.get_dummies(apps_df['category'])
appcategory_columns = appcategory_vars.columns
apps_df = pd.concat([apps_df, appcategory_vars], axis =1)



In [29]:

    
appcategory_columns









    Out[29]:





Index([u'Books', u'Business', u'Catalogs', u'Education', u'Entertainment',
       u'Finance', u'Food & Drink', u'Games', u'Health & Fitness',
       u'Lifestyle', u'Medical', u'Music', u'Navigation', u'News',
       u'Photo & Video', u'Productivity', u'Reference', u'Shopping',
       u'Social Networking', u'Sports', u'Travel', u'Utilities', u'Weather'],
      dtype='object')



In [76]:

    
model_columns = ['overall_rating', 'num_overall_rating', 'is_InAppPurcased', 'is_multilingual', 'is_multiplatform','is_free',\
                  'app_size', 'time_span', 'description_charlen', 'description_wordlen', 'new_version_desc_charlen', 'new_version_desc_wordlen',\
                 'description_polarity', 'new_version_desc_polarity','description_subjectivity','new_version_desc_subjectivity'] + list(appcategory_columns)
X_train, X_test, y_train, y_test = train_test_split(apps_df[model_columns], apps_df['current_rating'], test_size = 0.33, random_state = 42)



In [30]:

    
print X_train.shape, X_test.shape









    



(2981, 39) (1469, 39)

Generalized linear model

We use the build-in cross validation for training ElasticNet.



In [37]:

    
glm = ElasticNetCV(l1_ratio= np.array([0.5, 0.7, .9, .95, .98, 1]), cv = 5)
glm.fit(X_train, y_train)









    Out[37]:





ElasticNetCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
       l1_ratio=array([ 0.5 ,  0.7 ,  0.9 ,  0.95,  0.98,  1.  ]),
       max_iter=1000, n_alphas=100, n_jobs=1, normalize=False,
       positive=False, precompute='auto', random_state=None,
       selection='cyclic', tol=0.0001, verbose=0)



In [38]:

    
print "best lasso ratio: ", glm.l1_ratio_, "in pentalty terms:", glm.alpha_









    



best lasso ratio:  0.5 in pentalty terms: 0.00359257381705



In [50]:

    
def report_metrics(actuals, preds):
    print "mean absolute error: ", metrics.mean_absolute_error(actuals, preds)
    print "mean square error: ", metrics.mean_squared_error(actuals, preds)
    print "median absolute error: ", metrics.median_absolute_error(actuals, preds)
    print "R square: ", metrics.r2_score(actuals, preds)



In [51]:

    
glm_preds = glm.predict(X_test)
report_metrics(y_test, glm_preds)









    



mean absolute error:  0.482777336567
mean square error:  0.436456532731
median absolute error:  0.350115645605
R square:  0.541522061615

RandomForestRegressor



In [64]:

    
rfr = RandomForestRegressor(n_estimators=200, n_jobs=-1, max_depth= 6)
rfr.fit(X_train, y_train)









    Out[64]:





RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)



In [65]:

    
rfr_preds = rfr.predict(X_test)
report_metrics(y_test, rfr_preds)









    



mean absolute error:  0.479319609645
mean square error:  0.432291963247
median absolute error:  0.347903340425
R square:  0.54589675437



In [68]:

    
# grid search
pg = {'max_depth': [4, 6, 8, 10], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 4], 'min_samples_leaf': [1, 3, 5 ]}
grid = GridSearchCV(rfr, param_grid=pg, cv=5)
grid.fit(X_train, y_train)









    Out[68]:





GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 4], 'max_depth': [4, 6, 8, 10], 'min_samples_leaf': [1, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)



In [69]:

    
print "best parameters: ", grid.best_params_, ' with score ', grid.best_score_









    



best parameters:  {'max_features': 'auto', 'min_samples_split': 2, 'max_depth': 4, 'min_samples_leaf': 5}  with score  0.486118126264



In [72]:

    
grfr_preds = grid.best_estimator_.predict(X_test)
report_metrics(y_test, grfr_preds) # a little gains through grid search optimization









    



mean absolute error:  0.4784000569
mean square error:  0.431377974995
median absolute error:  0.345737345399
R square:  0.54685685788



In [79]:

    
#show to plot importances
importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(grid.best_estimator_.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')
importances.plot.bar()









    Out[79]:





<matplotlib.axes._subplots.AxesSubplot at 0x171ff7b8>

Combined word representation features

We know the description usually tells the features of the apps. We can incorporate the raw text data and see whether we can improve our model performance, which could indicate the desciption has some predictive power for user experiences and app quality.



In [89]:

    
# A custom stoplist
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
verb_exp = ['VB', 'VBZ', 'VBP', 'VBD','VBN','VBG']
def clean_tokenize_text(row):
    #remove numbers
    raw = re.sub('[0-9]+?', ' ', row) 
    # remove punctuation
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    raw = regex.sub(' ', raw)
    #clean out the characters left out after the above step, like we’re, I’m, It’s, i.e.
    raw = re.sub('( s )|( re )|( m )|( i e )',' ',raw) 
    # lementize
    row_t = TextBlob(raw)
    tokens = []
    for word, pos in row_t.tags:
        if pos in verb_exp:
            word = Word(word)
            word = word.lemmatize("v")
        else:
            word = Word(word)
            word = word.lemmatize() |
        if word not in STOPLIST:
            tokens.append(word)# remove stop words     
    return(tokens)

class ColumnSelector(TransformerMixin):
    """
    Class for building sklearn Pipeline step. This class should be used to select a column from a pandas data frame.
    """
    def __init__(self, columns):
        self.columns = columns

    def fit(self, x, y=None):
        return self

    def transform(self, data_frame):
        return data_frame[self.columns]



In [112]:

    
new_columns = model_columns+['description']
X_train, X_test, y_train, y_test = train_test_split(apps_df[new_columns], apps_df['current_rating'], test_size = 0.33, random_state = 42)



In [121]:

    
random.seed(27)
vectorizer = TfidfVectorizer(tokenizer=clean_tokenize_text, min_df=0.03, max_features = 20000, ngram_range = (1, 2), )#2 grams
select_columns = ColumnSelector(columns = model_columns)
rfr_model =  RandomForestRegressor(n_estimators=300, n_jobs=-1, max_depth= 6, min_samples_leaf= 5, min_samples_split = 2)



In [125]:

    
get_wordvec = Pipeline([('select_desc',ColumnSelector(['description'])), ('word_vec', vectorizer)])
ml_pipe = Pipeline([
    ('features', FeatureUnion([('wordvec_features', get_wordvec),
                               ('numerical_features', select_columns)                               
                                ])),
    ('rf_regressor', rfr_model)
 ])



In [123]:

    
get_wordvec.fit(X_train, y_train)









    Out[123]:





Pipeline(steps=[('select_desc', <__main__.ColumnSelector object at 0x000000001D60A518>), ('word_vec', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=20000, min_df...nizer=<function clean_tokenize_text at 0x0000000019FDDCF8>,
        use_idf=True, vocabulary=None))])



In [124]:

    
get_wordvec.transform(X_test)









    Out[124]:





<1x1 sparse matrix of type '<type 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

	0	1	2	3	4
category	Finance	Finance	Finance	Finance	Finance
current_rating	1.8	4.55556	4.64497	2.16667	4.60773
description	the jpay app lets you send money and email to ...	splitwise is the best way to share bills and i...	access your interactive experian credit report...	use the vanguard app to check your accounts an...	conveniently manage your credit card account f...
id	584959322	458023433	1087101090	335186209	1128712763
is_InAppPurcased	0	0	1	0	0
is_multilingual	0	0	0	0	0
is_multiplatform	0	0	0	0	0
name	JPay	Splitwise - Split bills and expenses the easy way	Experian - Free Credit Report	Vanguard	Credit One Bank Mobile
new_version_desc	this update has a big new feature for the new ...	starting today users outside the us can invit...	bug fixes and other minor updates	see what's new! have an ira? watch your progr...	ability add an additional account if qualifie...
num_current_rating	20	18	169	30	724
num_overall_rating	6.12687	8.52576	7.27448	7.75919	7.06561
overall_rating	2.5	4.5	4.5	2.5	4.5
price	Free	Free	Free	Free	Free
publish_date	2013-01-12 00:00:00	2011-08-24 00:00:00	2016-05-16 00:00:00	2009-10-24 00:00:00	2016-09-22 00:00:00
review1	And when you were younger and you got like a b...	I have used a few apps for splitting balance b...	This is a well thought out app that works real...	Viewing general account info with Touch ID is ...	Great Job! I've been waiting and I'm so glad ...
review1_star	5	5	5	4	5
review2	What is wrong with this freakin App? It keep s...	This app has been great for keeping track of e...	This Experian app helps me measure my credit a...	The app is very easy to use and has tons of he...	I am so fed up with this entire company! I'm g...
review2_star	1	5	5	5	2
review3	Emails are not sent and received in a timely m...	One of the best App! We have used it for many ...	I don't exactly like credit bureaus my experie...	My company used to use Fidelity for our retire...	I have no complaints, well one but its not a b...
review3_star	1	5	5	1	5
scrape_date	2017-03-13	2017-03-13	2017-03-13	2017-03-13	2017-03-13
seller	JPay, Inc.	None	CONSUMERINFO.COM, INC.	The Vanguard Group, Inc.	Credit One Bank, National Association
size	13.0 MB	83.0 MB	58.0 MB	49.0 MB	42.8 MB
update_date	2017-01-20 00:00:00	2017-02-27 00:00:00	2017-02-22 00:00:00	2017-01-22 00:00:00	2017-01-31 00:00:00
url	https://itunes.apple.com/us/app/jpay/id5849593...	https://itunes.apple.com/us/app/splitwise-spli...	https://itunes.apple.com/us/app/experian-free-...	https://itunes.apple.com/us/app/vanguard/id335...	https://itunes.apple.com/us/app/credit-one-ban...
version	4.7	4.4.6	1.6.1	7.1	1.4
time_span	7.29302	7.60837	5.64545	7.88156	4.8828
is_free	1	1	1	1	1
app_size	2.56495	4.41884	4.06044	3.89182	3.75654
description_charlen	6.19032	6.58479	7.88833	7.22475	6.57368
description_wordlen	4.58497	5.10595	6.23245	5.56834	4.95583
new_version_desc_charlen	5.3845	6.11589	3.3673	5.09375	4.26268
new_version_desc_wordlen	4.02535	4.58497	1.94591	3.46574	2.48491
description_polarity	0.271667	0.232057	0.212641	0.0947917	0.232292
description_subjectivity	0.38	0.618115	0.507532	0.515625	0.439583
new_version_desc_polarity	0.151136	0.177604	-0.0875	-0.0147727	0
new_version_desc_subjectivity	0.322348	0.388542	0.2875	0.427273	0