In [97]:
# for data manipulation
import numpy as np
import pandas as pd
import re
import string
import random
import datetime
import sys
import matplotlib.pyplot as plt
%matplotlib inline
# for MongoDB connection
import pymongo
# for statistical hypothesis testing
import scipy.stats
# for NLP & machine learning
from textblob import TextBlob
from textblob import Word
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
In [2]:
# for interactive plotting
import plotly.plotly as py
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
cf.set_config_file(offline=True)
In [3]:
def read_mongo(collection, query={}, host='localhost', port=27017, username=None, password=None, no_id=True):
    """ Read from Mongo and store the result in a DataFrame """
    # connect to MongoDB and query the given collection
    with pymongo.MongoClient(host, port) as client:
        table = client.appstore[collection]
        df = pd.DataFrame(list(table.find(query)))
    # delete the MongoDB _id column
    if no_id:
        del df['_id']
    return df
As we know, new versions and updates are common and important: they let developers improve their apps' quality and collect feedback. We would like to build a model that predicts an app's rating when a new version is released. This could help developers get a better sense of their product, and it could also be useful for a recommendation system when ratings for new apps are not yet available. First, we need to collect data that contains both the current rating and the overall rating for model training and testing.
In [4]:
apps_df = read_mongo('appitems', query = {'$and': [{'current_rating':{"$ne":None}}, {'overall_rating': {"$ne": None}}]})
In [5]:
apps_df.shape
Out[5]:
Next, we need to extract some useful features from the dataset. Note that our goal is to predict the new-version rating, so it does not make sense to use features from the period after the new version is released; we therefore cannot use features such as user reviews or the number of current ratings. We will use the following features:
App characteristics
Text features from descriptions
In [6]:
# datetime manipulation
apps_df['update_date'] = pd.to_datetime(apps_df['update_date'])
apps_df['publish_date'] = pd.to_datetime(apps_df['publish_date'].str.split(' ').str[0])
apps_df['time_span'] = np.log((apps_df['update_date'] - apps_df['publish_date']).dt.days + 1)
In [7]:
# string object manipulation
rating_cleaned = {'1 star':1, "1 and a half stars": 1.5, '2 stars': 2, '2 and a half stars':2.5, "3 stars":3, "3 and a half stars":3.5, "4 stars": 4,
'4 and a half stars': 4.5, "5 stars": 5}
apps_df.overall_rating = apps_df.overall_rating.replace(rating_cleaned)
apps_df['is_free'] = apps_df['price'].apply(lambda x: int(x=='Free'))
apps_df['app_size']= apps_df['size'].str.split(' ').apply(lambda x: np.log(float(x[0])))
apps_df['num_overall_rating'] = apps_df['num_overall_rating'].apply(lambda x: np.log(x))
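A quick sanity check on the cleaned columns (not in the original notebook, just a minimal sketch) can catch leftover string labels or a log(0) from an app with zero ratings:
# overall_rating should now be numeric, with no textual labels left behind
assert apps_df['overall_rating'].dtype.kind in 'fi'
# the log-transformed columns should be finite; a -inf here would point to a zero count or size
assert np.isfinite(apps_df[['app_size', 'num_overall_rating', 'overall_rating']]).all().all()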
In [8]:
# helper function for NLP
def initial_clean_text(raw):
    # use the decode method to convert to ascii (textblob prefers ascii)
    raw = raw.decode('ascii', errors="replace")
    # remove links
    raw_no_link = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', raw)
    # remove email addresses
    no_link_email = re.sub(r'([\w\.-]+)?@[\w\.-]+([\.][com|org|ch|uk]{2,3})?', "", raw_no_link)
    # remove whitespace & special characters
    tab_text = '\t\n\r\x0b\x0c*-'
    no_link_email_space = "".join([ch for ch in no_link_email if ch not in set(tab_text)])
    row = no_link_email_space.lower()
    return row
# character count (log scale)
def char_num(text, ignore_spaces=True):
    if ignore_spaces:
        text = text.replace(" ", "")
    text = "".join(x for x in text if x not in set(string.punctuation))
    return np.log(len(text) + 1)

# word count (log scale)
def word_num(text):
    text = "".join(x for x in text if x not in set(string.punctuation))
    text = word_tokenize(text)
    return np.log(len(text) + 1)

# sentiment polarity & subjectivity
def get_sentiment(text):
    sent = TextBlob(text)
    polarity = sent.sentiment.polarity
    subjectivity = sent.sentiment.subjectivity
    return polarity, subjectivity
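To see what these helpers produce, here is a quick illustration on a made-up snippet; the exact numbers are indicative only:
sample = initial_clean_text("Track your runs and share results at https://example.com today!")
print char_num(sample), word_num(sample)  # log-scaled character and word counts
print get_sentiment(sample)               # (polarity, subjectivity) from TextBlob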
In [9]:
apps_df['description'] = apps_df['description'].apply(initial_clean_text)
apps_df['new_version_desc'] = apps_df['new_version_desc'].apply(initial_clean_text)
apps_df['description_charlen'] = apps_df['description'].apply(char_num)
apps_df['description_wordlen'] = apps_df['description'].apply(word_num)
apps_df['new_version_desc_charlen'] = apps_df['new_version_desc'].apply(char_num)
apps_df['new_version_desc_wordlen'] = apps_df['new_version_desc'].apply(word_num)
apps_df['description_polarity'], apps_df['description_subjectivity'] = zip(*apps_df['description'].map(get_sentiment))
apps_df['new_version_desc_polarity'], apps_df['new_version_desc_subjectivity'] = zip(*apps_df['new_version_desc'].map(get_sentiment))
In [10]:
apps_df.head().T
Out[10]:
After cleaning the data and extracting features, we are ready to build our machine learning model. Since the target variable current_rating is continuous, this is a regression problem. With the scikit-learn package, we will use the ElasticNet and RandomForestRegressor models, since they are useful for controlling overfitting and are fast to compute. First, we experiment with and compare these two models using only the numerical features extracted in the last step. Then we combine the word-representation features into the pipeline with FeatureUnion to test whether they improve our accuracy.
In [23]:
# one-hot encoder for multi-category variable
appcategory_vars = pd.get_dummies(apps_df['category'])
appcategory_columns = appcategory_vars.columns
apps_df = pd.concat([apps_df, appcategory_vars], axis =1)
In [29]:
appcategory_columns
Out[29]:
In [76]:
model_columns = ['overall_rating', 'num_overall_rating', 'is_InAppPurcased', 'is_multilingual', 'is_multiplatform','is_free',\
'app_size', 'time_span', 'description_charlen', 'description_wordlen', 'new_version_desc_charlen', 'new_version_desc_wordlen',\
'description_polarity', 'new_version_desc_polarity','description_subjectivity','new_version_desc_subjectivity'] + list(appcategory_columns)
X_train, X_test, y_train, y_test = train_test_split(apps_df[model_columns], apps_df['current_rating'], test_size = 0.33, random_state = 42)
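Since leakage is the main concern here, a cheap guard is to assert that the target and any post-release field stay out of the predictor list; a small sketch (the extra column names are illustrative and may not exist in this dataset):
# the target itself must not appear among the predictors
assert 'current_rating' not in model_columns
# illustrative post-release fields that would leak information if present
for leaky in ['num_current_rating', 'reviews']:
    assert leaky not in model_columns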
In [30]:
print X_train.shape, X_test.shape
We use the built-in cross-validation (ElasticNetCV) to train the ElasticNet model.
In [37]:
glm = ElasticNetCV(l1_ratio= np.array([0.5, 0.7, .9, .95, .98, 1]), cv = 5)
glm.fit(X_train, y_train)
Out[37]:
In [38]:
print "best lasso ratio: ", glm.l1_ratio_, "in pentalty terms:", glm.alpha_
In [50]:
def report_metrics(actuals, preds):
    print "mean absolute error: ", metrics.mean_absolute_error(actuals, preds)
    print "mean squared error: ", metrics.mean_squared_error(actuals, preds)
    print "median absolute error: ", metrics.median_absolute_error(actuals, preds)
    print "R squared: ", metrics.r2_score(actuals, preds)
In [51]:
glm_preds = glm.predict(X_test)
report_metrics(y_test, glm_preds)
In [64]:
rfr = RandomForestRegressor(n_estimators=200, n_jobs=-1, max_depth= 6)
rfr.fit(X_train, y_train)
Out[64]:
In [65]:
rfr_preds = rfr.predict(X_test)
report_metrics(y_test, rfr_preds)
In [68]:
# grid search
pg = {'max_depth': [4, 6, 8, 10], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 4], 'min_samples_leaf': [1, 3, 5 ]}
grid = GridSearchCV(rfr, param_grid=pg, cv=5)
grid.fit(X_train, y_train)
Out[68]:
In [69]:
print "best parameters: ", grid.best_params_, ' with score ', grid.best_score_
In [72]:
grfr_preds = grid.best_estimator_.predict(X_test)
report_metrics(y_test, grfr_preds)  # small gains from the grid-search optimization
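If you want to see how sensitive the score is to each setting, the full cross-validation table is available on the fitted grid object; for example:
cv_results = pd.DataFrame(grid.cv_results_)
top_settings = cv_results.sort_values('mean_test_score', ascending=False)
print top_settings[['param_max_depth', 'param_min_samples_leaf', 'mean_test_score']].head()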
In [79]:
# plot the feature importances of the tuned random forest
importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(grid.best_estimator_.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')
importances.plot.bar()
Out[79]:
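For a quick textual view of the same ranking, the top of the importance table can simply be printed:
print importances.head(10)  # ten most important features by impurity-based importance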
We know the description usually describes the features of an app. We can incorporate the raw text data and see whether it improves model performance, which would indicate that the description has some predictive power for user experience and app quality.
In [89]:
# A custom stoplist
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
verb_exp = ['VB', 'VBZ', 'VBP', 'VBD','VBN','VBG']
def clean_tokenize_text(row):
    # remove numbers
    raw = re.sub('[0-9]+?', ' ', row)
    # remove punctuation
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    raw = regex.sub(' ', raw)
    # clean out the fragments left by the step above, e.g. we're -> we re, i.e. -> i e
    raw = re.sub('( s )|( re )|( m )|( i e )', ' ', raw)
    # lemmatize: verbs with the "v" part-of-speech hint, everything else with the default
    row_t = TextBlob(raw)
    tokens = []
    for word, pos in row_t.tags:
        if pos in verb_exp:
            word = Word(word).lemmatize("v")
        else:
            word = Word(word).lemmatize()
        # remove stop words
        if word not in STOPLIST:
            tokens.append(word)
    return tokens
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that selects a subset of columns from a pandas DataFrame.
    """
    def __init__(self, columns):
        self.columns = columns

    def fit(self, x, y=None):
        return self

    def transform(self, data_frame):
        return data_frame[self.columns]
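To make the tokenizer's behaviour concrete, here is what it does to a short made-up description; the exact tokens depend on TextBlob's part-of-speech tagger, so treat the output as roughly indicative:
print clean_tokenize_text("we're adding 3 new levels and improved the game's performance")
# roughly: ['add', 'new', 'level', 'improve', 'game', 'performance']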
In [112]:
new_columns = model_columns+['description']
X_train, X_test, y_train, y_test = train_test_split(apps_df[new_columns], apps_df['current_rating'], test_size = 0.33, random_state = 42)
In [121]:
random.seed(27)
vectorizer = TfidfVectorizer(tokenizer=clean_tokenize_text, min_df=0.03, max_features=20000, ngram_range=(1, 2))  # unigrams and bigrams
select_columns = ColumnSelector(columns = model_columns)
rfr_model = RandomForestRegressor(n_estimators=300, n_jobs=-1, max_depth= 6, min_samples_leaf= 5, min_samples_split = 2)
In [125]:
get_wordvec = Pipeline([('select_desc', ColumnSelector(['description'])), ('word_vec', vectorizer)])
ml_pipe = Pipeline([
    ('features', FeatureUnion([
        ('wordvec_features', get_wordvec),
        ('numerical_features', select_columns)
    ])),
    ('rf_regressor', rfr_model)
])
In [123]:
get_wordvec.fit(X_train, y_train)
Out[123]:
In [124]:
get_wordvec.transform(X_test)
Out[124]:
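The combined pipeline defined above can then be fit and scored in the same way as the numerical-only models; a sketch of that final step, using the same train/test split:
ml_pipe.fit(X_train, y_train)
pipe_preds = ml_pipe.predict(X_test)
report_metrics(y_test, pipe_preds)  # compare against the numerical-feature-only results above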