In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
plt.style.use('ggplot')
%matplotlib inline

In [2]:
app = pd.read_pickle('app_cleaned.pickle')

In [3]:
app = app.drop_duplicates()

In [4]:
app = app.dropna(axis = 0)#remove the NAN

In [5]:
app.head()


Out[5]:
category current_rating description id is_InAppPurcased is_multilingual is_multiplatform name new_version_desc num_current_rating ... review2 review2_star review3 review3_star scrape_date seller size update_date url version
0 Finance 1.80000 The JPay App lets you send money and email to ... 584959322 0 0 0 JPay This update has a big new feature for the new ... 20.0 ... What is wrong with this freakin App? It keep s... 1.0 Emails are not sent and received in a timely m... 1.0 2017-03-13 JPay, Inc. 13.0 MB Jan 20, 2017 https://itunes.apple.com/us/app/jpay/id5849593... 4.7
2 Finance 4.64497 Access your interactive Experian Credit Report... 1087101090 1 0 0 Experian - Free Credit Report Bug fixes and other minor updates 169.0 ... This Experian app helps me measure my credit a... 5.0 I don't exactly like credit bureaus my experie... 5.0 2017-03-13 CONSUMERINFO.COM, INC. 58.0 MB Feb 22, 2017 https://itunes.apple.com/us/app/experian-free-... 1.6.1
3 Finance 2.16667 Use the Vanguard app to check your accounts an... 335186209 0 0 0 Vanguard See what's new! Have an IRA? Watch your progr... 30.0 ... The app is very easy to use and has tons of he... 5.0 My company used to use Fidelity for our retire... 1.0 2017-03-13 The Vanguard Group, Inc. 49.0 MB Jan 22, 2017 https://itunes.apple.com/us/app/vanguard/id335... 7.1
4 Finance 4.60773 Conveniently manage your credit card account f... 1128712763 0 0 0 Credit One Bank Mobile - Ability add an additional account if qualifi... 724.0 ... I am so fed up with this entire company! I'm g... 2.0 I have no complaints, well one but its not a b... 5.0 2017-03-13 Credit One Bank, National Association 42.8 MB Jan 31, 2017 https://itunes.apple.com/us/app/credit-one-ban... 1.4
5 Finance 4.34066 Be the first to know about news and market mov... 552799694 1 0 0 Seeking Alpha Portfolio Performance improvements 91.0 ... SA offers a lot of information supplied by art... 5.0 I'm new to learning about stocks and managing ... 5.0 2017-03-13 Seeking Alpha Ltd. 46.2 MB Feb 14, 2017 https://itunes.apple.com/us/app/seeking-alpha-... 3.3.7

5 rows × 26 columns


In [8]:
ratio = app['num_current_rating']/app['num_overall_rating']

In [9]:
#use histogram to show the range of ratio
plt.hist(ratio,bins = 20, alpha = .4, label = 'ratio')
plt.legend()
plt.show()


according to the histogram, the ratios are mainly under 0.2.


In [10]:
index = ratio>0.05#get the index of ratio larger than 0.05

In [11]:
appfilter = app.loc[index]#filter the apps which number of current rating over number of overall rating larger than 0.1

In [12]:
#use histogram to show the range of current_rating-overall_rating
plt.hist(appfilter['current_rating']-appfilter['overall_rating'],bins = 20, alpha = .4, label = 'diff')
plt.legend()
plt.show()



In [13]:
diff = appfilter['current_rating']-appfilter['overall_rating']

In [14]:
index2 = diff>=0.1#get the index of the difference larger than 0.1
index2b = diff<= -0.1#get the index of the difference smaller than -0.1

In [15]:
appinprove = appfilter.loc[index2]
appdecrease = appfilter.loc[index2b]

In [16]:
nvd = appinprove['new_version_desc']
nvdd = appdecrease['new_version_desc']

In [17]:
#compile documents
doc_complete = nvd.tolist()
doc_complete2 = nvdd.tolist()

In [18]:
#clean doc
import nltk
from nltk import corpus
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
stemmer = PorterStemmer().stem
tokenize = nltk.word_tokenize
stop = stopwords.words('english')+list(string.punctuation)+['we','new','fix','io','updat','improv','bug',
                                                            'app','featur','perform','ad',"\'s","--","us"
                                                            ,"minor","support","iphon","issu","add","enhanc",
                                                           "user","pleas","10","7","experi","thank",
                                                           "version","experi","screen","\'\'","2","6","icon",
                                                           "stabil","review","5","``"]
def stem(tokens,stemmer = PorterStemmer().stem):
    stemwords = [stemmer(w.lower()) for w in tokens if w not in stop]
    return [w for w in stemwords if w not in stop]
def lemmatize(text):
    return stem(tokenize(text))

In [19]:
doc_clean = [lemmatize(doc) for doc in doc_complete]
doc_clean2 = [lemmatize(doc) for doc in doc_complete2]

In [20]:
# Importing Gensim
import gensim
from gensim import corpora

# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_clean)
dictionary2 = corpora.Dictionary(doc_clean2)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
doc_term_matrix2 = [dictionary2.doc2bow(doc) for doc in doc_clean2]

In [21]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)
ldamodel2 = Lda(doc_term_matrix2, num_topics=3, id2word = dictionary2, passes=50)

In [22]:
print(ldamodel.print_topics(num_topics=3, num_words=3))
print(ldamodel2.print_topics(num_topics=3, num_words=3))


[(0, u'0.008*"store" + 0.007*"make" + 0.007*"help"'), (1, u'0.006*"feedback" + 0.006*"thi" + 0.005*"crash"'), (2, u'0.010*"includ" + 0.008*"crash" + 0.006*"use"')]
[(0, u'0.007*"includ" + 0.007*"-bug" + 0.006*"locat"'), (1, u'0.008*"help" + 0.006*"play" + 0.006*"ipad"'), (2, u'0.006*"crash" + 0.006*"use" + 0.006*"share"')]

Improved app


In [185]:
index_interfac = []
for i in range(len(doc_clean)):
    if 'interfac' in doc_clean[i]:
        index_interfac.append(True)
    else:
        index_interfac.append(False)

In [187]:
nvd[index_interfac][1342]


Out[187]:
u'Minor Interface Improvements'

In [188]:
index_feedback = []
for i in range(len(doc_clean)):
    if 'feedback' in doc_clean[i]:
        index_feedback.append(True)
    else:
        index_feedback.append(False)

In [190]:
nvd[index_feedback][193]


Out[190]:
u"- New truth or dare cards - Fixed bugs & improved design If you have any feedback or suggestions, tweet us @truthdareapp! We'd love to hear from you."

In [192]:
index_store = []
for i in range(len(doc_clean)):
    if 'store' in doc_clean[i]:
        index_store.append(True)
    else:
        index_store.append(False)

In [241]:
nvd[index_store][1024]


Out[241]:
u'- Bug fixes If you enjoy our apps, please leave us a review on the App Store. Reviews help us stay motivated to keep the updates coming. Thanks!'

improved pattern:

1.some improvements on interface

2.ask for feedbacks

3.ask for reviews on app store

Worsen app


In [214]:
index_ipad = []
for i in range(len(doc_clean2)):
    if 'ipad' in doc_clean2[i]:
        index_ipad.append(True)
    else:
        index_ipad.append(False)

In [220]:
nvdd[index_ipad][1373]


Out[220]:
u'Holy Quran Arabic text with side by side English translation by Maulvi Sher Ali. Retina display support for iPhone 4+. Universal app with iPad support. Swipe slides the pages left and right accordingly.'

In [222]:
index_music = []
for i in range(len(doc_clean2)):
    if 'music' in doc_clean2[i]:
        index_music.append(True)
    else:
        index_music.append(False)

In [232]:
nvdd[index_music][2157]


Out[232]:
u'- Introducing Music Share! After a quick over-the-air update for your headphones, you can stream music or audio with a friend between two Bose headphones at the same time. - Bug fixes and improvements'

In [234]:
index_card = []
for i in range(len(doc_clean2)):
    if 'card' in doc_clean2[i]:
        index_card.append(True)
    else:
        index_card.append(False)

In [239]:
nvdd[index_card][646]


Out[239]:
u'Based on your feedback weve made some enhancements and fixed bugs to help improve your experience.  Added swipe gesture to access the digital card from account summary. (Digital Card is available on most CareCredit accounts.)  Improvements on performance and stability. *We have heard some CareCredit customers are experiencing issues logging into the CareCredit App. Please know that our team is aware and we are looking for the best solution. We will be making regular updates to the app to improve your experience.'

woren pattern:

1.add more features on ipad version

2.add more features related to music function

3.the apps are designed for cards


In [24]:
import pyLDAvis.gensim

In [25]:
pyLDAvis.enable_notebook()
dec_improv = pyLDAvis.gensim.prepare(ldamodel,doc_term_matrix, dictionary)
dec_decrea = pyLDAvis.gensim.prepare(ldamodel2,doc_term_matrix2, dictionary2)

In [26]:
dec_improv


Out[26]:

In [27]:
pyLDAvis.save_html(dec_improv,'improved_apps.html')

In [28]:
dec_decrea


Out[28]:

In [28]:
pyLDAvis.save_html(dec_decrea,'worsen_apps.html')

In [ ]: