In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
plt.style.use('ggplot')
%matplotlib inline
In [2]:
app = pd.read_pickle('app_cleaned.pickle')
In [3]:
app = app.drop_duplicates()
In [4]:
app = app.dropna(axis=0)  # drop rows containing NaN values
In [5]:
app.head()
Out[5]:
In [8]:
ratio = app['num_current_rating'] / app['num_overall_rating']  # share of ratings contributed by the current version
In [9]:
# use a histogram to show the distribution of the ratio
plt.hist(ratio, bins=20, alpha=.4, label='ratio')
plt.legend()
plt.show()
According to the histogram, most of the ratios fall below 0.2.
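The visual reading can be checked numerically; a quick sketch, assuming the ratio series computed above:
In [ ]:
# share of apps whose current/overall rating-count ratio is below 0.2
print('share below 0.2: {:.1%}'.format((ratio < 0.2).mean()))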
In [10]:
index = ratio > 0.05  # boolean mask for apps whose ratio exceeds 0.05
In [11]:
appfilter = app.loc[index]  # keep apps whose current-rating count exceeds 5% of the overall-rating count
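It is worth knowing how aggressive the 0.05 cutoff is; a minimal check using the mask and DataFrame defined above:
In [ ]:
# count how many apps survive the ratio filter
print('{} of {} apps kept'.format(index.sum(), len(app)))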
In [12]:
# use a histogram to show the distribution of current_rating - overall_rating
plt.hist(appfilter['current_rating'] - appfilter['overall_rating'], bins=20, alpha=.4, label='diff')
plt.legend()
plt.show()
In [13]:
diff = appfilter['current_rating'] - appfilter['overall_rating']
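Summary statistics complement the histogram; a quick check on the diff series just defined:
In [ ]:
# five-number summary of the current-minus-overall rating difference
diff.describe()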
In [14]:
index2 = diff >= 0.1    # mask for apps whose current rating improved by at least 0.1
index2b = diff <= -0.1  # mask for apps whose current rating dropped by at least 0.1
In [15]:
app_improved = appfilter.loc[index2]
app_declined = appfilter.loc[index2b]
In [16]:
nvd = app_improved['new_version_desc']   # release notes of improved apps
nvdd = app_declined['new_version_desc']  # release notes of declined apps
In [17]:
# compile the release notes into plain lists of documents
doc_complete = nvd.tolist()
doc_complete2 = nvdd.tolist()
In [18]:
# clean the documents: tokenize, stem, and remove stop words
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string

stemmer = PorterStemmer().stem
tokenize = nltk.word_tokenize
# English stop words, punctuation, and corpus-specific boilerplate (mostly pre-stemmed release-note terms)
stop = (stopwords.words('english') + list(string.punctuation)
        + ['we', 'new', 'fix', 'io', 'updat', 'improv', 'bug',
           'app', 'featur', 'perform', 'ad', "'s", '--', 'us',
           'minor', 'support', 'iphon', 'issu', 'add', 'enhanc',
           'user', 'pleas', '10', '7', 'experi', 'thank',
           'version', 'screen', "''", '2', '6', 'icon',
           'stabil', 'review', '5', '``'])

def stem(tokens, stemmer=stemmer):
    # stem each non-stop token, then filter again so stemmed forms in the stop list are dropped too
    stemwords = [stemmer(w.lower()) for w in tokens if w not in stop]
    return [w for w in stemwords if w not in stop]

def lemmatize(text):
    return stem(tokenize(text))
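A quick sanity check on the cleaning pipeline; the sample sentence below is made up for illustration:
In [ ]:
# boilerplate terms such as 'updat', 'fix', and 'improv' should be stripped, leaving content-bearing stems
lemmatize('This update fixes several bugs and improves the login interface')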
In [19]:
doc_clean = [lemmatize(doc) for doc in doc_complete]
doc_clean2 = [lemmatize(doc) for doc in doc_complete2]
In [20]:
# Importing Gensim
import gensim
from gensim import corpora
# Create the term dictionary of the corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)
dictionary2 = corpora.Dictionary(doc_clean2)
# Convert each list of documents (corpus) into a document-term matrix using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
doc_term_matrix2 = [dictionary2.doc2bow(doc) for doc in doc_clean2]
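To see what one bag-of-words row looks like, term ids can be mapped back to tokens; a small sketch, assuming the first cleaned document is non-empty:
In [ ]:
# map term ids back to tokens for the first improved-app description
[(dictionary[term_id], count) for term_id, count in doc_term_matrix[0]]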
In [21]:
# Create the LDA model class handle from gensim
Lda = gensim.models.ldamodel.LdaModel
# Run and train an LDA model on each document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word=dictionary, passes=50)
ldamodel2 = Lda(doc_term_matrix2, num_topics=3, id2word=dictionary2, passes=50)
In [22]:
print(ldamodel.print_topics(num_topics=3, num_words=3))
print(ldamodel2.print_topics(num_topics=3, num_words=3))
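The choice of three topics is not validated here; topic coherence is one common check, sketched below with gensim's CoherenceModel (c_v coherence, higher is generally better):
In [ ]:
from gensim.models import CoherenceModel
# c_v coherence of the improved-apps model over its own cleaned texts
cm = CoherenceModel(model=ldamodel, texts=doc_clean, dictionary=dictionary, coherence='c_v')
print(cm.get_coherence())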
Improved apps
In [185]:
# mark descriptions that contain the stemmed token 'interfac'
index_interfac = ['interfac' in doc for doc in doc_clean]
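The same keyword-mask pattern recurs in the cells below; a small helper (the name has_token is hypothetical) would condense the repetition:
In [ ]:
def has_token(docs, token):
    # boolean mask: True where a cleaned document contains the given stemmed token
    return [token in doc for doc in docs]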
In [187]:
nvd[index_interfac][1342]
Out[187]:
In [188]:
# mark descriptions that contain the token 'feedback'
index_feedback = ['feedback' in doc for doc in doc_clean]
In [190]:
nvd[index_feedback][193]
Out[190]:
In [192]:
# mark descriptions that contain the token 'store'
index_store = ['store' in doc for doc in doc_clean]
In [241]:
nvd[index_store][1024]
Out[241]:
Patterns among improved apps:
1. improvements to the interface
2. requests for user feedback
3. requests for reviews on the App Store
Worsened apps
In [214]:
# mark descriptions that contain the token 'ipad'
index_ipad = ['ipad' in doc for doc in doc_clean2]
In [220]:
nvdd[index_ipad][1373]
Out[220]:
In [222]:
# mark descriptions that contain the token 'music'
index_music = ['music' in doc for doc in doc_clean2]
In [232]:
nvdd[index_music][2157]
Out[232]:
In [234]:
# mark descriptions that contain the token 'card'
index_card = ['card' in doc for doc in doc_clean2]
In [239]:
nvdd[index_card][646]
Out[239]:
Patterns among worsened apps:
1. added features in the iPad version
2. added features related to music functionality
3. apps designed around cards
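How often these keywords actually occur in each corpus can be tallied directly; a rough sketch using the cleaned documents from above:
In [ ]:
# per-keyword document frequency in the improved vs. worsened corpora
for token in ['interfac', 'feedback', 'store', 'ipad', 'music', 'card']:
    n_up = sum(token in doc for doc in doc_clean)
    n_down = sum(token in doc for doc in doc_clean2)
    print('{:9s} improved: {:4d}   worsened: {:4d}'.format(token, n_up, n_down))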
In [24]:
import pyLDAvis.gensim
In [25]:
pyLDAvis.enable_notebook()
dec_improv = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
dec_decrea = pyLDAvis.gensim.prepare(ldamodel2, doc_term_matrix2, dictionary2)
In [26]:
dec_improv
Out[26]:
In [27]:
pyLDAvis.save_html(dec_improv,'improved_apps.html')
In [28]:
dec_decrea
Out[28]:
In [28]:
pyLDAvis.save_html(dec_decrea,'worsen_apps.html')