In [2]:
import graphlab
In [3]:
import graphlab as gl
from IPython.display import display
from IPython.display import Image
gl.canvas.set_target('ipynb')
In [5]:
traindata_path = "/Users/zhangyixin/Desktop/cjc2016-gh-pages/labeledTrainData.tsv"
testdata_path = "/Users/zhangyixin/Desktop/cjc2016-gh-pages/testData.tsv"
In [6]:
import graphlab as gl
graphlab.product_key.set_product_key('7D9A-5351-5A47-786A-941D-38C6-2885-46EA')
train_data = gl.SFrame.read_csv(traindata_path, header=True,
                                delimiter='\t', quote_char='"',
                                column_type_hints={'id': str,
                                                   'sentiment': int,
                                                   'review': str})
train_data['1grams features'] = gl.text_analytics.count_ngrams(
    train_data['review'], 1)
train_data['2grams features'] = gl.text_analytics.count_ngrams(
    train_data['review'], 2)
cls = gl.classifier.create(train_data, target='sentiment',
                           features=['1grams features', '2grams features'])
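Training on all 25,000 labeled reviews can take a while, so it may be worth persisting the fitted classifier instead of retraining it in later sessions. A minimal sketch using GraphLab Create's model persistence; the save path here is hypothetical:
# save the trained classifier to disk and load it back later (hypothetical path)
cls.save('/Users/zhangyixin/Desktop/cjc2016-gh-pages/sentiment_cls.model')
cls = gl.load_model('/Users/zhangyixin/Desktop/cjc2016-gh-pages/sentiment_cls.model')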
In [7]:
movies_reviews_data = gl.SFrame.read_csv(traindata_path, header=True, delimiter='\t', quote_char='"',
                                         column_type_hints={'id': str, 'sentiment': str, 'review': str})
In [8]:
movies_reviews_data.show()
In [9]:
movies_reviews_data['1grams features'] = gl.text_analytics.count_ngrams(movies_reviews_data['review'], 1)
In [10]:
movies_reviews_data.show(['review','1grams features'])
In [11]:
train_set, test_set = movies_reviews_data.random_split(0.8, seed=5)
In [12]:
model_1 = gl.classifier.create(train_set, target='sentiment', features=['1grams features'])
In [13]:
result1 = model_1.evaluate(test_set)
In [14]:
def print_statistics(result):
    print "*" * 30
    print "Accuracy : ", result["accuracy"]
    print "Confusion Matrix: \n", result["confusion_matrix"]
print_statistics(result1)
In [15]:
movies_reviews_data['2grams features'] = gl.text_analytics.count_ngrams(movies_reviews_data['review'],2)
In [16]:
train_set, test_set = movies_reviews_data.random_split(0.8, seed=5)
model_2 = gl.classifier.create(train_set, target='sentiment', features=['1grams features','2grams features'])
result2 = model_2.evaluate(test_set)
print_statistics(result2)
In [17]:
traindata_path = "/Users/zhangyixin/Desktop/cjc2016-gh-pages/labeledTrainData.tsv"
testdata_path = "/Users/zhangyixin/Desktop/cjc2016-gh-pages/testData.tsv"
# creating the classifier using all 25,000 reviews
train_data = gl.SFrame.read_csv(traindata_path, header=True, delimiter='\t', quote_char='"',
                                column_type_hints={'id': str, 'sentiment': int, 'review': str})
train_data['1grams features'] = gl.text_analytics.count_ngrams(train_data['review'], 1)
train_data['2grams features'] = gl.text_analytics.count_ngrams(train_data['review'], 2)
cls = gl.classifier.create(train_data, target='sentiment', features=['1grams features', '2grams features'])
# creating the test dataset
test_data = gl.SFrame.read_csv(testdata_path, header=True, delimiter='\t', quote_char='"',
                               column_type_hints={'id': str, 'review': str})
test_data['1grams features'] = gl.text_analytics.count_ngrams(test_data['review'], 1)
test_data['2grams features'] = gl.text_analytics.count_ngrams(test_data['review'], 2)
# predicting the sentiment of each review in the test dataset
test_data['sentiment'] = cls.classify(test_data)['class'].astype(int)
# saving the predictions to a CSV for submission
test_data[['id', 'sentiment']].save("/Users/zhangyixin/Desktop/cjc2016-gh-pages/predictions.csv", format="csv")
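Before submitting, it is worth reading the saved file back to confirm it has the expected shape and columns. A quick sanity check, assuming the predictions file was written as above:
# read the submission file back and inspect it
preds = gl.SFrame.read_csv("/Users/zhangyixin/Desktop/cjc2016-gh-pages/predictions.csv",
                           column_type_hints={'id': str, 'sentiment': int})
print preds.num_rows()  # expect one row per test review
preds.head(5)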
In [18]:
%matplotlib inline
from __future__ import print_function
from wordcloud import WordCloud
from gensim import corpora, models, similarities, matutils
import matplotlib.pyplot as plt
import numpy as np
In [19]:
corpus = corpora.BleiCorpus('/Users/zhangyixin/Desktop/cjc2016-gh-pages/ap/ap.dat', '/Users/zhangyixin/Desktop/cjc2016-gh-pages/ap/vocab.txt')
In [20]:
' '.join(dir(corpus))
Out[20]:
In [21]:
corpus.id2word.items()[:3]
Out[21]:
In [22]:
NUM_TOPICS = 100
In [25]:
# alpha=None falls back to gensim's default symmetric Dirichlet prior
model = models.ldamodel.LdaModel(
    corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=None)
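Fitting 100 topics on the AP corpus is slow enough that saving the model can pay off; gensim models support save/load out of the box. A sketch with a hypothetical path:
# persist the fitted LDA model so it can be reloaded without retraining (hypothetical path)
model.save('/Users/zhangyixin/Desktop/cjc2016-gh-pages/ap/ap_lda.model')
model = models.ldamodel.LdaModel.load('/Users/zhangyixin/Desktop/cjc2016-gh-pages/ap/ap_lda.model')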
In [26]:
' '.join(dir(model))
Out[26]:
In [27]:
document_topics = [model[c] for c in corpus]
In [28]:
document_topics[2]
Out[28]:
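model[c] returns a sparse list of (topic_id, weight) pairs, covering only the topics whose weight exceeds gensim's display threshold. To pick out the single dominant topic of a document, a small sketch over that list:
# dominant topic of the third document: the pair with the largest weight
dominant_topic, dominant_weight = max(document_topics[2], key=lambda tw: tw[1])
print(dominant_topic, dominant_weight)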
In [29]:
model.show_topic(0, 10)
Out[29]:
In [30]:
model.show_topic(99, 10)
Out[30]:
In [31]:
words = model.show_topic(0, 5)
words
Out[31]:
In [32]:
model.show_topics(4)
Out[32]:
In [33]:
for f, w in words[:10]:
    print(f)
In [34]:
words = model.show_topic(0, 10)
for f, w in words:
    print(w)
In [35]:
for f, w in words:
    print(f + '\t' + str(w))
In [36]:
for ti in range(model.num_topics):
    words = model.show_topic(ti, 10)
    tf = sum(w for f, w in words)
    with open('/Users/zhangyixin/Desktop/cjc2016-gh-pages/data/topics_term_weight.txt', 'a') as output:
        for f, w in words:
            line = str(ti) + '\t' + f + '\t' + str(w/tf)
            output.write(line + '\n')
In [37]:
# model[corpus] yields a sparse topic distribution for each document;
# corpus2dense turns it into a (num_topics x num_documents) matrix
topics = matutils.corpus2dense(model[corpus], num_terms=model.num_topics)
weight = topics.sum(1)       # total weight of each topic across all documents
max_topic = weight.argmax()  # index of the most prominent topic
In [38]:
words = model.show_topic(max_topic, 64)
words = np.array(words).T
words[1]
Out[38]:
In [39]:
words = model.show_topic(max_topic, 64)
words = np.array(words).T
words_freq = [float(i) * 10000000 for i in words[1]]
words = zip(words[0], words_freq)
In [40]:
wordcloud = WordCloud().generate_from_frequencies(words)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
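The tuple-based input above works with the older wordcloud releases this notebook was written against; newer releases expect a {word: weight} dict for generate_from_frequencies. If the call above fails, a sketch of the dict form:
# newer wordcloud versions want a dict instead of a list of tuples
wordcloud = WordCloud().generate_from_frequencies(dict(words))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()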
In [41]:
num_topics_used = [len(model[doc]) for doc in corpus]
fig,ax = plt.subplots()
ax.hist(num_topics_used, np.arange(42))
ax.set_ylabel('Nr of documents')
ax.set_xlabel('Nr of topics')
fig.tight_layout()
#fig.savefig('Figure_04_01.png')
In [42]:
ALPHA = 1.0
model1 = models.ldamodel.LdaModel(
    corpus, num_topics=NUM_TOPICS, id2word=corpus.id2word, alpha=ALPHA)
num_topics_used1 = [len(model1[doc]) for doc in corpus]
In [43]:
fig,ax = plt.subplots()
ax.hist([num_topics_used, num_topics_used1], np.arange(42))
ax.set_ylabel('Nr of documents')
ax.set_xlabel('Nr of topics')
# The coordinates below were fit by trial and error to look good
plt.text(9, 223, r'default alpha')
plt.text(26, 156, 'alpha=1.0')
fig.tight_layout()
In [44]:
with open('/Users/zhangyixin/Desktop/cjc2016-gh-pages/ap/ap.txt', 'r') as f:
    dat = f.readlines()
In [45]:
dat[:6]
Out[45]:
In [46]:
dat[4].strip()[0]
Out[46]:
In [47]:
docs = []
for i in dat[:100]:
    # keep only text lines; lines starting with '<' are SGML markup tags
    if i.strip()[0] != '<':
        docs.append(i)
In [48]:
def clean_doc(doc):
    doc = doc.replace('.', '').replace(',', '')
    doc = doc.replace('``', '').replace('"', '')
    doc = doc.replace('_', '').replace("'", '')
    doc = doc.replace('!', '')
    return doc
docs = [clean_doc(doc) for doc in docs]
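The replace chain in clean_doc only strips the punctuation marks it lists explicitly. As an alternative (not the notebook's original cleaning), a regex catches any remaining punctuation in one pass:
import re

def clean_doc_re(doc):
    # drop everything except word characters and whitespace
    return re.sub(r'[^\w\s]', '', doc)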
In [49]:
texts = [doc.lower().split() for doc in docs]
In [50]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
In [51]:
' '.join(stop)
Out[51]:
In [52]:
stop.append('said')
In [53]:
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] > 1 and token not in stop]
         for text in texts]
In [54]:
docs[8]
Out[54]:
In [55]:
' '.join(texts[9])
Out[55]:
In [56]:
dictionary = corpora.Dictionary(texts)
lda_corpus = [dictionary.doc2bow(text) for text in texts]
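The manual frequency filter in In [53] can also be done natively on the Dictionary. Note that filter_extremes counts document frequency rather than raw token counts, so the result is close to, but not identical to, the filter above; the thresholds here are illustrative:
# drop tokens appearing in fewer than 2 documents or in more than half of them
dictionary.filter_extremes(no_below=2, no_above=0.5)
lda_corpus = [dictionary.doc2bow(text) for text in texts]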
In [57]:
lda_model = models.ldamodel.LdaModel(
    lda_corpus, num_topics=NUM_TOPICS, id2word=dictionary, alpha=None)
In [58]:
import pyLDAvis.gensim
ap_data = pyLDAvis.gensim.prepare(lda_model, lda_corpus, dictionary)
In [59]:
pyLDAvis.enable_notebook()
pyLDAvis.display(ap_data)
Out[59]:
In [60]:
pyLDAvis.save_html(ap_data, '/Users/zhangyixin/Desktop/cjc2016-gh-pages/vis/ap_ldavis.html')