In [51]:
from gensim import corpora, models, similarities
documents = ["Human machine interface for lab abc computer applications",
"A survey of user opinion of computer system response time",
"The EPS user interface management system",
"System and human system engineering testing of EPS",
"Relation of user perceived response time to error measurement",
"The generation of random binary unordered trees",
"The intersection graph of paths in trees",
"Graph minors IV Widths of trees and well quasi ordering",
"Graph minors A survey"]
print(documents)
In [6]:
stoplist = set('for a of and to in'.split())
# lowercase, tokenize on whitespace, and drop stopwords from each document
texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]
In [9]:
# flatten all documents into one token list, find tokens that occur only once,
# and drop those hapax tokens from every document
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
texts = [[word for word in text if word not in tokens_once] for text in texts]
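Calling all_tokens.count inside the set comprehension rescans the whole token list for every word; an equivalent sketch of the same hapax filtering with collections.Counter, assuming the same texts list, would be:
In [ ]:
from collections import Counter
# one pass to count every token, then drop anything that appears only once
token_counts = Counter(token for text in texts for token in text)
texts = [[word for word in text if token_counts[word] > 1] for text in texts]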
In [11]:
print(texts)
In [13]:
dictionary = corpora.Dictionary(texts)
print(dictionary)
In [15]:
print(dictionary.token2id)
In [20]:
new_doc = "Human Computer Interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)
In [24]:
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus)
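The raw bag-of-words counts can also be reweighted before any modeling; a minimal, purely illustrative sketch using the models module imported above to apply TF-IDF to the same corpus:
In [ ]:
# fit a TF-IDF model on the bag-of-words corpus and reweight the first document
tfidf = models.TfidfModel(corpus)
print(tfidf[corpus[0]])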
In [1]:
import json
rawTweets = []
for line in open("ebolaTweets.20141013-155619.json"):
try:
rawTweets.append(json.loads(line))
except:
pass
In [37]:
print(len(rawTweets))
rawTweets[0]['text']
tweets = [tweet['text'] for tweet in rawTweets]
print(len(tweets))
In [2]:
tweets[10]
stopwords = ['for', 'if','was','a', 'and', 'the', 'of', 'to', 'in']
for line in open("../analysis/stoplist.txt"):
try:
stopwords.append(line)
except:
pass
stopwords.append('u')
len(stopwords)
print(tweets[0].lower().split())
In [65]:
a = tweets[0]
print(a)
# strip stopwords from every tweet, keeping one token list per tweet
text = [[word for word in tweet.lower().split() if word not in stopwords]
        for tweet in tweets]
print(text[0])
In [ ]:
print(text[1:10])
all_tokens = sum(text, [])
tokens_once_2 = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
# drop tokens that occur only once across all tweets
text = [[word for word in tweet if word not in tokens_once_2]
        for tweet in text]
In [75]:
ebolaDictionary = corpora.Dictionary(text)
ebolaDictionary.save('../analysis/testEbolaDict.dict')
print(ebolaDictionary)
ebola_corpus = [ebolaDictionary.doc2bow(item) for item in text]
corpora.MmCorpus.serialize('../analysis/testEbolaCorpus.mm', ebola_corpus)
print(ebola_corpus[0])
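To check that the serialized files round-trip, a small sketch that reloads them from the same paths used above:
In [ ]:
# reload the saved dictionary and the Matrix Market corpus from disk
loaded_dict = corpora.Dictionary.load('../analysis/testEbolaDict.dict')
loaded_corpus = corpora.MmCorpus('../analysis/testEbolaCorpus.mm')
print(loaded_dict)
print(loaded_corpus)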
In [83]:
# fit a 5-topic LSI model over the tweet corpus
lsi = models.LsiModel(ebola_corpus, id2word=ebolaDictionary, num_topics=5)
In [86]:
lsi.print_topics(10)
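The similarities module imported at the top is otherwise unused; a sketch of querying the LSI space for tweets close to a new piece of text (the query string here is only an example):
In [ ]:
# index the corpus in LSI space and rank tweets against an example query string
index = similarities.MatrixSimilarity(lsi[ebola_corpus])
query_bow = ebolaDictionary.doc2bow("ebola outbreak response".lower().split())
sims = index[lsi[query_bow]]
print(sorted(enumerate(sims), key=lambda item: -item[1])[:5])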
In [89]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
In [96]:
# vectorize the tweets with TF-IDF and cluster them into 8 groups
my_document_features = TfidfVectorizer().fit_transform(tweets)
km = KMeans(n_clusters=8).fit(my_document_features)
In [100]:
print(km.labels_)
labels = km.labels_
centroids = km.cluster_centers_
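Because TfidfVectorizer builds its vocabulary deterministically from the same tweets, fitting a second vectorizer we keep a handle on should let the centroid dimensions be mapped back to terms; a sketch under that assumption (get_feature_names was renamed get_feature_names_out in newer scikit-learn):
In [ ]:
# fit a vectorizer with the same defaults so its feature order matches my_document_features
vectorizer = TfidfVectorizer().fit(tweets)
terms = vectorizer.get_feature_names()  # get_feature_names_out() on newer scikit-learn
order = centroids.argsort()[:, ::-1]
for i in range(8):
    # ten highest-weighted terms in each cluster centroid
    print(i, [terms[ind] for ind in order[i, :10]])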
In [103]:
from matplotlib import pyplot
import numpy as np
# densify the sparse TF-IDF matrix so rows and columns can be sliced and plotted directly
features = my_document_features.toarray()
for i in range(8):
    # select only data observations with cluster label == i
    ds = features[np.where(labels == i)]
    # plot the data observations
    pyplot.plot(ds[:, 0], ds[:, 1], 'o')
    # plot the centroids
    lines = pyplot.plot(centroids[i, 0], centroids[i, 1], 'kx')
    # make the centroid x's bigger
    pyplot.setp(lines, ms=15.0)
    pyplot.setp(lines, mew=2.0)
pyplot.show()
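TruncatedSVD is imported above but never used; a sketch of projecting the TF-IDF matrix down to two SVD components before plotting, which is usually easier to read than the first two raw TF-IDF columns:
In [ ]:
# project tweets and centroids into a 2-D LSA space and redraw the cluster scatter
svd = TruncatedSVD(n_components=2)
points = svd.fit_transform(my_document_features)
centers_2d = svd.transform(centroids)
for i in range(8):
    cluster_points = points[labels == i]
    pyplot.plot(cluster_points[:, 0], cluster_points[:, 1], 'o')
    pyplot.plot(centers_2d[i, 0], centers_2d[i, 1], 'kx', ms=15.0, mew=2.0)
pyplot.show()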