In [51]:
from gensim import corpora, models, similarities

documents = ["Human machine interface for lab abc computer applications",
              "A survey of user opinion of computer system response time",
              "The EPS user interface management system",
              "System and human system engineering testing of EPS",
              "Relation of user perceived response time to error measurement",
              "The generation of random binary unordered trees",
              "The intersection graph of paths in trees",
              "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]

print(documents)


['Human machine interface for lab abc computer applications', 'A survey of user opinion of computer system response time', 'The EPS user interface management system', 'System and human system engineering testing of EPS', 'Relation of user perceived response time to error measurement', 'The generation of random binary unordered trees', 'The intersection graph of paths in trees', 'Graph minors IV Widths of trees and well quasi ordering', 'Graph minors A survey']

In [6]:
stoplist = set('for a of and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]

In [9]:
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)

texts = [[word for word in text if word not in tokens_once] for text in texts]

In [11]:
print(texts)


[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['the', 'eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['the', 'trees'], ['the', 'graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]

In [13]:
dictionary = corpora.Dictionary(texts)
print(dictionary)


Dictionary(13 unique tokens: [u'minors', u'graph', u'system', u'trees', u'eps']...)

In [15]:
print(dictionary.token2id)


{u'minors': 12, u'graph': 11, u'system': 5, u'trees': 10, u'eps': 8, u'computer': 0, u'survey': 4, u'user': 7, u'human': 1, u'time': 6, u'interface': 2, u'the': 9, u'response': 3}

In [20]:
new_doc = "Human Computer Interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)


[(0, 1), (1, 1)]

In [24]:
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus)


[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1), (9, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(10, 1), (11, 1), (12, 1)], [(4, 1), (11, 1), (12, 1)]]

In [1]:
import json
rawTweets = []
for line in open("ebolaTweets.20141013-155619.json"):
    try: 
        rawTweets.append(json.loads(line))
    except:
        pass


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-1-630a33569f9e> in <module>()
      1 import json
      2 rawTweets = []
----> 3 for line in open("ebolaTweets.20141013-155619.json"):
      4     try:
      5         rawTweets.append(json.loads(line))

IOError: [Errno 2] No such file or directory: 'ebolaTweets.20141013-155619.json'

In [37]:
print len(rawTweets)
rawTweets[0]['text']

tweets = [tweet['text'] for tweet in rawTweets]
print len(tweets)


20000
20000

In [2]:
tweets[10]
stopwords = ['for', 'if','was','a', 'and', 'the', 'of', 'to', 'in']
for line in open("../analysis/stoplist.txt"):
    try:
        stopwords.append(line)
    except:
        pass
stopwords.append('u')
len(stopwords)
print(tweets[0].lower().split())


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-25e286da5064> in <module>()
----> 1 tweets[10]
      2 stopwords = ['for', 'if','was','a', 'and', 'the', 'of', 'to', 'in']
      3 for line in open("../analysis/stoplist.txt"):
      4     try:
      5         stopwords.append(line)

NameError: name 'tweets' is not defined

In [65]:
a = tweets[0]
print a
#for i in tweets:
text = [[word for word in tweet.lower().split() if word not in stopwords]
            for tweet in tweets]
print text[0]


RT @InbetweenReact: If Ebola was a outfit..... http://t.co/sTcQ5VibUq
[u'rt', u'@inbetweenreact:', u'if', u'ebola', u'was', u'outfit.....', u'http://t.co/stcq5vibuq']

In [ ]:
print(text[1:10])
all_tokens = sum(text,[])
tokens_once_2 = set(word for word in set(all_tokens) if all_tokens.count(word) ==1)
text = [[word for word in text if word not in tokens_once_2]
        for tweet in text]

In [75]:
ebolaDictionary = corpora.Dictionary(text)
dictionary.save('../analysis/testEbolaDict.dict')
print(ebolaDictionary)

ebola_corpus = [dictionary.doc2bow(item) for item in text]
corpora.MmCorpus.serialize('../analysis/testEbolaCorpus.mm', ebola_corpus)
print(ebola_corpus[0])


Dictionary(29036 unique tokens: [u'#sportsnews', u'#epidemic', u'#wherearethemasks', u'woods', u'spiders']...)
[]

In [83]:
lsi = models.LsiModel(ebola_corpus, id2word=ebolaDictionary, num_topics=5)

In [86]:
lsi.print_topics(10)


Out[86]:
[u'-0.996*"was" + -0.084*"ebola" + -0.000*"rt" + -0.000*"if" + -0.000*"@inbetweenreact:" + -0.000*"outfit....." + 0.000*"against" + 0.000*"http://t.co/stcq5vibuq" + 0.000*"af." + 0.000*"http://t.co/v2r0v6ch29"',
 u'-1.000*"if" + -0.000*"ebola" + 0.000*"was" + -0.000*"rt" + -0.000*"against" + -0.000*"http://t.co/stcq5vibuq" + -0.000*"@inbetweenreact:" + 0.000*"outfit....." + 0.000*"af." + 0.000*"http://t.co/v2r0v6ch29"',
 u'0.996*"ebola" + -0.084*"was" + 0.012*"rt" + -0.000*"if" + 0.000*"http://t.co/stcq5vibuq" + 0.000*"@inbetweenreact:" + -0.000*"against" + 0.000*"outfit....." + 0.000*"af." + 0.000*"http://t.co/v2r0v6ch29"',
 u'-1.000*"rt" + 0.012*"ebola" + -0.001*"was" + 0.000*"outfit....." + -0.000*"http://t.co/stcq5vibuq" + -0.000*"@inbetweenreact:" + 0.000*"against" + 0.000*"if" + 0.000*"af." + 0.000*"http://t.co/v2r0v6ch29"',
 u'1.000*"outfit....." + 0.000*"rt" + -0.000*"against" + -0.000*"@inbetweenreact:" + 0.000*"http://t.co/stcq5vibuq" + -0.000*"ebola" + 0.000*"was" + 0.000*"if" + 0.000*"af." + 0.000*"http://t.co/v2r0v6ch29"']

In [89]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

In [96]:
my_document_features = TfidfVectorizer().fit_transform(tweets)

km = KMeans(n_clusters = 8).fit(my_document_features)

In [100]:
print(km.labels_)
labels = km.labels_
centroids = km.cluster_centers_


[5 5 4 ..., 4 4 4]

In [103]:
from matplotlib import pyplot
import numpy as np

for i in range(8):
    # select only data observations with cluster label == i
    ds = my_document_features[np.where(labels==i)]
    # plot the data observations
    pyplot.plot(ds[:,0],ds[:,1],'o')
    # plot the centroids
    lines = pyplot.plot(centroids[i,0],centroids[i,1],'kx')
    # make the centroid x's bigger
    pyplot.setp(lines,ms=15.0)
    pyplot.setp(lines,mew=2.0)
pyplot.show()


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-103-772b7f7711dc> in <module>()
      6     ds = my_document_features[np.where(labels==i)]
      7     # plot the data observations
----> 8     pyplot.plot(ds[:,0],ds[:,1],'o')
      9     # plot the centroids
     10     lines = pyplot.plot(centroids[i,0],centroids[i,1],'kx')

/usr/lib/pymodules/python2.7/matplotlib/pyplot.pyc in plot(*args, **kwargs)
   2985         ax.hold(hold)
   2986     try:
-> 2987         ret = ax.plot(*args, **kwargs)
   2988         draw_if_interactive()
   2989     finally:

/usr/lib/pymodules/python2.7/matplotlib/axes.pyc in plot(self, *args, **kwargs)
   4136 
   4137         for line in self._get_lines(*args, **kwargs):
-> 4138             self.add_line(line)
   4139             lines.append(line)
   4140 

/usr/lib/pymodules/python2.7/matplotlib/axes.pyc in add_line(self, line)
   1495             line.set_clip_path(self.patch)
   1496 
-> 1497         self._update_line_limits(line)
   1498         if not line.get_label():
   1499             line.set_label('_line%d' % len(self.lines))

/usr/lib/pymodules/python2.7/matplotlib/axes.pyc in _update_line_limits(self, line)
   1506         Figures out the data limit of the given line, updating self.dataLim.
   1507         """
-> 1508         path = line.get_path()
   1509         if path.vertices.size == 0:
   1510             return

/usr/lib/pymodules/python2.7/matplotlib/lines.pyc in get_path(self)
    741         """
    742         if self._invalidy or self._invalidx:
--> 743             self.recache()
    744         return self._path
    745 

/usr/lib/pymodules/python2.7/matplotlib/lines.pyc in recache(self, always)
    418                 x = ma.asarray(xconv, np.float_)
    419             else:
--> 420                 x = np.asarray(xconv, np.float_)
    421             x = x.ravel()
    422         else:

/usr/local/lib/python2.7/dist-packages/numpy/core/numeric.pyc in asarray(a, dtype, order)
    460 
    461     """
--> 462     return array(a, dtype, copy=False, order=order)
    463 
    464 def asanyarray(a, dtype=None, order=None):

ValueError: setting an array element with a sequence.