In [2]:
import json, datetime
def date_hook(json_dict):
    for (key, value) in json_dict.items():
        try:
            json_dict[key] = datetime.datetime.strptime(value,'%a %b %d %H:%M:%S +0000 %Y')
        except:
            pass
    return json_dict

In [2]:



Out[2]:
u'Sat Nov 01 06:21:05 +0000 2014'

In [3]:
import sys, glob, errno
datetweets = []
path = '../data/msftGoogAaplTweets.20141102-0[89]*.json'   
files = glob.glob(path)   
print len(files)
for name in files:
    try:
        for line in open(name): # No need to specify 'r': this is the default.
            datetweets.append(json.loads(line, object_hook=date_hook))
    except IOError as exc:
        if exc.errno != errno.EISDIR: # Do not fail if a directory is found, just ignore it.
            raise # Propag


#for line in open("../data/msftGoogAaplTweets.20141101*.json"):
#    try: 
#        datetweets.append(json.loads(line, object_hook=date_hook))
#    except:
#        pass


9

In [4]:
print len(datetweets)


180000

In [5]:
print datetweets[0]['created_at']
print datetweets[0]
print datetweets[len(datetweets)-1]['created_at']


2014-11-02 15:50:46
{u'lang': u'en', u'source': u'<a href="http://www.google.com" rel="nofollow">x5Tweet</a>', u'text': u'RT @FreebieCasinos: Rocky Slots - \xa315 Free No Deposit Bonus: http://t.co/vRAAAPLC3c via @YouTube', u'created_at': datetime.datetime(2014, 11, 2, 15, 50, 46), u'timestamp_ms': u'1414943446036', u'entities': {u'user_mentions': [{u'indices': [3, 18], u'screen_name': u'FreebieCasinos', u'id': 2427572190, u'name': u'Freebie Casinos', u'id_str': u'2427572190'}, {u'indices': [88, 96], u'screen_name': u'YouTube', u'id': 10228272, u'name': u'YouTube', u'id_str': u'10228272'}], u'symbols': [], u'trends': [], u'hashtags': [], u'urls': [{u'url': u'http://t.co/vRAAAPLC3c', u'indices': [61, 83], u'expanded_url': u'http://youtu.be/frzZ4TWI7sM', u'display_url': u'youtu.be/frzZ4TWI7sM'}]}, u'id_str': u'528937265825714177', u'filter_level': u'medium', u'id': 528937265825714177, u'user': {u'lang': u'en', u'verified': False, u'name': u'\u0627\u0628\u0648 \u0627\u0644\u063a\u064a\u0631\u0647', u'friends_count': 11252, u'created_at': datetime.datetime(2014, 2, 26, 18, 24, 4), u'time_zone': None, u'followers_count': 10237, u'screen_name': u'Abo_algherra', u'id_str': u'2367400210', u'is_translator': False, u'location': u''}}
2014-11-02 16:42:34

In [6]:
from datetime import datetime
start = datetime(2014, 11, 2, 15,30)
end = datetime(2014, 11, 2, 15, 59)

In [7]:
#for tweet in datetweets:
 #   if tweet['created_at'] >= start and tweet['created_at'] <= end:
  #      print tweet['created_at']
tweets = [tweet for tweet in datetweets if tweet['created_at'] >= start and tweet['created_at'] >= end ]

print len(tweets)


108234

In [13]:
tmp = tweets[0]
print tmp['text']
#tweets = [tweet['text'] for tweet in datetweets]
#text = [[word for word in tweet.lower().split() if word not in stopwords]
 #           for tweet in tweets]
[tmp['test'] word for word in tmp['text'].lower().split() ]
print tmp['test']


  File "<ipython-input-13-98db5a0b53c5>", line 6
    [tmp['test'] word for word in tmp['text'].lower().split() ]
                    ^
SyntaxError: invalid syntax

In [8]:
stopwords = ['for', 'if','was','a', 'and', 'the', 'of', 'to', 'in']
text = [[ word for word in tweet['text'].lower().split() if word not in stopwords]
                 for tweet in tweets]

In [15]:
print(text[1:10])
# this takes tome
all_tokens = sum(text,[])


[[u'rt', u'@foodpornsx:', u'deep', u'fried', u'mac', u'n', u'cheese', u'bites', u',.', u'http://t.co/gl0qgxb0bs'], [u'rt', u'@epickidfails:', u'when', u'you', u'lie', u'tell', u'bad', u'ass', u'kid', u'they', u'got', u'ebola\U0001f602\U0001f602\U0001f602\U0001f602', u'https://t.co/rnbltfsmrb'], [u'@aaliyahmendess', u'it', u'nust', u'annoy', u'you', u'having', u"'fans'", u'that', u'only', u'tweet', u'follow', u'ou', u'bc', u'shawn..', u'i', u'presonally', u'really', u'like', u'your', u'youtube', u'videos'], [u'https://t.co/nxw7bh47jl', u'inlove'], [u'watermat', u'2013', u'(the', u'flood).', u'https://t.co/tjw7hqrop6', u'http://t.co/tlfofrhk4f'], [u'rt', u'@appieofficiel:', u'apple', u'iglass', u'\U0001f4f1', u'http://t.co/d2vcocotqt'], [u'animals', u'-', u'house', u'rising', u'sun', u'(excellent', u'video', u'audio', u'qua...:', u'http://t.co/zaxwhkuwkm', u'via', u'@youtube'], [u'@cloudwook', u'news', u'&gt;', u'google', u'cloud', u"services'", u'new', u'partner:', u'pwc', u'-', u'newsfactor', u'network', u'http://t.co/o8q5lk287h'], [u'rt', u'@mashable:', u'number', u'one', u'rule', u'owning', u'google', u'glass:', u"don't", u'be', u'glasshole', u'http://t.co/y6n7aijkcm', u'http://t.co/xwos5ow4vv']]

In [16]:
tokens_once_2 = set(word for word in set(all_tokens) if all_tokens.count(word) ==1)

In [17]:
print len(tokens_once_2)
text = [[word for word in tweet if word not in tokens_once_2]
        for tweet in text]
        

#text = [[word for word in tweet.lower().split() if word not in stopwords]
 #           for tweet in tweets]

print text[0]


132779
[u'rt', u'@luckywolves:', u'{', u'pls', u'rt', u'}', u'#backto20s420k', u'\U0001f495', u'video', u'link:', u'https://t.co/dimun1h0sl', u'pls', u'rt', u':', u'https://t.co/wcq1of2m7x']

In [29]:
from gensim import corpora
from gensim import models
tempDict = corpora.Dictionary(text)

print(tempDict)

tmp_corpus = [tempDict.doc2bow(item) for item in text]

print(tmp_corpus[0])


Dictionary(187278 unique tokens: [u'more:http://t.co/nghu4yt9mo', u'ready!!', u'http://t.co/7mk7yucfl8', u'http://t.co/e8lippjo3o', u'sowell']...)
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 3), (8, 1), (9, 1), (10, 1), (11, 1)]

In [10]:
lsi = models.LsiModel(tmp_corpus, id2word=tempDict, num_topics=10)

In [11]:
lsi.print_topics(10)


Out[11]:
[u'0.455*"@youtube" + 0.450*"i" + 0.398*"video" + 0.382*"-" + 0.281*"liked" + 0.267*"rt" + 0.204*"from" + 0.104*"via" + 0.082*"you" + 0.078*"on"',
 u'0.784*"rt" + -0.231*"@youtube" + -0.174*"liked" + 0.172*"you" + -0.166*"video" + 0.157*"on" + -0.152*"i" + 0.140*"youtube" + 0.140*"is" + 0.117*"this"',
 u'-0.818*"-" + 0.318*"i" + 0.215*"video" + -0.161*"via" + 0.159*"liked" + 0.127*"from" + -0.114*"apple" + -0.092*"by" + -0.088*"full" + -0.085*"read"',
 u'0.432*"@youtube" + 0.418*"via" + -0.363*"you" + 0.348*"rt" + -0.281*"i" + -0.218*"my" + -0.161*"-" + -0.135*"on" + -0.126*"it" + -0.123*"is"',
 u'-0.497*"via" + -0.479*"you" + -0.340*"@youtube" + 0.247*"rt" + -0.221*"my" + 0.176*"liked" + 0.169*"from" + -0.134*"on" + 0.131*"i" + 0.122*"video"',
 u'-0.545*"youtube" + -0.460*"on" + -0.267*"get" + 0.259*"you" + -0.169*"off" + -0.165*"sexual" + -0.163*"abusers" + -0.145*"#twitition" + 0.139*"rt" + 0.120*"apple"',
 u'-0.539*"is" + 0.473*"you" + -0.236*"on" + 0.198*"get" + 0.196*"youtube" + -0.164*"it" + -0.155*"google" + -0.131*"that" + -0.129*"with" + 0.108*"off"',
 u'0.748*"\u2026" + 0.199*"video" + 0.190*"my" + -0.183*"i" + -0.172*"you" + 0.153*"me" + 0.147*"music" + -0.140*"apple" + 0.114*"give" + 0.113*"be"',
 u'0.416*"scientists" + 0.219*"are" + 0.217*"about" + 0.216*"do" + 0.214*"don\'t" + 0.211*"over" + 0.209*"say" + 0.209*"climate" + 0.209*"it!" + 0.208*"panic"',
 u'0.586*"on" + -0.315*"is" + -0.217*"get" + 0.165*"from" + -0.140*"ebola" + -0.129*"i" + -0.126*"google" + -0.122*"it" + -0.117*"really" + 0.114*"liked"']

In [13]:
lda = models.LdaMulticore(tmp_corpus, id2word=tempDict, num_topics=10)

In [21]:
lda.print_topics(4)


Out[21]:
[u'0.013*#youtube + 0.013*rt + 0.012*with + 0.011*#news + 0.011*me + 0.010*follow + 0.009*#pussy + 0.009*#pics + 0.007*apple + 0.007*&lt;3',
 u'0.089*video + 0.086*@youtube + 0.077*i + 0.058*liked + 0.043*from + 0.037*- + 0.015*playlist + 0.014*added + 0.010*via + 0.009*|',
 u'0.033*out + 0.029*rt + 0.027*check + 0.018*my + 0.017*new + 0.017*you + 0.011*mac + 0.010*video + 0.010*go + 0.009*it',
 u'0.032*apple + 0.025*rt + 0.015*- + 0.011*\u2026 + 0.010*is + 0.008*end + 0.007*#apple + 0.006*&amp; + 0.006*up + 0.006*#itunes']

In [33]:
print tempDict


Dictionary(19704 unique tokens: [u'raining', u'@genlakhyenrab:', u'@leonbolier', u'@moviezadda:', u'#istandsunday']...)

In [39]:
tempDict.filter_extremes(no_below = 5, )
print tempDict
tempDict.filter_tokens()
tempDict.compactify()
print(tmp_corpus[0])
tmp_corpus = [tempDict.doc2bow(item) for item in text]
print(tmp_corpus[0])


Dictionary(19704 unique tokens: [u'raining', u'@genlakhyenrab:', u'@leonbolier', u'@moviezadda:', u'#istandsunday']...)
[(357, 3), (1615, 1), (3377, 1), (3580, 1), (4400, 1), (8241, 1), (8442, 1), (15792, 2)]
[(345, 3), (1592, 1), (3383, 1), (3583, 1), (4412, 1), (8257, 1), (8460, 1), (15800, 2)]

In [41]:
lda = models.LdaMulticore(tmp_corpus, id2word=tempDict, num_topics = 5)

In [42]:
print(lda.print_topics(4))


[u'0.049*rt + 0.029*you + 0.016*this + 0.014*my + 0.013*i + 0.012*is + 0.011*have + 0.011*really + 0.011*out + 0.010*youtube', u'0.039*rt + 0.026*ebola + 0.024*- + 0.014*... + 0.013*by + 0.009*&amp; + 0.008*but + 0.008*on + 0.007*me + 0.007*not', u'0.089*@youtube + 0.070*video + 0.063*i + 0.051*- + 0.042*liked + 0.035*via + 0.034*from + 0.011*playlist + 0.010*| + 0.010*added', u'0.035*- + 0.023*apple + 0.022*rt + 0.016*google + 0.012*full + 0.011*on + 0.011*is + 0.010*price + 0.010*ebay: + 0.010*iphone']