In [2]:
import json, datetime
def date_hook(json_dict):
    """Convert any value matching Twitter's created_at format into a datetime."""
    for (key, value) in json_dict.items():
        try:
            json_dict[key] = datetime.datetime.strptime(value, '%a %b %d %H:%M:%S +0000 %Y')
        except (TypeError, ValueError):  # leave non-date values untouched
            pass
    return json_dict
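A quick sanity check of date_hook on a hand-made record (the sample string below is made up, but follows Twitter's created_at format):
In [ ]:
sample = '{"created_at": "Sun Nov 02 15:30:00 +0000 2014", "id": 1}'
print json.loads(sample, object_hook=date_hook)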
In [3]:
import sys, glob, errno
datetweets = []
path = '../data/msftGoogAaplTweets.20141102-0[89]*.json'
files = glob.glob(path)
print len(files)
for name in files:
    try:
        for line in open(name):  # No need to specify 'r': this is the default.
            datetweets.append(json.loads(line, object_hook=date_hook))
    except IOError as exc:
        if exc.errno != errno.EISDIR:  # Do not fail if a directory is found, just ignore it.
            raise  # Propagate any other IO error.
#for line in open("../data/msftGoogAaplTweets.20141101*.json"):
#    try:
#        datetweets.append(json.loads(line, object_hook=date_hook))
#    except:
#        pass
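If some files contain malformed lines, the per-line guard from the commented-out variant can be folded into the loop above; a minimal sketch (json.loads raises ValueError on bad input in Python 2):
In [ ]:
for name in files:
    for line in open(name):
        try:
            datetweets.append(json.loads(line, object_hook=date_hook))
        except ValueError:  # skip lines that are not valid JSON
            pass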
In [4]:
print len(datetweets)
In [5]:
print datetweets[0]['created_at']
print datetweets[0]
print datetweets[-1]['created_at']
In [6]:
from datetime import datetime
# created_at carries a fixed +0000 offset, so the parsed datetimes are naive UTC;
# the window bounds below are therefore also expressed in UTC.
start = datetime(2014, 11, 2, 15, 30)
end = datetime(2014, 11, 2, 15, 59)
In [7]:
#for tweet in datetweets:
#    if start <= tweet['created_at'] <= end:
#        print tweet['created_at']
tweets = [tweet for tweet in datetweets if start <= tweet['created_at'] <= end]
print len(tweets)
In [13]:
tmp = tweets[0]
print tmp['text']
#tweets = [tweet['text'] for tweet in datetweets]
#text = [[word for word in tweet.lower().split() if word not in stopwords]
#        for tweet in tweets]
tmp['test'] = [word for word in tmp['text'].lower().split()]
print tmp['test']
In [8]:
stopwords = ['for', 'if','was','a', 'and', 'the', 'of', 'to', 'in']
text = [[word for word in tweet['text'].lower().split() if word not in stopwords]
        for tweet in tweets]
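The hand-rolled stopword list is minimal; if the installed gensim version ships gensim.parsing.preprocessing.STOPWORDS, that larger set could be swapped in. A sketch:
In [ ]:
from gensim.parsing.preprocessing import STOPWORDS
text = [[word for word in tweet['text'].lower().split() if word not in STOPWORDS]
        for tweet in tweets]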
In [15]:
print(text[1:10])
# This takes time: sum() re-concatenates the lists pairwise.
all_tokens = sum(text, [])
In [16]:
tokens_once_2 = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
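all_tokens.count(word) rescans the whole token list for every distinct word, which is quadratic. collections.Counter does the same job in a single pass; a sketch:
In [ ]:
from collections import Counter
counts = Counter(all_tokens)
tokens_once_2 = set(word for word, n in counts.iteritems() if n == 1)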
In [17]:
print len(tokens_once_2)
text = [[word for word in tweet if word not in tokens_once_2]
        for tweet in text]
#text = [[word for word in tweet.lower().split() if word not in stopwords]
#        for tweet in tweets]
print text[0]
In [29]:
from gensim import corpora
from gensim import models
tempDict = corpora.Dictionary(text)
print(tempDict)
tmp_corpus = [tempDict.doc2bow(item) for item in text]
print(tmp_corpus[0])
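The dictionary and bag-of-words corpus can be persisted for reuse across sessions (the paths below are placeholders):
In [ ]:
tempDict.save('../data/tweets.dict')
corpora.MmCorpus.serialize('../data/tweets.mm', tmp_corpus)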
In [10]:
lsi = models.LsiModel(tmp_corpus, id2word=tempDict, num_topics=10)
In [11]:
lsi.print_topics(10)
Out[11]:
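The fitted LSI model projects any bag-of-words vector into the latent space; for example, the first tweet in the corpus:
In [ ]:
print lsi[tmp_corpus[0]]  # list of (topic_id, weight) pairs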
In [13]:
lda = models.LdaMulticore(tmp_corpus, id2word=tempDict, num_topics=10)
In [21]:
lda.print_topics(4)
Out[21]:
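Likewise, indexing the LDA model with a bag-of-words vector gives that tweet's topic mixture:
In [ ]:
print lda[tmp_corpus[0]]  # (topic_id, probability) pairs above the default threshold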
In [33]:
print tempDict
In [39]:
tempDict.filter_extremes(no_below=5)  # drop tokens appearing in fewer than 5 tweets
print tempDict
tempDict.compactify()  # remap ids to fill the gaps left by filtering
print(tmp_corpus[0])
tmp_corpus = [tempDict.doc2bow(item) for item in text]
print(tmp_corpus[0])
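filter_extremes also accepts no_above (a maximum document-frequency fraction) and keep_n; for instance, to additionally drop tokens that occur in more than half of the tweets:
In [ ]:
tempDict.filter_extremes(no_below=5, no_above=0.5)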
In [41]:
lda = models.LdaMulticore(tmp_corpus, id2word=tempDict, num_topics=5)
In [42]:
print(lda.print_topics(4))
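A rough sanity check of the refit model is its log perplexity on the training corpus (a variational lower bound, so only useful for relative comparisons):
In [ ]:
print lda.log_perplexity(tmp_corpus)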