In [1]:
import json, datetime
def date_hook(json_dict):
    """Replace timestamp strings in a decoded JSON object with datetimes.

    Meant to be passed as ``object_hook`` to ``json.loads``. Every value that
    parses with the format ``'%a %b %d %H:%M:%S +0000 %Y'`` (for example
    ``'Mon Oct 13 01:36:54 +0000 2014'``) is replaced in place by the
    corresponding naive ``datetime.datetime``. All other values are left
    untouched.

    Args:
        json_dict: dict built by the JSON decoder for one JSON object.

    Returns:
        The same dict object, with matching values converted.
    """
    # Iterate over a snapshot so the dict is never mutated while being walked.
    for key, value in list(json_dict.items()):
        try:
            json_dict[key] = datetime.datetime.strptime(
                value, '%a %b %d %H:%M:%S +0000 %Y')
        except (TypeError, ValueError):
            # TypeError: value is not a string. ValueError: string is not a
            # timestamp in the expected format. Anything else should surface.
            continue
    return json_dict
import sys, glob, errno
# Parse every line of the matching file(s) as one JSON record, converting
# timestamp strings to datetimes via date_hook.
datetweets = []
path = '../data/someTweets.20141013-013654.json'
files = glob.glob(path)
print(len(files))
for name in files:
    try:
        # `with` guarantees the handle is closed even if parsing fails.
        with open(name) as tweet_file:  # No need to specify 'r': this is the default.
            for line in tweet_file:
                if not line.strip():
                    # Blank lines carry no record and would make json.loads raise.
                    continue
                datetweets.append(json.loads(line, object_hook=date_hook))
    except IOError as exc:
        # Do not fail if a directory is found, just ignore it.
        if exc.errno != errno.EISDIR:
            raise  # Propagate any other kind of IOError.
In [2]:
# Number of JSON records loaded into `datetweets`; shown as the cell output.
len(datetweets)
Out[2]:
In [2]:
from os import path
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import os
# NOTE(review): hard-coded, user-specific absolute path; the word-cloud cells
# depend on this font file being present on the machine running the notebook.
font_path = '/Users/carlyhendrickson/Library/Fonts/SourceCodePro-Light.otf'
# Join the tweet texts into one string. Calling str() on the list would feed
# the list's Python repr (u'...' prefixes, backslash escapes, brackets, commas)
# into the word cloud instead of the actual words.
# Records without a 'text' key are skipped rather than raising KeyError.
text = u' '.join(tweet['text'] for tweet in datetweets if 'text' in tweet)
In [18]:
# Build a word cloud from `text`; `font_path` is passed as WordCloud's first
# positional argument.
wordcloud = WordCloud(font_path).generate(text)
# Bare expression: the notebook shows the object's repr as the cell output.
wordcloud
# Open a plot of the generated image.
In [20]:
# Render the generated word cloud as an image on the current axes.
plt.imshow(wordcloud)
# Hide the axis ticks and frame; they carry no meaning for a word cloud.
plt.axis("off")
plt.show()
In [1]:
import csv,codecs,cStringIO
class UTF8Recoder:
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
    """CSV reader that yields each row as a list of unicode strings.

    The input `f` is decoded from `encoding`, re-encoded to UTF-8 by
    UTF8Recoder, parsed by ``csv.reader``, and every cell is then decoded
    from UTF-8 back to unicode. Extra keyword arguments are forwarded to
    ``csv.reader``.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
        recoded = UTF8Recoder(f, encoding)
        self.reader = csv.reader(recoded, dialect=dialect, **kwds)

    def __iter__(self):
        # The object is its own iterator.
        return self

    def next(self):
        '''next() -> list of unicode

        Read the next CSV row and return its cells as unicode strings.
        '''
        cells = []
        for raw_cell in self.reader.next():
            cells.append(unicode(raw_cell, "utf-8"))
        return cells
class UnicodeWriter:
def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
'''writerow(unicode) -> None
This function takes a Unicode string and encodes it to the output.
'''
self.writer.writerow([s.encode("utf-8") for s in row])
data = self.queue.getvalue()
data = data.decode("utf-8")
data = self.encoder.encode(data)
self.stream.write(data)
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
In [2]:
import csv

# Load every row of the sentiment-score CSV as a list of unicode cells.
sentiment = []
# 'rb': the file is opened in binary mode and UnicodeReader does the decoding.
with open('tweet_sentiment_scores.csv', 'rb') as fin:
    reader = UnicodeReader(fin, quoting=csv.QUOTE_ALL)
    sentiment.extend(reader)
In [9]:
import gensim
#print sentiment[20][2]
# NOTE(review): `s` is built from row 20 while `tokens` is built from row 0,
# and `s` is not used again in this cell — confirm which row was intended.
s = sentiment[20][2]
s = s.split(",")
# Tokenize column 2 of the first row with gensim's tokenizer.
tokens = list(gensim.utils.tokenize(sentiment[0][2]))
print tokens
In [149]:
# For every row of `sentiment`: take column 2, decode it with the
# 'unicode_escape' codec, drop every non-ASCII character, and tokenize.
# `text` becomes a list of token lists, one per row.
# NOTE(review): presumably column 2 holds the tweet text — confirm against the CSV.
text = [list(gensim.utils.tokenize(block[2].decode('unicode_escape').encode('ascii','ignore'))) for block in sentiment ]
# Bare expression: shown as the cell output.
len(text)
Out[149]:
In [152]:
# Tokens to drop from every token list.
# NOTE(review): u'u,' contains a comma and so is unlikely to equal any single
# token — possibly meant to be u'u'; confirm before changing.
stopwords = [u'u,', u'facebook', u'm', u'youtube']
# `text` is a list of token lists, so the stopword test has to run on each
# token inside each list. Comparing whole lists against `stopwords` never
# matches and would leave `text` unfiltered.
text = [[word for word in tokens if word not in stopwords] for tokens in text]
# Spot-check the first filtered token list.
t = text[0]
print(t)
type(t)
Out[152]:
In [155]:
#text = [block[2].decode('unicode_escape').encode('ascii','ignore') for block in sentiment ]
#len(text)
def filter_words(text):
    """Strip a fixed list of unwanted fragments from a string or nested list.

    The fragments are removed one after another, in the order listed below,
    by plain substring replacement.

    Args:
        text: a string, or an arbitrarily nested list of strings.

    Returns:
        The cleaned string, or a new list of the same shape whose strings
        have been cleaned.
    """
    fragments = (
        "u'", "|", "',",
        # NOTE(review): "amp" is removed as a substring, so any word that
        # contains it is altered as well.
        "amp",
        ";", "u\"", "\"", "\'",
        # NOTE(review): every fragment below contains a single quote, and all
        # single quotes are already gone at this point, so these never match.
        ",'", "tco,'", "t ,'", "tco,'", "youtube,'", "apple,'",
        "facebook'", "httpco'", "m'",
    )
    if hasattr(text, "replace"):
        # A single string: apply each removal in order.
        for fragment in fragments:
            text = text.replace(fragment, "")
        return text
    # A list: clean every element, preserving the nesting.
    return [filter_words(item) for item in text]
# Clean every entry of `text` with filter_words.
text = [filter_words(t) for t in text]
# NOTE(review): 200 is a hard-coded spot-check index; this raises IndexError
# when `text` has 200 or fewer entries.
print(text[200])
In [130]:
# Remove every space from each entry of `text`; entries must be strings here.
# NOTE(review): this also removes the spaces between words, fusing a
# multi-word entry into one run of characters — confirm this is intended.
text = [t.replace(" ","") for t in text]
In [132]:
In [144]:
# Number of entries currently in `text`; shown as the cell output.
len(text)
Out[144]:
In [68]:
In [72]:
In [73]:
Out[73]:
In [ ]:
In [56]:
from wordcloud import WordCloud
from os import path
# Font file handed to WordCloud in the cells below.
# NOTE(review): hard-coded, user-specific absolute path.
font_path = '/Users/carlyhendrickson/Library/Fonts/SourceCodePro-Light.otf'
import matplotlib.pyplot as plt
In [148]:
# Build a 700x700 word cloud from the string form of entry 200 of `text`.
# NOTE(review): 200 is a hard-coded index; it requires at least 201 entries.
wordcloud = WordCloud(font_path, width=700, height=700).generate(str(text[200]))
# Render the word cloud as an image, without axis ticks or frame.
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
In [50]:
In [87]:
# Number of entries currently in `text`; shown as the cell output.
len(text)
Out[87]:
In [ ]:
In [82]:
# Render one 700x700 word cloud per entry of `text` and save each as
# foo<N>.png, numbering from 1001.
for i, topic in enumerate(text, 1001):
    wordcloud = WordCloud(font_path, width=700, height=700).generate(topic)
    # A fresh figure per image keeps earlier images from piling up on the
    # same axes.
    fig = plt.figure()
    plt.imshow(wordcloud)
    plt.axis("off")
    fig.savefig('foo' + str(i) + '.png', bbox_inches='tight')
    # Release the figure once it is on disk so memory does not grow with the
    # number of entries.
    plt.close(fig)