In [1]:
import pandas as pd
import json,csv,re,os,sys,glob
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
import numpy as np
In [55]:
def extract_fields(tweet):
    """Return (created_at, content, topic) for one decoded JSON record.

    All three values are looked up under ``tweet['interaction']``:
    ``created_at`` as-is, ``content`` encoded to UTF-8, and the first
    entry of ``tag_tree.topic``.  A value that is absent or has an
    unexpected shape is replaced by the string 'NaN', so the caller can
    append one item to each of its lists for every record.
    """
    # Only lookup/shape failures mean "field not present"; any other
    # exception is a real error and is allowed to propagate.
    missing = (KeyError, IndexError, TypeError, AttributeError)
    try:
        created_at = tweet['interaction']['created_at']
    except missing:
        created_at = 'NaN'
    try:
        text = tweet['interaction']['content'].encode('utf-8')
    except missing:
        text = 'NaN'
    try:
        topic = tweet['interaction']['tag_tree']['topic'][0]
    except missing:
        topic = 'NaN'
    return created_at, text, topic


# Three parallel lists: index i of each list describes the same record.
times = []
content = []
topics = []
# NOTE(review): glob does not promise any ordering, so which 50 files the
# slice selects may differ between machines -- confirm whether that matters.
for f in glob.glob('../data/2014-06/Data*json')[0:50]:
    # Close the handle as soon as the file has been read.
    with open(f, 'r') as handle:
        raw = handle.read()
    # The files hold one JSON document per line.
    for line in raw.decode('utf-8').split('\n'):
        # A trailing newline (or any blank line) yields an empty string,
        # which json.loads cannot parse -- skip it.
        if not line.strip():
            continue
        tweet = json.loads(line)
        created_at, text, topic = extract_fields(tweet)
        times.append(created_at)
        content.append(text)
        topics.append(topic)
In [5]:
# Dump the three parallel lists to a tab-separated file, one row per
# (times, content, topics) triple.  The with-block closes the handle, so
# every buffered row is on disk before the file is read back.
with open('times_file.csv', 'w') as handle:
    outFile = csv.writer(handle, delimiter='\t')
    outFile.writerows(zip(times, content, topics))
In [6]:
# times_file.csv is written without a header row, so every line is data.
# header=None says so explicitly: a boolean is not a valid "no header"
# value (where it is accepted, False acts as row 0 and the first record
# would be consumed as a header; newer pandas rejects it outright).
# Column 0 becomes the (date-parsed) index; the two remaining columns
# receive the names below.
df = pd.read_csv(
    'times_file.csv',
    delimiter='\t',
    header=None,
    names=['content', 'topics'],
    index_col=0,
    parse_dates=True
)
In [91]:
# Topic labels indexed by the collected created_at values.
series = pd.Series(topics, index=times)
# Alternative kept for reference: a one-column frame on a parsed datetime index.
#df=pd.DataFrame(data={'topic':topics},index=pd.to_datetime(times))
In [7]:
# Preview the first five rows of the loaded frame.
df.head(n=5)
Out[7]:
In [41]:
# Draw one line per topic: the number of 'content' entries in each daily bin.
# The loop names `a` and `b` are kept because later cells reuse `b`
# (the last group) after the loop has finished.
for a, b in df.groupby('topics'):
    daily_counts = b.resample('D', how='count')['content']
    daily_counts.plot(label=a, legend=True, figsize=(20, 10), logy=False)
In [13]:
# Check what kind of object the daily 'content' count of group `b` is.
daily_content_counts = b.resample('D', how='count')['content']
type(daily_content_counts)
Out[13]:
In [17]:
# Add the daily count series to itself with the + operator.
daily_content_counts = b.resample('D', how='count')['content']
daily_content_counts + daily_content_counts
Out[17]:
In [39]:
# Daily 'content' counts for group `b`, added to themselves via the .add method.
bSeries = b.resample('D', how='count')['content']
# Alternative tried earlier, kept for reference:
#bSeries.combine(bSeries,func=lambda x,y:x+y,fill_value=0)
bSeries.add(other=bSeries)
Out[39]:
In [25]:
# Interactive help lookup (IPython `?` prefix) for bSeries.combine.
# This is not plain Python: the line only works inside IPython/Jupyter.
?bSeries.combine
In [44]:
# Group the frame by its 'topics' column; `a` now names the grouped object.
a = df.groupby(by='topics')
In [53]:
# Raise pandas' display limits: column count, total width and per-cell text width.
for option_name, option_value in (
    ('display.max_columns', 1000),
    ('display.width', 1000),
    ('display.max_colwidth', 800),
):
    pd.set_option(option_name, option_value)
In [52]:
# Interactive help lookup (IPython `?` prefix) for pd.set_option.
# This is not plain Python: the line only works inside IPython/Jupyter.
?pd.set_option
In [54]:
# Descriptive statistics for the grouped object `a`, shown as the cell output.
topic_summary = a.describe()
topic_summary
Out[54]:
In [1]:
from IPython.core.display import HTML
# Read ../css/custom.css and hand its text to IPython's HTML display object,
# which is the cell's output.  The with-block closes the file handle
# instead of leaving it open for the life of the kernel.
with open("../css/custom.css", "r") as css_file:
    styles = css_file.read()
HTML(styles)
Out[1]:
In [ ]: