In [ ]:
from charts.GanttChart import GanttChart
In [ ]:
# Path of the StackOverflow CSV export, relative to the notebook's working directory.
CSV_ROUTE = "../../output/stackoverflow/stackoverflow.csv"
# Build the project's GanttChart helper from that CSV and ask it to show its
# charts (what showCharts renders is defined in charts.GanttChart, not here).
chartCreator = GanttChart(CSV_ROUTE)
chartCreator.showCharts()
In [ ]:
from pprint import pprint as pp
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and
# later releases dropped the old names. Prefer the new name when the installed
# Matplotlib ships it, and fall back to the legacy name on older installs.
matplotlib.style.use(
    'seaborn-v0_8' if 'seaborn-v0_8' in matplotlib.style.available else 'seaborn'
)
In [ ]:
# Load the three Reddit CSV exports, in order: the main file, the "livevars"
# file and the "outliers" file (meaning of the suffixes presumably follows the
# file names - confirm against the producer of these files).
rpd, rpdOK, rpdOUT = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../output/reddit/reddit.csv",
        "../../output/reddit/reddit_livevars.csv",
        "../../output/reddit/reddit_outliers.csv",
    )
)
In [ ]:
# Show the first rows of rpd (DataFrame.head defaults to five rows).
rpd.head()
In [ ]:
import datetime

# Normalise all three frames in place:
#  - firstTimestamp / lastTimestamp: POSIX timestamps -> datetime.date objects.
#    fromtimestamp() converts using the machine's local timezone.
#  - variationId: cast to 32-bit integers.
for frame in (rpd, rpdOK, rpdOUT):
    for column in ("firstTimestamp", "lastTimestamp"):
        frame[column] = frame[column].apply(
            lambda t: datetime.datetime.fromtimestamp(t).date()
        )
    frame["variationId"] = frame["variationId"].astype('int32')
In [ ]:
# Show the first rows of rpd again; the preceding cell rewrites its timestamp
# columns and variationId, so this displays the converted values.
rpd.head()
In [ ]:
def calc_date_range(df):
    """Return the date range between one record's first and last timestamps.

    Despite its name, ``df`` is a single row (it is used with
    ``DataFrame.apply(..., axis=1)``); any object exposing
    ``firstTimestamp`` and ``lastTimestamp`` attributes works.

    Returns:
        A ``pandas.DatetimeIndex`` at the default (daily) frequency that
        includes both end points.
    """
    span_start = df.firstTimestamp
    span_end = df.lastTimestamp
    return pd.date_range(start=span_start, end=span_end)
# Store, for every row of rpd, the result of calc_date_range in a new
# "timespan" column (axis="columns" means the function receives one row at a time).
rpd["timespan"] = rpd.apply(calc_date_range, axis="columns")
In [ ]:
# Daily DatetimeIndex running from the earliest firstTimestamp to the latest
# lastTimestamp found in rpd (both end points included).
timespanIndex = pd.date_range(
    start=rpd.firstTimestamp.min(),
    end=rpd.lastTimestamp.max(),
)
In [ ]:
# Display the DatetimeIndex built in the previous cell.
timespanIndex
In [ ]:
# Build one Series per "Comments" row of rpd: it is indexed by that row's
# timespan, filled with the row's index label as a constant value, and named
# "Comments_<label>". Concatenating along axis=1 aligns the Series on the union
# of their dates, one column per row.
# Series.items() is used because iteritems() was removed in pandas 2.0.
opd = pd.concat(
    [
        pd.Series(row_label, index=span, name="Comments_{}".format(row_label))
        for row_label, span in rpd[rpd.entityName == "Comments"]['timespan'].items()
    ],
    axis=1,
)
opd.head()
In [ ]:
# Plot every column of opd as a thick line (line width 10) on a 3:2 figure
# that is 14 wide and 14*2/3 high (figsize is in inches).
opd.plot(lw=10,figsize=(14,14*2/3))
In [ ]:
# Materialise the (row label, timespan) pairs of the "Comments" rows for
# inspection. Series.items() replaces iteritems(), removed in pandas 2.0.
list(rpd[rpd.entityName == "Comments"]['timespan'].items())
In [ ]:
# Display rpd re-indexed by timespanIndex. set_index returns a new frame, so
# rpd itself is left unchanged because the result is not assigned.
# NOTE(review): when given an Index, set_index requires its length to equal
# the number of rows in rpd; timespanIndex has one entry per calendar day, so
# this likely raises a length-mismatch error unless the two happen to
# coincide - confirm the intent.
rpd.set_index(timespanIndex)
In [ ]: