In [ ]:
from charts.GanttChart import GanttChart

In [ ]:
# Relative path of the Stack Overflow CSV handed to the chart helper.
CSV_ROUTE = "../../output/stackoverflow/stackoverflow.csv"
# GanttChart comes from the project-local charts.GanttChart module; it is
# constructed with the CSV path and then asked to show its charts.
# NOTE(review): presumably it reads the CSV itself and renders Gantt-style
# figures -- confirm against charts/GanttChart.
chartCreator = GanttChart(CSV_ROUTE)
chartCreator.showCharts()

In [ ]:
from pprint import pprint as pp
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and
# later releases dropped the plain "seaborn" name, so prefer the new name
# when this installation offers it and fall back to the legacy one otherwise.
matplotlib.style.use(
    "seaborn-v0_8" if "seaborn-v0_8" in matplotlib.style.available else "seaborn"
)

In [ ]:
# Load the three Reddit CSV exports (full set, "livevars" subset, "outliers"
# subset) into separate DataFrames, in that order.
rpd, rpdOK, rpdOUT = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../output/reddit/reddit.csv",
        "../../output/reddit/reddit_livevars.csv",
        "../../output/reddit/reddit_outliers.csv",
    )
)

In [ ]:
# Preview the first rows of rpd as loaded from the CSV.
rpd.head()

In [ ]:
import datetime


def _to_local_date(value):
    """Convert a POSIX timestamp to a ``datetime.date`` in local time.

    Values that are already ``datetime.date`` instances are returned
    unchanged, so running this cell a second time on already-converted
    frames does not raise.
    """
    if isinstance(value, datetime.date):
        return value
    return datetime.datetime.fromtimestamp(value).date()


# Normalise the three Reddit frames in place: both timestamp columns become
# calendar dates and variationId becomes a 32-bit integer.
for frame in (rpd, rpdOK, rpdOUT):
    frame["firstTimestamp"] = frame["firstTimestamp"].apply(_to_local_date)
    frame["lastTimestamp"] = frame["lastTimestamp"].apply(_to_local_date)
    frame["variationId"] = frame["variationId"].astype('int32')

In [ ]:
# Preview rpd again, now that the previous cell has converted the timestamp
# columns to dates and variationId to int32.
rpd.head()

In [ ]:
def calc_date_range(df):
    """Return the daily date range covered by one record.

    Parameters
    ----------
    df : object exposing ``firstTimestamp`` and ``lastTimestamp`` attributes
        For example a single DataFrame row passed in by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    pandas.DatetimeIndex
        One entry per day from ``firstTimestamp`` to ``lastTimestamp``,
        both ends included.
    """
    first_day = df.firstTimestamp
    last_day = df.lastTimestamp
    return pd.date_range(start=first_day, end=last_day)

# Store, for every row of rpd, the DatetimeIndex returned by calc_date_range
# for that row (the function is applied row-wise).
rpd["timespan"] = rpd.apply(calc_date_range, axis="columns")

In [ ]:
# Daily index covering the whole dataset: from the earliest firstTimestamp to
# the latest lastTimestamp found in rpd.
timespanIndex = pd.date_range(rpd.firstTimestamp.min(), rpd.lastTimestamp.max())

In [ ]:
# Display the dataset-wide daily index built in the previous cell.
timespanIndex

In [ ]:
# Build one Series per "Comments" row: its value is the row's index label,
# repeated over every day in that row's timespan. Concatenating column-wise
# aligns the Series on the union of their dates, leaving NaN on days a row
# does not cover.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
comment_spans = rpd.loc[rpd.entityName == "Comments", "timespan"]
opd = pd.concat(
    [
        pd.Series(row_label, index=days, name="Comments_{}".format(row_label))
        for row_label, days in comment_spans.items()
    ],
    axis=1,
)
opd.head()

In [ ]:
# Plot every column of opd as a thick line (lw=10) on a 3:2 figure that is
# 14 units wide.
opd.plot(lw=10,figsize=(14,14*2/3))

In [ ]:
# Inspect the (row label, timespan) pairs of the "Comments" rows.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
list(rpd[rpd.entityName == "Comments"]['timespan'].items())

In [ ]:
# Display rpd re-indexed by timespanIndex; the result is not assigned, so rpd
# itself is left unchanged.
# NOTE(review): timespanIndex holds one entry per day between the earliest and
# latest date, so its length probably differs from the number of rows in rpd,
# in which case set_index raises a length-mismatch error -- confirm whether
# this cell is still wanted.
rpd.set_index(timespanIndex)

In [ ]: