In [1]:
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LinearRegression
import matplotlib.pylab as plt
import pandas as pd
import numpy as np
from pandas.tools.plotting import scatter_matrix

In [32]:
import json
from pprint import pprint

with open('/Users/danielkershaw/PycharmProjects/DiffusionSimulation/data/twitter-geo-test') as data_file:
    for l in data_file:
        data = json.loads(l)
    
df = pd.read_json(data["raw"])
import datetime
def dt(X):
    return datetime.datetime.fromtimestamp(float(X / 1000))
df['time'] = df['time'].apply(dt)
df = df.sort(["time"])
df


/Users/danielkershaw/Virtualenvs/DiffusionSimulation/lib/python2.7/site-packages/ipykernel/__main__.py:13: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
Out[32]:
ActivateionExposure UserExposure community degree early_spread_time index inffectedCommunities inffectedCommunitiesnor node numberActivatedUsers numberActivatedUsersnorm numberOfActivations pagerank surface time usageEntorpy usagedominance userUsageEntorpy userusagedominance
0 0 0 0 360 0 2 1 0.071429 G9 1 0.000344 1 0.000726 360 2014-10-15 00:37:23 0.000000 1.000000 0.000000 1.000000
1 1 1 9 564 501495000 3 2 0.142857 BT2 2 0.000687 2 0.002498 781 2014-10-20 19:55:38 0.693147 0.500000 0.693147 0.500000
2 0 0 2 68 501705000 4 3 0.214286 CA22 3 0.001031 3 0.000133 814 2014-10-20 19:59:08 1.098612 0.333333 1.098612 0.333333
3 0 0 2 343 854916000 5 3 0.214286 BL1 4 0.001375 4 0.000363 1000 2014-10-24 22:05:59 1.039721 0.500000 1.039721 0.500000
4 0 0 4 437 1194094000 6 4 0.285714 PO6 5 0.001718 5 0.000569 1271 2014-10-28 19:18:57 1.332179 0.400000 1.332179 0.400000
5 2 2 0 295 1207783000 7 4 0.285714 KY99 6 0.002062 6 0.000779 1320 2014-10-28 23:07:06 1.329661 0.333333 1.329661 0.333333
6 2 2 0 295 1207814000 8 4 0.285714 KY99 6 0.002062 7 0.000779 1320 2014-10-28 23:07:37 1.277034 0.428571 1.329661 0.333333
7 1 1 9 349 1211950000 9 4 0.285714 BT80 7 0.002405 8 0.000801 1371 2014-10-29 00:16:33 1.320888 0.375000 1.351784 0.285714
8 0 0 2 137 1217039000 10 4 0.285714 CH27 8 0.002749 9 0.000183 1427 2014-10-29 01:41:22 1.310784 0.333333 1.320888 0.375000
9 3 3 0 296 1251249000 11 4 0.285714 KA1 9 0.003093 10 0.000852 1451 2014-10-29 11:11:32 1.279854 0.400000 1.310784 0.333333
10 0 0 4 437 1340709000 12 4 0.285714 PO6 9 0.003093 11 0.000569 1451 2014-10-30 12:02:32 1.342113 0.363636 1.310784 0.333333
11 2 2 0 295 1379927000 13 4 0.285714 KY99 9 0.003093 12 0.000779 1451 2014-10-30 22:56:10 1.308605 0.416667 1.310784 0.333333
12 2 1 7 326 1381213000 14 5 0.357143 CT10 10 0.003436 13 0.000330 1546 2014-10-30 23:17:36 1.479133 0.384615 1.504788 0.300000
13 2 1 3 260 1382018000 15 6 0.428571 SN99 11 0.003780 14 0.000304 1628 2014-10-30 23:31:01 1.630799 0.357143 1.672625 0.272727
14 1 1 2 320 1382979000 16 6 0.428571 OL16 12 0.004124 15 0.000250 1718 2014-10-30 23:47:02 1.617053 0.333333 1.632631 0.333333
15 3 2 7 527 1383075000 17 6 0.428571 RH13 13 0.004467 16 0.000422 1842 2014-10-30 23:48:38 1.663136 0.312500 1.671595 0.307692
16 1 1 7 283 1972660000 18 6 0.428571 TN23 14 0.004811 17 0.000242 1880 2014-11-06 19:35:03 1.676696 0.294118 1.673118 0.285714
17 10 8 5 1130 18045859000 19 7 0.500000 DG1 15 0.005155 18 0.000839 2070 2015-05-11 21:21:42 1.798106 0.277778 1.806507 0.266667

In [41]:
dt = df.set_index(pd.DatetimeIndex(df['time']))
dt = dt.resample('d').max()
idx = pd.date_range(dt.index[0], dt.index[0] + datetime.timedelta(days=30))
dt.reindex(idx, fill_value=0, method='ffill').fillna(method='ffill')["numberActivatedUsers"]


Out[41]:
2014-10-15     1.0
2014-10-16     1.0
2014-10-17     1.0
2014-10-18     1.0
2014-10-19     1.0
2014-10-20     3.0
2014-10-21     3.0
2014-10-22     3.0
2014-10-23     3.0
2014-10-24     4.0
2014-10-25     4.0
2014-10-26     4.0
2014-10-27     4.0
2014-10-28     7.0
2014-10-29    10.0
2014-10-30    16.0
2014-10-31    16.0
2014-11-01    16.0
2014-11-02    16.0
2014-11-03    16.0
2014-11-04    16.0
2014-11-05    16.0
2014-11-06    17.0
2014-11-07    17.0
2014-11-08    17.0
2014-11-09    17.0
2014-11-10    17.0
2014-11-11    17.0
2014-11-12    17.0
2014-11-13    17.0
2014-11-14    17.0
Freq: D, Name: numberOfActivations, dtype: float64

In [42]:
dt.reindex(idx, fill_value=0, method='ffill').fillna(method='ffill')["numberOfActivations"]


Out[42]:
2014-10-15     1.0
2014-10-16     1.0
2014-10-17     1.0
2014-10-18     1.0
2014-10-19     1.0
2014-10-20     3.0
2014-10-21     3.0
2014-10-22     3.0
2014-10-23     3.0
2014-10-24     4.0
2014-10-25     4.0
2014-10-26     4.0
2014-10-27     4.0
2014-10-28     7.0
2014-10-29    10.0
2014-10-30    16.0
2014-10-31    16.0
2014-11-01    16.0
2014-11-02    16.0
2014-11-03    16.0
2014-11-04    16.0
2014-11-05    16.0
2014-11-06    17.0
2014-11-07    17.0
2014-11-08    17.0
2014-11-09    17.0
2014-11-10    17.0
2014-11-11    17.0
2014-11-12    17.0
2014-11-13    17.0
2014-11-14    17.0
Freq: D, Name: numberOfActivations, dtype: float64

In [180]:
import datetime
def dt(X):
    return datetime.datetime.fromtimestamp(float(X/1000))
df['time'] = df['time'].apply(dt)

In [181]:
df.head()


Out[181]:
ActivateionExposure UserExposure community degree early_spread_time index inffectedCommunities inffectedCommunitiesnor node numberActivatedUsers numberActivatedUsersnorm numberOfActivations pagerank surface time usageEntorpy usagedominance userUsageEntorpy userusagedominance
0 0 0 7 422 0 2 1 0.071429 RH16 1 0.000344 1 0.000284 422 2014-10-29 17:40:19 0.000000 1.000000 0.000000 1.000000
1 1 1 8 1599 2317350000 3 2 0.142857 CB8 2 0.000687 2 0.003956 1666 2014-11-25 13:22:49 0.693147 0.500000 0.693147 0.500000
2 2 2 9 734 2319847000 4 3 0.214286 BT92 3 0.001031 3 0.002070 1844 2014-11-25 14:04:26 1.098612 0.333333 1.098612 0.333333
3 1 1 8 276 2743060000 5 3 0.214286 CB9 4 0.001375 4 0.000189 1864 2014-11-30 11:37:59 1.039721 0.500000 1.039721 0.500000
4 1 1 12 325 4058551000 6 4 0.285714 DH99 5 0.001718 5 0.000400 1930 2014-12-15 17:02:50 1.332179 0.400000 1.332179 0.400000

In [198]:
def to_date(X):
    return X.day()


dft = df.set_index(pd.DatetimeIndex(df['time']))
start = dft.index.searchsorted(dft.index[0])
end = dft.index.searchsorted(dft.index[0] + datetime.timedelta(days=30))
dft = dft.ix[start:end]
dftt = pd.DataFrame(index = dft.index)
dftt["activations"] = 1
# idx = pd.date_range(dft.index[0].date(), dft.index[0].date() + datetime.timedelta(days=30), freq='1D')
# dft.reindex(idx, fill_value=0)
dftt = dftt.resample('d',how='sum').fillna(0)
dftt["activations"] = (dftt["activations"].cumsum()/dftt["activations"].sum())


/Users/danielkershaw/Virtualenvs/DiffusionSimulation/lib/python2.7/site-packages/ipykernel/__main__.py:11: FutureWarning: how in .resample() is deprecated
the new syntax is .resample(...).sum()

In [199]:
dftt.reset_index()


Out[199]:
index activations
0 2014-10-29 0.333333
1 2014-10-30 0.333333
2 2014-10-31 0.333333
3 2014-11-01 0.333333
4 2014-11-02 0.333333
5 2014-11-03 0.333333
6 2014-11-04 0.333333
7 2014-11-05 0.333333
8 2014-11-06 0.333333
9 2014-11-07 0.333333
10 2014-11-08 0.333333
11 2014-11-09 0.333333
12 2014-11-10 0.333333
13 2014-11-11 0.333333
14 2014-11-12 0.333333
15 2014-11-13 0.333333
16 2014-11-14 0.333333
17 2014-11-15 0.333333
18 2014-11-16 0.333333
19 2014-11-17 0.333333
20 2014-11-18 0.333333
21 2014-11-19 0.333333
22 2014-11-20 0.333333
23 2014-11-21 0.333333
24 2014-11-22 0.333333
25 2014-11-23 0.333333
26 2014-11-24 0.333333
27 2014-11-25 1.000000

In [187]:
y = dftt["activations"]
x = np.arange(0, len(dftt["activations"]))

In [188]:
dftt["activations"].mean()


Out[188]:
0.35714285714285721

In [189]:
axes = plt.plot(x, y, 'o');
plt.axis((0,30,0,1))


Out[189]:
(0, 30, 0, 1)

In [ ]: