notebook.community

Edit and run



In [1]:

    
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LinearRegression
import matplotlib.pylab as plt
import pandas as pd
import numpy as np
from pandas.tools.plotting import scatter_matrix



In [32]:

    
import json
from pprint import pprint

with open('/Users/danielkershaw/PycharmProjects/DiffusionSimulation/data/twitter-geo-test') as data_file:
    for l in data_file:
        data = json.loads(l)
    
df = pd.read_json(data["raw"])
import datetime
def dt(X):
    return datetime.datetime.fromtimestamp(float(X / 1000))
df['time'] = df['time'].apply(dt)
df = df.sort(["time"])
df









    



/Users/danielkershaw/Virtualenvs/DiffusionSimulation/lib/python2.7/site-packages/ipykernel/__main__.py:13: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)






    Out[32]:






  
    
      
      ActivateionExposure
      UserExposure
      community
      degree
      early_spread_time
      index
      inffectedCommunities
      inffectedCommunitiesnor
      node
      numberActivatedUsers
      numberActivatedUsersnorm
      numberOfActivations
      pagerank
      surface
      time
      usageEntorpy
      usagedominance
      userUsageEntorpy
      userusagedominance
    
  
  
    
      0
      0
      0
      0
      360
      0
      2
      1
      0.071429
      G9
      1
      0.000344
      1
      0.000726
      360
      2014-10-15 00:37:23
      0.000000
      1.000000
      0.000000
      1.000000
    
    
      1
      1
      1
      9
      564
      501495000
      3
      2
      0.142857
      BT2
      2
      0.000687
      2
      0.002498
      781
      2014-10-20 19:55:38
      0.693147
      0.500000
      0.693147
      0.500000
    
    
      2
      0
      0
      2
      68
      501705000
      4
      3
      0.214286
      CA22
      3
      0.001031
      3
      0.000133
      814
      2014-10-20 19:59:08
      1.098612
      0.333333
      1.098612
      0.333333
    
    
      3
      0
      0
      2
      343
      854916000
      5
      3
      0.214286
      BL1
      4
      0.001375
      4
      0.000363
      1000
      2014-10-24 22:05:59
      1.039721
      0.500000
      1.039721
      0.500000
    
    
      4
      0
      0
      4
      437
      1194094000
      6
      4
      0.285714
      PO6
      5
      0.001718
      5
      0.000569
      1271
      2014-10-28 19:18:57
      1.332179
      0.400000
      1.332179
      0.400000
    
    
      5
      2
      2
      0
      295
      1207783000
      7
      4
      0.285714
      KY99
      6
      0.002062
      6
      0.000779
      1320
      2014-10-28 23:07:06
      1.329661
      0.333333
      1.329661
      0.333333
    
    
      6
      2
      2
      0
      295
      1207814000
      8
      4
      0.285714
      KY99
      6
      0.002062
      7
      0.000779
      1320
      2014-10-28 23:07:37
      1.277034
      0.428571
      1.329661
      0.333333
    
    
      7
      1
      1
      9
      349
      1211950000
      9
      4
      0.285714
      BT80
      7
      0.002405
      8
      0.000801
      1371
      2014-10-29 00:16:33
      1.320888
      0.375000
      1.351784
      0.285714
    
    
      8
      0
      0
      2
      137
      1217039000
      10
      4
      0.285714
      CH27
      8
      0.002749
      9
      0.000183
      1427
      2014-10-29 01:41:22
      1.310784
      0.333333
      1.320888
      0.375000
    
    
      9
      3
      3
      0
      296
      1251249000
      11
      4
      0.285714
      KA1
      9
      0.003093
      10
      0.000852
      1451
      2014-10-29 11:11:32
      1.279854
      0.400000
      1.310784
      0.333333
    
    
      10
      0
      0
      4
      437
      1340709000
      12
      4
      0.285714
      PO6
      9
      0.003093
      11
      0.000569
      1451
      2014-10-30 12:02:32
      1.342113
      0.363636
      1.310784
      0.333333
    
    
      11
      2
      2
      0
      295
      1379927000
      13
      4
      0.285714
      KY99
      9
      0.003093
      12
      0.000779
      1451
      2014-10-30 22:56:10
      1.308605
      0.416667
      1.310784
      0.333333
    
    
      12
      2
      1
      7
      326
      1381213000
      14
      5
      0.357143
      CT10
      10
      0.003436
      13
      0.000330
      1546
      2014-10-30 23:17:36
      1.479133
      0.384615
      1.504788
      0.300000
    
    
      13
      2
      1
      3
      260
      1382018000
      15
      6
      0.428571
      SN99
      11
      0.003780
      14
      0.000304
      1628
      2014-10-30 23:31:01
      1.630799
      0.357143
      1.672625
      0.272727
    
    
      14
      1
      1
      2
      320
      1382979000
      16
      6
      0.428571
      OL16
      12
      0.004124
      15
      0.000250
      1718
      2014-10-30 23:47:02
      1.617053
      0.333333
      1.632631
      0.333333
    
    
      15
      3
      2
      7
      527
      1383075000
      17
      6
      0.428571
      RH13
      13
      0.004467
      16
      0.000422
      1842
      2014-10-30 23:48:38
      1.663136
      0.312500
      1.671595
      0.307692
    
    
      16
      1
      1
      7
      283
      1972660000
      18
      6
      0.428571
      TN23
      14
      0.004811
      17
      0.000242
      1880
      2014-11-06 19:35:03
      1.676696
      0.294118
      1.673118
      0.285714
    
    
      17
      10
      8
      5
      1130
      18045859000
      19
      7
      0.500000
      DG1
      15
      0.005155
      18
      0.000839
      2070
      2015-05-11 21:21:42
      1.798106
      0.277778
      1.806507
      0.266667



In [41]:

    
dt = df.set_index(pd.DatetimeIndex(df['time']))
dt = dt.resample('d').max()
idx = pd.date_range(dt.index[0], dt.index[0] + datetime.timedelta(days=30))
dt.reindex(idx, fill_value=0, method='ffill').fillna(method='ffill')["numberActivatedUsers"]









    Out[41]:





2014-10-15     1.0
2014-10-16     1.0
2014-10-17     1.0
2014-10-18     1.0
2014-10-19     1.0
2014-10-20     3.0
2014-10-21     3.0
2014-10-22     3.0
2014-10-23     3.0
2014-10-24     4.0
2014-10-25     4.0
2014-10-26     4.0
2014-10-27     4.0
2014-10-28     7.0
2014-10-29    10.0
2014-10-30    16.0
2014-10-31    16.0
2014-11-01    16.0
2014-11-02    16.0
2014-11-03    16.0
2014-11-04    16.0
2014-11-05    16.0
2014-11-06    17.0
2014-11-07    17.0
2014-11-08    17.0
2014-11-09    17.0
2014-11-10    17.0
2014-11-11    17.0
2014-11-12    17.0
2014-11-13    17.0
2014-11-14    17.0
Freq: D, Name: numberOfActivations, dtype: float64



In [42]:

    
dt.reindex(idx, fill_value=0, method='ffill').fillna(method='ffill')["numberOfActivations"]









    Out[42]:





2014-10-15     1.0
2014-10-16     1.0
2014-10-17     1.0
2014-10-18     1.0
2014-10-19     1.0
2014-10-20     3.0
2014-10-21     3.0
2014-10-22     3.0
2014-10-23     3.0
2014-10-24     4.0
2014-10-25     4.0
2014-10-26     4.0
2014-10-27     4.0
2014-10-28     7.0
2014-10-29    10.0
2014-10-30    16.0
2014-10-31    16.0
2014-11-01    16.0
2014-11-02    16.0
2014-11-03    16.0
2014-11-04    16.0
2014-11-05    16.0
2014-11-06    17.0
2014-11-07    17.0
2014-11-08    17.0
2014-11-09    17.0
2014-11-10    17.0
2014-11-11    17.0
2014-11-12    17.0
2014-11-13    17.0
2014-11-14    17.0
Freq: D, Name: numberOfActivations, dtype: float64



In [180]:

    
import datetime
def dt(X):
    return datetime.datetime.fromtimestamp(float(X/1000))
df['time'] = df['time'].apply(dt)



In [181]:

    
df.head()









    Out[181]:






  
    
      
      ActivateionExposure
      UserExposure
      community
      degree
      early_spread_time
      index
      inffectedCommunities
      inffectedCommunitiesnor
      node
      numberActivatedUsers
      numberActivatedUsersnorm
      numberOfActivations
      pagerank
      surface
      time
      usageEntorpy
      usagedominance
      userUsageEntorpy
      userusagedominance
    
  
  
    
      0
      0
      0
      7
      422
      0
      2
      1
      0.071429
      RH16
      1
      0.000344
      1
      0.000284
      422
      2014-10-29 17:40:19
      0.000000
      1.000000
      0.000000
      1.000000
    
    
      1
      1
      1
      8
      1599
      2317350000
      3
      2
      0.142857
      CB8
      2
      0.000687
      2
      0.003956
      1666
      2014-11-25 13:22:49
      0.693147
      0.500000
      0.693147
      0.500000
    
    
      2
      2
      2
      9
      734
      2319847000
      4
      3
      0.214286
      BT92
      3
      0.001031
      3
      0.002070
      1844
      2014-11-25 14:04:26
      1.098612
      0.333333
      1.098612
      0.333333
    
    
      3
      1
      1
      8
      276
      2743060000
      5
      3
      0.214286
      CB9
      4
      0.001375
      4
      0.000189
      1864
      2014-11-30 11:37:59
      1.039721
      0.500000
      1.039721
      0.500000
    
    
      4
      1
      1
      12
      325
      4058551000
      6
      4
      0.285714
      DH99
      5
      0.001718
      5
      0.000400
      1930
      2014-12-15 17:02:50
      1.332179
      0.400000
      1.332179
      0.400000



In [198]:

    
def to_date(X):
    return X.day()


dft = df.set_index(pd.DatetimeIndex(df['time']))
start = dft.index.searchsorted(dft.index[0])
end = dft.index.searchsorted(dft.index[0] + datetime.timedelta(days=30))
dft = dft.ix[start:end]
dftt = pd.DataFrame(index = dft.index)
dftt["activations"] = 1
# idx = pd.date_range(dft.index[0].date(), dft.index[0].date() + datetime.timedelta(days=30), freq='1D')
# dft.reindex(idx, fill_value=0)
dftt = dftt.resample('d',how='sum').fillna(0)
dftt["activations"] = (dftt["activations"].cumsum()/dftt["activations"].sum())









    



/Users/danielkershaw/Virtualenvs/DiffusionSimulation/lib/python2.7/site-packages/ipykernel/__main__.py:11: FutureWarning: how in .resample() is deprecated
the new syntax is .resample(...).sum()



In [199]:

    
dftt.reset_index()









    Out[199]:






  
    
      
      index
      activations
    
  
  
    
      0
      2014-10-29
      0.333333
    
    
      1
      2014-10-30
      0.333333
    
    
      2
      2014-10-31
      0.333333
    
    
      3
      2014-11-01
      0.333333
    
    
      4
      2014-11-02
      0.333333
    
    
      5
      2014-11-03
      0.333333
    
    
      6
      2014-11-04
      0.333333
    
    
      7
      2014-11-05
      0.333333
    
    
      8
      2014-11-06
      0.333333
    
    
      9
      2014-11-07
      0.333333
    
    
      10
      2014-11-08
      0.333333
    
    
      11
      2014-11-09
      0.333333
    
    
      12
      2014-11-10
      0.333333
    
    
      13
      2014-11-11
      0.333333
    
    
      14
      2014-11-12
      0.333333
    
    
      15
      2014-11-13
      0.333333
    
    
      16
      2014-11-14
      0.333333
    
    
      17
      2014-11-15
      0.333333
    
    
      18
      2014-11-16
      0.333333
    
    
      19
      2014-11-17
      0.333333
    
    
      20
      2014-11-18
      0.333333
    
    
      21
      2014-11-19
      0.333333
    
    
      22
      2014-11-20
      0.333333
    
    
      23
      2014-11-21
      0.333333
    
    
      24
      2014-11-22
      0.333333
    
    
      25
      2014-11-23
      0.333333
    
    
      26
      2014-11-24
      0.333333
    
    
      27
      2014-11-25
      1.000000



In [187]:

    
y = dftt["activations"]
x = np.arange(0, len(dftt["activations"]))



In [188]:

    
dftt["activations"].mean()









    Out[188]:





0.35714285714285721



In [189]:

    
axes = plt.plot(x, y, 'o');
plt.axis((0,30,0,1))









    Out[189]:





(0, 30, 0, 1)



In [ ]:

	ActivateionExposure	UserExposure	community	degree	early_spread_time	index	inffectedCommunities	inffectedCommunitiesnor	node	numberActivatedUsers	numberActivatedUsersnorm	numberOfActivations	pagerank	surface	time	usageEntorpy	usagedominance	userUsageEntorpy	userusagedominance
0	0	0	0	360	0	2	1	0.071429	G9	1	0.000344	1	0.000726	360	2014-10-15 00:37:23	0.000000	1.000000	0.000000	1.000000
1	1	1	9	564	501495000	3	2	0.142857	BT2	2	0.000687	2	0.002498	781	2014-10-20 19:55:38	0.693147	0.500000	0.693147	0.500000
2	0	0	2	68	501705000	4	3	0.214286	CA22	3	0.001031	3	0.000133	814	2014-10-20 19:59:08	1.098612	0.333333	1.098612	0.333333
3	0	0	2	343	854916000	5	3	0.214286	BL1	4	0.001375	4	0.000363	1000	2014-10-24 22:05:59	1.039721	0.500000	1.039721	0.500000
4	0	0	4	437	1194094000	6	4	0.285714	PO6	5	0.001718	5	0.000569	1271	2014-10-28 19:18:57	1.332179	0.400000	1.332179	0.400000
5	2	2	0	295	1207783000	7	4	0.285714	KY99	6	0.002062	6	0.000779	1320	2014-10-28 23:07:06	1.329661	0.333333	1.329661	0.333333
6	2	2	0	295	1207814000	8	4	0.285714	KY99	6	0.002062	7	0.000779	1320	2014-10-28 23:07:37	1.277034	0.428571	1.329661	0.333333
7	1	1	9	349	1211950000	9	4	0.285714	BT80	7	0.002405	8	0.000801	1371	2014-10-29 00:16:33	1.320888	0.375000	1.351784	0.285714
8	0	0	2	137	1217039000	10	4	0.285714	CH27	8	0.002749	9	0.000183	1427	2014-10-29 01:41:22	1.310784	0.333333	1.320888	0.375000
9	3	3	0	296	1251249000	11	4	0.285714	KA1	9	0.003093	10	0.000852	1451	2014-10-29 11:11:32	1.279854	0.400000	1.310784	0.333333
10	0	0	4	437	1340709000	12	4	0.285714	PO6	9	0.003093	11	0.000569	1451	2014-10-30 12:02:32	1.342113	0.363636	1.310784	0.333333
11	2	2	0	295	1379927000	13	4	0.285714	KY99	9	0.003093	12	0.000779	1451	2014-10-30 22:56:10	1.308605	0.416667	1.310784	0.333333
12	2	1	7	326	1381213000	14	5	0.357143	CT10	10	0.003436	13	0.000330	1546	2014-10-30 23:17:36	1.479133	0.384615	1.504788	0.300000
13	2	1	3	260	1382018000	15	6	0.428571	SN99	11	0.003780	14	0.000304	1628	2014-10-30 23:31:01	1.630799	0.357143	1.672625	0.272727
14	1	1	2	320	1382979000	16	6	0.428571	OL16	12	0.004124	15	0.000250	1718	2014-10-30 23:47:02	1.617053	0.333333	1.632631	0.333333
15	3	2	7	527	1383075000	17	6	0.428571	RH13	13	0.004467	16	0.000422	1842	2014-10-30 23:48:38	1.663136	0.312500	1.671595	0.307692
16	1	1	7	283	1972660000	18	6	0.428571	TN23	14	0.004811	17	0.000242	1880	2014-11-06 19:35:03	1.676696	0.294118	1.673118	0.285714
17	10	8	5	1130	18045859000	19	7	0.500000	DG1	15	0.005155	18	0.000839	2070	2015-05-11 21:21:42	1.798106	0.277778	1.806507	0.266667

	ActivateionExposure	UserExposure	community	degree	early_spread_time	index	inffectedCommunities	inffectedCommunitiesnor	node	numberActivatedUsers	numberActivatedUsersnorm	numberOfActivations	pagerank	surface	time	usageEntorpy	usagedominance	userUsageEntorpy	userusagedominance
0	0	0	7	422	0	2	1	0.071429	RH16	1	0.000344	1	0.000284	422	2014-10-29 17:40:19	0.000000	1.000000	0.000000	1.000000
1	1	1	8	1599	2317350000	3	2	0.142857	CB8	2	0.000687	2	0.003956	1666	2014-11-25 13:22:49	0.693147	0.500000	0.693147	0.500000
2	2	2	9	734	2319847000	4	3	0.214286	BT92	3	0.001031	3	0.002070	1844	2014-11-25 14:04:26	1.098612	0.333333	1.098612	0.333333
3	1	1	8	276	2743060000	5	3	0.214286	CB9	4	0.001375	4	0.000189	1864	2014-11-30 11:37:59	1.039721	0.500000	1.039721	0.500000
4	1	1	12	325	4058551000	6	4	0.285714	DH99	5	0.001718	5	0.000400	1930	2014-12-15 17:02:50	1.332179	0.400000	1.332179	0.400000

	index	activations
0	2014-10-29	0.333333
1	2014-10-30	0.333333
2	2014-10-31	0.333333
3	2014-11-01	0.333333
4	2014-11-02	0.333333
5	2014-11-03	0.333333
6	2014-11-04	0.333333
7	2014-11-05	0.333333
8	2014-11-06	0.333333
9	2014-11-07	0.333333
10	2014-11-08	0.333333
11	2014-11-09	0.333333
12	2014-11-10	0.333333
13	2014-11-11	0.333333
14	2014-11-12	0.333333
15	2014-11-13	0.333333
16	2014-11-14	0.333333
17	2014-11-15	0.333333
18	2014-11-16	0.333333
19	2014-11-17	0.333333
20	2014-11-18	0.333333
21	2014-11-19	0.333333
22	2014-11-20	0.333333
23	2014-11-21	0.333333
24	2014-11-22	0.333333
25	2014-11-23	0.333333
26	2014-11-24	0.333333
27	2014-11-25	1.000000