In [6]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib
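
%pylab inline pulls numpy and matplotlib into the interactive namespace (hence the message above). If you prefer explicit imports, the rough equivalent is:

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt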

Let's import psycopg2, sqlalchemy, pandas and numpy.


In [1]:
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine 
import pandas as pd
from pandas.io import sql
import datetime as dt
import numpy as np

In [2]:
print "numpy: ", np.__version__
print "pandas: ", pd.__version__
print "sqlalchamy: ", sqlalchemy.__version__
print "psycopg2: ", psycopg2.__version__


numpy:  1.9.2
pandas:  0.16.0
sqlalchemy:  0.9.9
psycopg2:  2.6 (dt dec pq3 ext lo64)

In [3]:
# connection to postgresql database
#conn = psycopg2.connect("dbname=test user=postgres")
engine = create_engine('postgresql://postgres:password@localhost:5433/cdrstats-billing')
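
For reference, the commented-out psycopg2 route would look roughly like this against the same database (a sketch; host, port, database name and credentials are simply copied from the SQLAlchemy URL above):

# direct psycopg2 connection, equivalent to the engine URL above
conn = psycopg2.connect(host='localhost', port=5433,
                        dbname='cdrstats-billing',
                        user='postgres', password='password')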

In [4]:
result = engine.execute("select * from voip_switch")
for row in result:
    print "voip_switch:", row['name'], row['ipaddress']


voip_switch: localhost 127.0.0.1
voip_switch: superhost 128.128.0.0
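
If the query ever needs parameters, bound parameters via sqlalchemy.text keep values out of the SQL string (a sketch with a hypothetical filter on the switch name):

from sqlalchemy import text

result = engine.execute(text("select * from voip_switch where name = :name"),
                        name='localhost')
for row in result:
    print "voip_switch:", row['name'], row['ipaddress']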

In [5]:
df = sql.read_sql("""SELECT
    dateday,
    switch_id,
    coalesce(nbcalls,0) AS nbcalls,
    coalesce(duration,0) AS duration,
    coalesce(billsec,0) AS billsec,
    coalesce(buy_cost,0) AS buy_cost,
    coalesce(sell_cost,0) AS sell_cost
FROM
    generate_series(
                    date_trunc('hour', current_timestamp - interval '48' hour),
                    date_trunc('hour', current_timestamp + interval '2' hour),
                    '1 hour')
    as dateday
LEFT OUTER JOIN (
    SELECT
        date_trunc('hour', starting_date) as dayhour,
        switch_id as switch_id,
        SUM(nbcalls) as nbcalls,
        SUM(duration) as duration,
        SUM(billsec) as billsec,
        SUM(buy_cost) as buy_cost,
        SUM(sell_cost) as sell_cost
    FROM matv_voip_cdr_aggr_hour
    WHERE
        starting_date > date_trunc('hour', current_timestamp - interval '48' hour) and
        starting_date <= date_trunc('hour', current_timestamp + interval '2' hour)
        
    GROUP BY dayhour, switch_id
    ) results
ON (dateday = results.dayhour)""", engine)
# index_col=["dateday", "switch_id"]
df.head()


Out[5]:
   dateday                    switch_id  nbcalls  duration  billsec    buy_cost   sell_cost
0  2015-03-29 16:00:00+02:00        NaN        0         0        0     0.00000     0.00000
1  2015-03-29 17:00:00+02:00        NaN        0         0        0     0.00000     0.00000
2  2015-03-29 18:00:00+02:00          1    18724   1417349  1132352  9428.30621  9331.89774
3  2015-03-29 19:00:00+02:00        NaN        0         0        0     0.00000     0.00000
4  2015-03-29 20:00:00+02:00        NaN        0         0        0     0.00000     0.00000
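
For reference, the commented-out index_col above shows that read_sql can also set the index and parse timestamps in one step (a sketch, where query stands for the SQL string from the cell above):

df = sql.read_sql(query, engine,
                  index_col=['dateday'],
                  parse_dates=['dateday'])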

In [6]:
df.dateday = pd.to_datetime(df.dateday)
df.dateday


Out[6]:
0     2015-03-29 16:00:00+02:00
1     2015-03-29 17:00:00+02:00
2     2015-03-29 18:00:00+02:00
3     2015-03-29 19:00:00+02:00
4     2015-03-29 20:00:00+02:00
5     2015-03-29 21:00:00+02:00
6     2015-03-29 22:00:00+02:00
7     2015-03-29 23:00:00+02:00
8     2015-03-30 00:00:00+02:00
9     2015-03-30 01:00:00+02:00
10    2015-03-30 02:00:00+02:00
11    2015-03-30 03:00:00+02:00
12    2015-03-30 04:00:00+02:00
13    2015-03-30 05:00:00+02:00
14    2015-03-30 06:00:00+02:00
15    2015-03-30 07:00:00+02:00
16    2015-03-30 08:00:00+02:00
17    2015-03-30 09:00:00+02:00
18    2015-03-30 10:00:00+02:00
19    2015-03-30 11:00:00+02:00
20    2015-03-30 12:00:00+02:00
21    2015-03-30 13:00:00+02:00
22    2015-03-30 14:00:00+02:00
23    2015-03-30 15:00:00+02:00
24    2015-03-30 16:00:00+02:00
25    2015-03-30 17:00:00+02:00
26    2015-03-30 18:00:00+02:00
27    2015-03-30 19:00:00+02:00
28    2015-03-30 20:00:00+02:00
29    2015-03-30 21:00:00+02:00
30    2015-03-30 22:00:00+02:00
31    2015-03-30 23:00:00+02:00
32    2015-03-31 00:00:00+02:00
33    2015-03-31 01:00:00+02:00
34    2015-03-31 02:00:00+02:00
35    2015-03-31 03:00:00+02:00
36    2015-03-31 04:00:00+02:00
37    2015-03-31 05:00:00+02:00
38    2015-03-31 06:00:00+02:00
39    2015-03-31 07:00:00+02:00
40    2015-03-31 08:00:00+02:00
41    2015-03-31 09:00:00+02:00
42    2015-03-31 10:00:00+02:00
43    2015-03-31 11:00:00+02:00
44    2015-03-31 12:00:00+02:00
45    2015-03-31 13:00:00+02:00
46    2015-03-31 14:00:00+02:00
47    2015-03-31 15:00:00+02:00
48    2015-03-31 16:00:00+02:00
49    2015-03-31 17:00:00+02:00
50    2015-03-31 18:00:00+02:00
Name: dateday, dtype: object
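
The column keeps dtype object because each entry is a timezone-aware Timestamp built with psycopg2's FixedOffsetTimezone. A sketch for normalizing everything to a single timezone (assuming UTC is acceptable):

# convert all timestamps to UTC; df.dateday is assumed to hold tz-aware values
df.dateday = pd.to_datetime(df.dateday, utc=True)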

In [7]:
df.set_index('dateday', inplace=True)

In [8]:
df.index[0], df.index[1]


Out[8]:
(Timestamp('2015-03-29 16:00:00+0200', tz='psycopg2.tz.FixedOffsetTimezone(offset=120, name=None)'),
 Timestamp('2015-03-29 17:00:00+0200', tz='psycopg2.tz.FixedOffsetTimezone(offset=120, name=None)'))

In [9]:
df = df.reindex(pd.date_range(min(df.index), max(df.index)), fill_value=0)

In [13]:
df['nbcalls']


Out[13]:
2015-03-29 16:00:00+02:00    0
2015-03-30 16:00:00+02:00    0
2015-03-31 16:00:00+02:00    0
Freq: D, Name: nbcalls, dtype: float64
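
pd.date_range defaults to daily frequency, which is why only three rows survive the reindex above (note the Freq: D). For the hourly series we actually want, the frequency has to be passed explicitly (a sketch):

df = df.reindex(pd.date_range(min(df.index), max(df.index), freq='H'),
                fill_value=0)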

In [12]:
df['nbcalls'].iloc[0]


Out[12]:
0.0

In [14]:
df.index.names


Out[14]:
FrozenList([None])
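
The reindex also replaced the named index from set_index with an anonymous DatetimeIndex, hence FrozenList([None]). The name can be put back with:

df.index.name = 'dateday'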

In [18]:
table = pd.tools.pivot.pivot_table(df, 
        values=['nbcalls', 'duration', 'billsec', 'buy_cost', 'sell_cost'], 
        index=['dateday'],
        columns=['dateday', 'switch_id'], fill_value=0)
table


Out[18]:
billsec buy_cost duration nbcalls sell_cost switch_id
dateday 0 0 0 0 0 0
switch_id 0 0 0 0 0 0

The pivot above doesn't give what we want: dateday was already turned into the index, so it is no longer available as a column. Let's re-read the data and keep dateday as a regular column, not using index_col in read_sql.


In [19]:
df = sql.read_sql("""SELECT
    dateday,
    switch_id,
    coalesce(nbcalls,0) AS nbcalls,
    coalesce(duration,0) AS duration,
    coalesce(billsec,0) AS billsec,
    coalesce(buy_cost,0) AS buy_cost,
    coalesce(sell_cost,0) AS sell_cost
FROM
    generate_series(
                    date_trunc('hour', current_timestamp - interval '50' hour),
                    date_trunc('hour', current_timestamp - interval '30' hour),
                    '1 hour')
    as dateday
LEFT OUTER JOIN (
    SELECT
        date_trunc('hour', starting_date) as dayhour,
        switch_id as switch_id,
        SUM(nbcalls) as nbcalls,
        SUM(duration) as duration,
        SUM(billsec) as billsec,
        SUM(buy_cost) as buy_cost,
        SUM(sell_cost) as sell_cost
    FROM matv_voip_cdr_aggr_hour
    WHERE
        starting_date > date_trunc('hour', current_timestamp - interval '50' hour) and
        starting_date <= date_trunc('hour', current_timestamp - interval '30' hour)
        
    GROUP BY dayhour, switch_id
    ) results
ON (dateday = results.dayhour)""", engine)
print df
totals = df.groupby('switch_id').size()
print totals


                      dateday  switch_id  nbcalls  duration  billsec  \
0   2015-03-29 14:00:00+02:00        NaN        0         0        0   
1   2015-03-29 15:00:00+02:00        NaN        0         0        0   
2   2015-03-29 16:00:00+02:00        NaN        0         0        0   
3   2015-03-29 17:00:00+02:00        NaN        0         0        0   
4   2015-03-29 18:00:00+02:00          1    18724   1417349  1132352   
5   2015-03-29 19:00:00+02:00        NaN        0         0        0   
6   2015-03-29 20:00:00+02:00        NaN        0         0        0   
7   2015-03-29 21:00:00+02:00        NaN        0         0        0   
8   2015-03-29 22:00:00+02:00        NaN        0         0        0   
9   2015-03-29 23:00:00+02:00        NaN        0         0        0   
10  2015-03-30 00:00:00+02:00        NaN        0         0        0   
11  2015-03-30 01:00:00+02:00        NaN        0         0        0   
12  2015-03-30 02:00:00+02:00        NaN        0         0        0   
13  2015-03-30 03:00:00+02:00        NaN        0         0        0   
14  2015-03-30 04:00:00+02:00        NaN        0         0        0   
15  2015-03-30 05:00:00+02:00        NaN        0         0        0   
16  2015-03-30 06:00:00+02:00        NaN        0         0        0   
17  2015-03-30 07:00:00+02:00        NaN        0         0        0   
18  2015-03-30 08:00:00+02:00        NaN        0         0        0   
19  2015-03-30 09:00:00+02:00        NaN        0         0        0   
20  2015-03-30 10:00:00+02:00        NaN        0         0        0   

      buy_cost   sell_cost  
0      0.00000     0.00000  
1      0.00000     0.00000  
2      0.00000     0.00000  
3      0.00000     0.00000  
4   9428.30621  9331.89774  
5      0.00000     0.00000  
6      0.00000     0.00000  
7      0.00000     0.00000  
8      0.00000     0.00000  
9      0.00000     0.00000  
10     0.00000     0.00000  
11     0.00000     0.00000  
12     0.00000     0.00000  
13     0.00000     0.00000  
14     0.00000     0.00000  
15     0.00000     0.00000  
16     0.00000     0.00000  
17     0.00000     0.00000  
18     0.00000     0.00000  
19     0.00000     0.00000  
20     0.00000     0.00000  
switch_id
1    1
dtype: int64
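
groupby('switch_id').size() silently drops rows whose switch_id is NaN, which is why only switch 1 appears, with a count of 1. A sketch for counting the missing rows as well (assuming a pandas where value_counts accepts dropna):

print df['switch_id'].value_counts(dropna=False)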

In [25]:
df.update(df.switch_id.fillna(0))
df


Out[25]:
    dateday                    switch_id  nbcalls  duration  billsec    buy_cost   sell_cost
0   2015-03-29 14:00:00+02:00          0        0         0        0     0.00000     0.00000
1   2015-03-29 15:00:00+02:00          0        0         0        0     0.00000     0.00000
2   2015-03-29 16:00:00+02:00          0        0         0        0     0.00000     0.00000
3   2015-03-29 17:00:00+02:00          0        0         0        0     0.00000     0.00000
4   2015-03-29 18:00:00+02:00          1    18724   1417349  1132352  9428.30621  9331.89774
5   2015-03-29 19:00:00+02:00          0        0         0        0     0.00000     0.00000
6   2015-03-29 20:00:00+02:00          0        0         0        0     0.00000     0.00000
7   2015-03-29 21:00:00+02:00          0        0         0        0     0.00000     0.00000
8   2015-03-29 22:00:00+02:00          0        0         0        0     0.00000     0.00000
9   2015-03-29 23:00:00+02:00          0        0         0        0     0.00000     0.00000
10  2015-03-30 00:00:00+02:00          0        0         0        0     0.00000     0.00000
11  2015-03-30 01:00:00+02:00          0        0         0        0     0.00000     0.00000
12  2015-03-30 02:00:00+02:00          0        0         0        0     0.00000     0.00000
13  2015-03-30 03:00:00+02:00          0        0         0        0     0.00000     0.00000
14  2015-03-30 04:00:00+02:00          0        0         0        0     0.00000     0.00000
15  2015-03-30 05:00:00+02:00          0        0         0        0     0.00000     0.00000
16  2015-03-30 06:00:00+02:00          0        0         0        0     0.00000     0.00000
17  2015-03-30 07:00:00+02:00          0        0         0        0     0.00000     0.00000
18  2015-03-30 08:00:00+02:00          0        0         0        0     0.00000     0.00000
19  2015-03-30 09:00:00+02:00          0        0         0        0     0.00000     0.00000
20  2015-03-30 10:00:00+02:00          0        0         0        0     0.00000     0.00000
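
df.update(df.switch_id.fillna(0)) does the job, but a plain column assignment is the more direct equivalent:

df['switch_id'] = df['switch_id'].fillna(0)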

In [26]:
table = pd.tools.pivot.pivot_table(df,
        values=['nbcalls', 'duration', 'billsec', 'buy_cost', 'sell_cost'],
        index=['dateday'],
        columns=['switch_id'],
        fill_value=0)
table


Out[26]:
                           nbcalls        duration         billsec          buy_cost             sell_cost
switch_id                    0      1       0        1       0        1       0           1        0           1
dateday
2015-03-29 14:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-29 15:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-29 16:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-29 17:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-29 18:00:00+02:00    0  18724       0  1417349       0  1132352       0  9428.30621        0  9331.89774
2015-03-29 19:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-29 20:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-29 21:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-29 22:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-29 23:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-30 00:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-30 01:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-30 02:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-30 03:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-30 04:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-30 05:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-30 06:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-30 07:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-30 08:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-30 09:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
2015-03-30 10:00:00+02:00    0      0       0        0       0        0       0     0.00000        0     0.00000
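
The switch_id 0 columns are an artifact of filling the missing switch_id with 0 above. To look at one real switch across all metrics, a cross-section works (a sketch, assuming switch 1 is the one of interest and that the 0/1 labels match the column level's dtype). Incidentally, pd.pivot_table is the public alias for pd.tools.pivot.pivot_table.

# all metrics for switch 1 only; drops the switch_id column level
table.xs(1, axis=1, level='switch_id')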

In [15]:
table.dtypes


Out[15]:
           switch_id
nbcalls    1              int64
           2              int64
duration   1              int64
           2              int64
billsec    1              int64
           2              int64
buy_cost   1            float64
           2            float64
sell_cost  1            float64
           2            float64
dtype: object

In [16]:
table.nbcalls


Out[16]:
switch_id                    1.0  2.0
dateday
2015-03-19 23:00:00+01:00    452  433
2015-03-20 00:00:00+01:00    435  407
2015-03-20 01:00:00+01:00    446  427
2015-03-20 02:00:00+01:00    436  384
2015-03-20 03:00:00+01:00    417  479
2015-03-20 04:00:00+01:00    421  419
2015-03-20 05:00:00+01:00    473  389
2015-03-20 06:00:00+01:00    469  431
2015-03-20 07:00:00+01:00    456  430
2015-03-20 08:00:00+01:00    424  449
2015-03-20 09:00:00+01:00    423  456
2015-03-20 10:00:00+01:00    424  435
2015-03-20 11:00:00+01:00    425  410
2015-03-20 12:00:00+01:00    435  454
2015-03-20 13:00:00+01:00    361  387
2015-03-20 18:00:00+01:00  18758    0

In [17]:
table.nbcalls.describe()


Out[17]:
                 1.0         2.0
count      16.000000   16.000000
mean     1578.437500  399.375000
std      4581.288566  109.741439
min       361.000000    0.000000
25%       423.750000  402.500000
50%       435.000000  428.500000
75%       453.000000  438.500000
max     18758.000000  479.000000

In [142]:
table


Out[142]:
                           nbcalls  duration  billsec  buy_cost   sell_cost
switch_id                        1         1        1         1           1
dateday
2015-03-21 18:00:00+01:00    18691   1411681  1129709      9299  9256.68949

In [143]:
table.describe()


Out[143]:
       nbcalls  duration  billsec  buy_cost   sell_cost
             1         1        1         1           1
count        1         1        1         1     1.00000
mean     18691   1411681  1129709      9299  9256.68949
std        NaN       NaN      NaN       NaN         NaN
min      18691   1411681  1129709      9299  9256.68949
25%      18691   1411681  1129709      9299  9256.68949
50%      18691   1411681  1129709      9299  9256.68949
75%      18691   1411681  1129709      9299  9256.68949
max      18691   1411681  1129709      9299  9256.68949

Transpose


In [13]:
table.nbcalls.unstack(0)


Out[13]:
switch_id  dateday                  
1          2015-03-19 13:00:00+01:00      163
           2015-03-19 14:00:00+01:00      549
           2015-03-19 15:00:00+01:00      558
           2015-03-19 16:00:00+01:00      450
           2015-03-19 17:00:00+01:00      407
           2015-03-19 18:00:00+01:00    19110
           2015-03-19 19:00:00+01:00      461
           2015-03-19 20:00:00+01:00      452
2          2015-03-19 13:00:00+01:00       66
           2015-03-19 14:00:00+01:00      431
           2015-03-19 15:00:00+01:00      423
           2015-03-19 16:00:00+01:00      464
           2015-03-19 17:00:00+01:00      454
           2015-03-19 18:00:00+01:00      432
           2015-03-19 19:00:00+01:00      418
           2015-03-19 20:00:00+01:00      469
dtype: int64

In [40]:
ntable = table.nbcalls.T
ntable


Out[40]:
dateday 2015-03-19 22:00:00+01:00 2015-03-19 23:00:00+01:00 2015-03-20 00:00:00+01:00 2015-03-20 01:00:00+01:00 2015-03-20 02:00:00+01:00 2015-03-20 03:00:00+01:00 2015-03-20 04:00:00+01:00 2015-03-20 05:00:00+01:00 2015-03-20 06:00:00+01:00 2015-03-20 07:00:00+01:00 2015-03-20 08:00:00+01:00 2015-03-20 09:00:00+01:00 2015-03-20 10:00:00+01:00 2015-03-20 11:00:00+01:00 2015-03-20 12:00:00+01:00 2015-03-20 17:00:00+01:00
switch_id
1 452 435 446 436 417 421 473 469 456 424 423 424 425 435 361 18758
2 433 407 427 384 479 419 389 431 430 449 456 435 410 454 387 0

In [44]:
ntable.columns


Out[44]:
<class 'pandas.tseries.index.DatetimeIndex'>
[2015-03-19 23:00:00+01:00, ..., 2015-03-20 18:00:00+01:00]
Length: 16, Freq: None, Timezone: psycopg2.tz.FixedOffsetTimezone(offset=60, name=None)

In [69]:
table['nbcalls'].T


Out[69]:
dateday 2015-03-19 12:00:00+01:00 2015-03-19 13:00:00+01:00 2015-03-19 14:00:00+01:00 2015-03-19 15:00:00+01:00 2015-03-19 16:00:00+01:00 2015-03-19 17:00:00+01:00 2015-03-19 18:00:00+01:00 2015-03-19 19:00:00+01:00
switch_id
1 163 712 1270 1720 2127 21237 21698 22150
2 66 497 920 1384 1838 2270 2688 3157

In [83]:
first_column = table.nbcalls.columns.tolist()[0]
second_column = table.nbcalls.columns.tolist()[1]
num_switch = len(table.nbcalls.columns.tolist())
(first_column, second_column, num_switch, table.nbcalls.columns.tolist())


Out[83]:
(1.0, 2.0, 2, [1.0, 2.0])

In [87]:
ntable.loc[1.0], ntable.loc[2.0]


Out[87]:
(dateday
 2015-03-19 13:00:00+01:00      163
 2015-03-19 14:00:00+01:00      712
 2015-03-19 15:00:00+01:00     1270
 2015-03-19 16:00:00+01:00     1720
 2015-03-19 17:00:00+01:00     2127
 2015-03-19 18:00:00+01:00    21237
 2015-03-19 19:00:00+01:00    21698
 2015-03-19 20:00:00+01:00    22150
 Name: 1.0, dtype: float64, dateday
 2015-03-19 13:00:00+01:00      66
 2015-03-19 14:00:00+01:00     497
 2015-03-19 15:00:00+01:00     920
 2015-03-19 16:00:00+01:00    1384
 2015-03-19 17:00:00+01:00    1838
 2015-03-19 18:00:00+01:00    2270
 2015-03-19 19:00:00+01:00    2688
 2015-03-19 20:00:00+01:00    3157
 Name: 2.0, dtype: float64)

In [91]:
for i in ntable.loc[second_column]: print i


66.0
497.0
920.0
1384.0
1838.0
2270.0
2688.0
3157.0

In [90]:
list(ntable.loc[second_column])


Out[90]:
[66.0, 497.0, 920.0, 1384.0, 1838.0, 2270.0, 2688.0, 3157.0]

In [93]:
for i in table.index: print i


2015-03-19 17:00:00+01:00
2015-03-19 18:00:00+01:00
2015-03-19 19:00:00+01:00
2015-03-19 20:00:00+01:00
2015-03-19 21:00:00+01:00
2015-03-19 22:00:00+01:00
2015-03-19 23:00:00+01:00
2015-03-20 00:00:00+01:00

In [92]:
table.nbcalls.values


Out[92]:
array([[19110,   432],
       [  461,   418],
       [  452,   469],
       [  437,   402],
       [  442,   446],
       [  452,   433],
       [  435,   407],
       [  446,   427]])

In [17]:
table = table.cumsum()

In [18]:
import matplotlib.pyplot as plt

In [19]:
plt.figure(); table.plot(); plt.legend(loc='best')


Out[19]:
<matplotlib.legend.Legend at 0x5260a10>
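
table.plot() draws every (metric, switch) column at once. To plot only the cumulative number of calls (a sketch):

plt.figure()
table['nbcalls'].plot()
plt.ylabel('cumulative nbcalls')
plt.legend(loc='best')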

In [37]:
table.nbcalls.columns.tolist()


Out[37]:
2.0