In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [43]:
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine 
import pandas as pd
from pandas.io import sql
from pandas.io.sql import read_sql
import datetime as dt
import numpy as np

In [44]:
engine = create_engine('postgresql://postgres:password@localhost:5433/cdrstats-billing')

In [45]:
result = engine.execute("select * from events")
result


Out[45]:
<sqlalchemy.engine.result.ResultProxy at 0x64dd9d0>

In [84]:
query = """\
SELECT 
    date(starting_date) as date, 
    count(*) as count
FROM events
GROUP BY 1"""
df = read_sql(query, engine)

In [85]:
df.head()


Out[85]:
date count
0 2015-03-17 1
1 2015-02-27 3
2 2015-02-28 1
3 2015-03-16 1
4 2015-03-06 1

In [86]:
df.plot()


Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0x65f4250>

In [87]:
df.date = pd.to_datetime(df.date)
df.set_index('date', inplace=True)
df = df.reindex(pd.date_range(min(df.index), max(df.index)), fill_value=0)

In [88]:
pd.date_range(min(df.index), max(df.index))


Out[88]:
<class 'pandas.tseries.index.DatetimeIndex'>
[2015-02-24, ..., 2015-03-23]
Length: 28, Freq: D, Timezone: None

In [89]:
df.plot()


Out[89]:
<matplotlib.axes._subplots.AxesSubplot at 0x6ca5790>

In [90]:
import matplotlib.pyplot as plt
import numpy as np

In [91]:
x = np.linspace(0, 3*np.pi, 500)
plt.plot(x, np.sin(x**2))
plt.title('A simple chirp');



In [98]:
a = np.arange(100, 150)
a24 = np.random.choice(a, size=23)
a24


Out[98]:
array([112, 130, 144, 138, 118, 147, 114, 138, 101, 116, 122, 130, 103,
       105, 147, 145, 109, 112, 136, 117, 103, 126, 106])

In [99]:
s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D'))
s


Out[99]:
0   2012-01-01
1   2012-01-02
2   2012-01-03
dtype: datetime64[ns]

In [100]:
type(s)


Out[100]:
pandas.core.series.Series

In [101]:
td = pd.Series([ pd.Timedelta(hours=i) for i in range(23) ])
td


Out[101]:
0    00:00:00
1    01:00:00
2    02:00:00
3    03:00:00
4    04:00:00
5    05:00:00
6    06:00:00
7    07:00:00
8    08:00:00
9    09:00:00
10   10:00:00
11   11:00:00
12   12:00:00
13   13:00:00
14   14:00:00
15   15:00:00
16   16:00:00
17   17:00:00
18   18:00:00
19   19:00:00
20   20:00:00
21   21:00:00
22   22:00:00
dtype: timedelta64[ns]

In [185]:
tda = pd.Series(a24)
data = {'hour': td, 'value': tda}
df = pd.DataFrame(data=data)
df.head()


Out[185]:
hour value
0 00:00:00 112
1 01:00:00 130
2 02:00:00 144
3 03:00:00 138
4 04:00:00 118

In [147]:
td = pd.Series([ pd.Timedelta(hours=i) for i in range(24) ])
td.head()


Out[147]:
0   00:00:00
1   01:00:00
2   02:00:00
3   03:00:00
4   04:00:00
dtype: timedelta64[ns]

In [148]:
df.head()
df.value.tail()


Out[148]:
19    117
20    103
21    126
22    106
23    NaN
Name: value, dtype: float64

In [166]:
# data2 = df.tail()
data2 = df
# data2

We will now update the NaN values. In order to update a serie correctly, you need to apply update with an other series. sometimes it's confusing if you are working with serie or dataframe, use type() to find out


In [196]:
ser = data2['value']
index = ser[ser.isnull()].index
df_upd = pd.DataFrame(np.random.randn(len(index)), index=index, columns=['value'])
ser.update(df_upd.value)
type(df_upd.value)
ser
# df_upd


Out[196]:
0     112.000000
1     130.000000
2     144.000000
3     138.000000
4     118.000000
5     147.000000
6     114.000000
7     138.000000
8     101.000000
9     116.000000
10    122.000000
11    130.000000
12    103.000000
13    105.000000
14    147.000000
15    145.000000
16    109.000000
17    112.000000
18    136.000000
19    117.000000
20    103.000000
21    126.000000
22    106.000000
23      0.713658
Name: value, dtype: float64

In [184]:
# df.add(td, axis=df.hour, fill_value=1)

In [ ]: