In [1]:
# Notebook initialisation (IPython magics, not plain Python).
# Render matplotlib figures inline, inside the notebook output area.
%matplotlib inline
# Open a Qt console attached to this kernel (needs a GUI session; this
# presumably fails or hangs on a headless run -- TODO confirm before Run All).
%qtconsole
In [ ]:
In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
from pandas import Timestamp
randn = np.random.randn
from pandas import *
In [3]:
# General form: Series(data, index=index)
# Five standard-normal draws labelled 'a'..'e'.
s = Series(data=randn(5), index=list('abcde'))
s
# Only this last expression is echoed as the cell output.
s.index
Out[3]:
In [4]:
# Series built from a dict: the keys become the index labels.
d = dict(a=0.0, b=1.1, c=2.2)
Series(data=d)
# An explicit index selects (and orders) the labels taken from the dict.
Series(data=d, index=['a', 'b'])
# A label that is missing from the dict ('z') yields NaN.
Series(data=d, index=['a', 'b', 'z'])
# A scalar is repeated for every label of the index.
Series(data=5, index=['a', 'b', 'c', 'd'])
Out[4]:
In [64]:
# Label lookup.
s['a']
# Positional access goes through .iloc: s has string labels, so passing
# integers to plain [] relies on a positional fallback that pandas deprecates.
s.iloc[:3]
s.iloc[[4, 3, 1]]
# Boolean filtering: keep the values above the median ...
s[s > s.median()]
# ... and the boolean mask itself.
s > s.median()
Out[64]:
In [62]:
np.exp(s)
Out[62]:
In [66]:
s['a'] = 12
s
Out[66]:
In [68]:
# Membership test on the index labels.
'ee' in s
# Looking up a label that is not in the index raises KeyError; catch and show
# it so the demonstration does not abort a Restart & Run All.
try:
    s['ee']
except KeyError as err:
    print('KeyError:', err)
In [74]:
# .get returns the value for a label, or a default when the label is absent.
s.get('a')
# np.inf is the supported spelling; the np.Inf alias was removed in NumPy 2.0.
s.get('f', np.inf)
Out[74]:
In [80]:
# Arithmetic and NumPy ufuncs apply element-wise to a Series.
s + s
s*2
np.exp(s)
# Operands are aligned on index labels before adding: s[1:] lacks the first
# label and s[:-1] lacks the last, so those two positions come out as NaN.
s[1:] + s[:-1]
Out[80]:
In [85]:
# Named Series of int32 values obtained from five standard-normal draws.
# The original passed a bare `int32` (an undefined name unless pylab is loaded)
# as the constructor dtype; current pandas also refuses to build an integer
# Series directly from non-integral floats. Casting explicitly with astype
# truncates toward zero and always works.
s = Series(np.random.randn(5), name='whatever').astype(np.int32)
type(s)
Out[85]:
DataFrame accepts many different kinds of inputs:
In [61]:
# Two Series with partly overlapping labels: 'one' has no entry for 'd'.
d = dict(
    one=Series([1., 2., 3.], index=list('abc')),
    two=Series([1., 2., 3., 4.], index=list('abcd')),
)
In [62]:
df = DataFrame(d)
df
Out[62]:
In [24]:
# Build the frame from d, keeping only rows 'd', 'b', 'a' (in that order).
d2 = DataFrame(d, index=['d', 'b', 'a'])
# Single-cell lookup by (row label, column label) in one .loc call instead of
# chained d2['one']['d'] indexing.
d2.loc['d', 'one']
# Note: d2[1][2] does not work -- [] on a DataFrame selects columns by label and
# there is no column labelled 1. Positional access is d2.iloc[row, col],
# e.g. d2.iloc[2, 1] (assuming column 1 / row 2 was the intent).
Out[24]:
In [27]:
df.index
df.columns
Out[27]:
In [30]:
# Plain lists of equal length, one per column.
d = dict(
    one=[1., 2., 3., 4.],
    two=[4., 3., 2., 1.],
)
In [32]:
DataFrame(d, index=['a', 'b', 'c', 'd'])
Out[32]:
In [14]:
# A structured (record) array is handled identically to a dict of arrays.
# Two zero-initialised records with fields A (int32), B (float32) and
# C (10-byte string). 'S10' is the same type as the legacy 'a10' spelling,
# whose 'a' alias is deprecated in NumPy 2.0.
data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'S10')])
In [15]:
# Fill both records at once; each tuple supplies one record's (A, B, C) fields.
data[:] = [(1,2.,'Hello'),(2,3.,"World")]
In [16]:
DataFrame(data)
Out[16]:
In [22]:
DataFrame(data, index=['first', 'second'])
Out[22]:
In [24]:
DataFrame(data,columns=['C','A','B'])
Out[24]:
In [25]:
# One dict per row; the second row carries an extra key 'c'.
data2 = [
    dict(a=1, b=2),
    dict(a=5, b=10, c=20),
]
In [36]:
DataFrame(data2, index = ['first','second'])
Out[36]:
In [37]:
DataFrame(data2, columns=['a','b']) # A selection
Out[37]:
In [42]:
df['one']
Out[42]:
In [43]:
# Add a derived column: the element-wise product of columns 'one' and 'two'.
df['three'] = df['one']*df['two']
In [49]:
# Boolean mask: True for the rows whose 'two' value is greater than 1.
boolindex = df['two']>1
boolindex
Out[49]:
In [50]:
df[boolindex]
Out[50]:
In [52]:
# 'b' is a row label, not a column: df['b'] raises KeyError because [] on a
# DataFrame looks up columns. Rows are selected by label with .loc.
df.loc['b']
In [53]:
del df['two']
In [54]:
df
Out[54]:
In [55]:
df['new'] = 1
In [56]:
df
Out[56]:
In [59]:
# Fourth row by position. The .ix indexer was removed from pandas; with string
# row labels it fell back to positional lookup, which .iloc does explicitly.
df.iloc[3]
Out[59]:
In [64]:
# df[1] raises KeyError: [] selects columns by label and no column is labelled 1.
# NOTE(review): assuming the intent was "the second column" -- confirm; use
# df.iloc[1] instead if the second row was meant.
df.iloc[:, 1]
5.2.9 DataTypes
In [75]:
# Add one column per dtype so that df.dtypes shows a mix of types.
df['interger'] = 1
df['int32'] = df['interger'].astype('int32')
# Build the float32 column on df's own index. A Series created from a bare list
# gets labels 0..n-1, and column assignment aligns on labels, so against a frame
# with different row labels every value would silently become NaN.
df['float32'] = Series(1.0, index=df.index, dtype='float32')
df['timestamp'] = Timestamp('20010102')
In [77]:
df.dtypes
Out[77]:
In [3]:
# Ten consecutive daily timestamps used as the row index. An explicit ISO date
# replaces the original start value '1', which is not an unambiguous date and
# makes the result depend on how (and when) the string is parsed.
index_df = date_range('2000-01-01', periods=10)
df = DataFrame(randn(10, 4), index=index_df, columns=['A', 'B', 'C', 'D'])
# Subtract column A from every column, row by row. Plain `df - df['A']` aligns
# the Series labels against df's *columns*, which gives an all-NaN frame;
# axis=0 makes the alignment happen on the row index instead.
df.sub(df['A'], axis=0)
# Equivalent result via transposition: the dates become columns, so the default
# column-wise alignment matches them against the Series index.
(df.T - df['A']).T
Out[3]:
In [7]:
# Row-wise subtraction of column A from every column. axis=0 aligns the Series
# on the row index; a bare `df - df['A']` would align it on the column labels
# and produce NaN everywhere.
df.sub(df['A'], axis=0)
# Scalar arithmetic is applied element-wise.
df * 5 + 2
1/df
df ** 4
Out[7]:
In [8]:
df[:5]
Out[8]:
In [9]:
df[:5].T
Out[9]:
In [10]:
np.exp(df)
Out[10]:
In [11]:
np.asarray(df)
Out[11]:
In [13]:
df.T.dot(df)
Out[13]:
In [14]:
# Integers 5..9, then their dot product with themselves (sum of squares).
s1 = Series(data=np.arange(5, 10))
s1 @ s1
Out[14]:
In [15]:
#baseball = read_csv('a_large_data_set.csv')
#print baseball (a large frame only shows a summary)
#print baseball.ix[-20:, :12].to_string() represents the slice in a tabular format
In [21]:
DataFrame(randn(3,12))
Out[21]:
In [24]:
df.C
Out[24]:
In [30]:
# 1000 standard-normal draws; peek at both ends rather than dumping everything.
long_series = Series(data=randn(1000))
long_series.head(n=5)
# Only this last expression is echoed as the cell output.
long_series.tail(n=3)
Out[30]:
In [1]:
long_series.values;
In [33]:
df
Out[33]:
In [37]:
df.mean(0)
Out[37]:
In [38]:
df.mean(1)
Out[38]:
In [39]:
df.sum(0, skipna=True)
Out[39]:
In [42]:
df.std()
Out[42]:
In [46]:
df.cumsum()
Out[46]:
In [48]:
long_series.describe()
Out[48]:
In [49]:
long_series.idxmax()
Out[49]:
In [50]:
long_series.idxmin()
Out[50]:
In [52]:
# Value stored at label 229. The .ix indexer was removed from pandas; on an
# integer index it was label-based, which is exactly what .loc does.
# NOTE(review): 229 looks like it was copied from an earlier idxmax/idxmin
# output; with unseeded random data that position changes on every run.
long_series.loc[229]
Out[52]:
In [56]:
df.idxmax(axis=1)
Out[56]:
In [57]:
df.apply(np.mean)
Out[57]:
In [65]:
# 8x4 frame of standard-normal draws with integer labels on both axes:
# rows 1..8 and columns 1, 3, 5, 7.
df = DataFrame(
    data=randn(8, 4),
    index=range(1, 9),
    columns=range(1, 9, 2),
)
In [66]:
# Row whose label is 1. The .ix indexer was removed from pandas; on an integer
# index it looked rows up by label, so .loc is the direct replacement.
df.loc[1]
Out[66]:
In [67]:
df
Out[67]:
In [68]:
# Relabel the rows 0..7 in place; the data is unchanged, only the labels move.
df.index=range(8)
In [69]:
df
Out[69]:
In [ ]: