notebook.community

Edit and run



In [6]:

    
from pandas import Series, DataFrame
import pandas as pd
import numpy as np



In [2]:

    
obj = Series([1,2,3])
obj.values









    Out[2]:





array([1, 2, 3])



In [3]:

    
obj.index









    Out[3]:





Int64Index([0, 1, 2], dtype=int64)



In [4]:

    
obj = Series([4,5,6], index=list("abc"))
obj









    Out[4]:





a    4
b    5
c    6
dtype: int64



In [5]:

    
obj[['a','b']]









    Out[5]:





a    4
b    5
dtype: int64



In [7]:

    
np.exp(obj)









    Out[7]:





a     54.598150
b    148.413159
c    403.428793
dtype: float64



In [9]:

    
# Re-label obj against 'a', 'b', 'e'; a label with no match in obj ('e') comes back as NaN.
st = ['a', 'b', 'e']
x2 = obj.reindex(st)
x2









    Out[9]:





a     4
b     5
e   NaN
dtype: float64



In [10]:

    
x2.isnull()









    Out[10]:





a    False
b    False
e     True
dtype: bool



In [11]:

    
x2[x2.notnull()]









    Out[11]:





a    4
b    5
dtype: float64



In [12]:

    
obj.name = 'test'
obj









    Out[12]:





a    4
b    5
c    6
Name: test, dtype: int64



In [13]:

    
obj.index = ['g','h','i']



In [15]:

    
# Data Frames



In [16]:

    
# Column-oriented input: each key is a column name mapped to that column's values.
data = dict(
    st=['ct', 'ny', 'ca'],
    year=[2008, 2009, 2010],
    pop=[1.5, 1.2, 1.8],
)
data









    Out[16]:





{'pop': [1.5, 1.2, 1.8], 'st': ['ct', 'ny', 'ca'], 'year': [2008, 2009, 2010]}



In [17]:

    
df = DataFrame(data)
df



In [18]:

    
# Re-arrange columns
DataFrame(data, columns=['year','st','pop'])



In [24]:

    
# Re-arrange columns + Re-index
df = DataFrame(data, columns=['year','st','pop'],
          index = ['a','b','c'])



In [25]:

    
df.year









    Out[25]:





a    2008
b    2009
c    2010
Name: year, dtype: int64



In [26]:

    
df['year']









    Out[26]:





a    2008
b    2009
c    2010
Name: year, dtype: int64



In [27]:

    
# Access a row by integer position.
# .iloc is the purely positional indexer; it replaces .ix, which has been removed from pandas.
df.iloc[0]









    Out[27]:





year    2008
st        ct
pop      1.5
Name: a, dtype: object



In [28]:

    
# Access a row by index label.
# .loc is the purely label-based indexer; it replaces .ix, which has been removed from pandas.
df.loc['a']









    Out[28]:





year    2008
st        ct
pop      1.5
Name: a, dtype: object



In [29]:

    
df['extra'] = 100
df



In [31]:

    
df['extra'] = np.arange(3)
df



In [33]:

    
df['isct'] = df['st'] == "ct"
df



In [37]:

    
# Conform df to the row labels 'a', 'b', 'e'; a label missing from df ('e') yields a NaN row.
df.reindex(['a', 'b', 'e'])



In [38]:

    
df.index.name = "alpha"
df.columns.name = "info"
df



In [39]:

    
df.values









    Out[39]:





array([[2008, 'ct', 1.5, 0, True],
       [2009, 'ny', 1.2, 1, False],
       [2010, 'ca', 1.8, 2, False]], dtype=object)



In [40]:

    
df.index









    Out[40]:





Index([u'a', u'b', u'c'], dtype=object)



In [41]:

    
df.index[0]









    Out[41]:





'a'



In [42]:

    
# Intentional failure: assigning to an element of an Index raises,
# as the traceback shown for this cell demonstrates.
df.index[0] = "new" # Index objects are immutable









    



---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-42-b411703d64b6> in <module>()
----> 1 df.index[0] = "new" # Index objects are immutable

/usr/local/lib/python2.7/site-packages/pandas/core/index.pyc in __setitem__(self, key, value)
    328 
    329     def __setitem__(self, key, value):
--> 330         raise Exception(str(self.__class__) + ' object is immutable')
    331 
    332     def __getitem__(self, key):

Exception: <class 'pandas.core.index.Index'> object is immutable



In [43]:

    
df.columns









    Out[43]:





Index([u'year', u'st', u'pop', u'extra', u'isct'], dtype=object)



In [44]:

    
'pop' in df.columns









    Out[44]:





True



In [45]:

    
df['pop']









    Out[45]:





alpha
a        1.5
b        1.2
c        1.8
Name: pop, dtype: float64



In [47]:

    
# Caution: `'pop' in df.columns` evaluates to a boolean (True here), not a label,
# so this looks up df.columns[True]. The displayed result is the 'st' column
# (the column at position 1), not 'pop'.
# NOTE(review): if the intent was "select 'pop' when it exists", guard df['pop']
# with the membership test instead of using the test result as an index.
df[df.columns['pop' in df.columns]]









    Out[47]:





alpha
a        ct
b        ny
c        ca
Name: st, dtype: object



In [48]:

    
df.index









    Out[48]:





Index([u'a', u'b', u'c'], dtype=object)



In [49]:

    
# Re-arrange rows
df.reindex(['c','b','a'])



In [50]:

    
df.reindex(['c','b','a','e'], fill_value=0)



In [51]:

    
# Colours are known only at positions 0, 2 and 4.
obj3 = Series(data=['blue', 'purple', 'yellow'], index=[0, 2, 4])
# Expand to positions 0..5; method="ffill" carries the last known value forward into each gap.
obj3.reindex(index=range(6), method="ffill")









    Out[51]:





0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object



In [61]:

    
df.drop('c')



In [62]:

    
df.drop('isct', axis=1)



In [64]:

    
# Row slice by label: unlike a positional slice, the end label 'b' is included.
df.loc['a':'b']



In [65]:

    
df[['year','st']]



In [66]:

    
# Indexing by rows and columns together, by label.
# The row slice includes its end label; .loc replaces .ix, which has been removed from pandas.
df.loc['a':'b', ['year', 'st', 'pop']]



In [68]:

    
# Arithmetic aligns operands on their index; a label present in only one
# operand ('c' or 'd' here) produces NaN in the result.
a1 = pd.DataFrame([1, 2, 3], index=list('abc'))
a2 = pd.DataFrame([10, 20, 30], index=list('abd'))
a1.add(a2)



In [69]:

    
# Set-up for applying a 1-D reduction to each column of a DataFrame.
def f(x):
    """Return the spread (largest value minus smallest value) of a 1-D collection."""
    return x.max() - x.min()


# 4 x 3 frame of standard-normal draws; rows are states, columns are 'b', 'd', 'e'.
obj = DataFrame(
    np.random.randn(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
)
obj



In [70]:

    
# Similar to R apply
obj.apply(f)









    Out[70]:





b    3.489094
d    2.039268
e    1.033818
dtype: float64



In [71]:

    
def to_two_decimals(x):
    """Render a number as a string with two digits after the decimal point."""
    return '%.2f' % x


# Format every element of the frame. The helper has its own name so the builtin
# format() is not shadowed for the rest of the session, and mapping each column
# with Series.map avoids the deprecated DataFrame.applymap.
obj.apply(lambda col: col.map(to_two_decimals))



In [72]:

    
obj.sort_index()



In [74]:

    
obj.sort_index(axis=1, ascending=False)



In [76]:

    
# To sort a Series by its values, use sort_values
# (Series.order has been removed from pandas).
obj['e'].sort_values()









    Out[76]:





Ohio     -0.526920
Utah     -0.161616
Texas     0.244920
Oregon    0.506898
Name: e, dtype: float64



In [78]:

    
# To sort a DataFrame by the values in one or more columns, use sort_values:
# rows are ordered by 'e', with ties broken by 'd'.
# (The by= argument of sort_index has been removed from pandas.)
obj.sort_values(by=['e', 'd'])



In [81]:

    
obj['e'].rank(method="first") # Rank with tie break









    Out[81]:





Utah      2
Ohio      1
Texas     3
Oregon    4
Name: e, dtype: float64



In [84]:

    
# Index labels may repeat: 'a' appears twice here, so the index is not unique.
ser = Series([1, 1, 2, 10, 3, 5], index=['a', 'a', 'b', 'c', 'd', 'e'])
ser.index.is_unique









    Out[84]:





False



In [85]:

    
obj



In [86]:

    
obj.sum()









    Out[86]:





b    0.665667
d   -0.065139
e    0.063281
dtype: float64



In [88]:

    
obj.sum(axis=1, skipna=True)









    Out[88]:





Utah      2.285812
Ohio      0.913822
Texas    -0.614797
Oregon   -1.921028
dtype: float64



In [89]:

    
obj.describe()



In [91]:

    
obj.unstack()









    Out[91]:





b  Utah      1.506451
   Ohio      0.903284
   Texas     0.238575
   Oregon   -1.982642
d  Utah      0.940977
   Ohio      0.537459
   Texas    -1.098292
   Oregon   -0.445283
e  Utah     -0.161616
   Ohio     -0.526920
   Texas     0.244920
   Oregon    0.506898
dtype: float64



In [94]:

    
# Calling print with a single parenthesised argument works on both Python 2 and
# Python 3; the bare print statement is a syntax error on Python 3.
print(ser)
print('----')
# Tally how many times each distinct value occurs in ser.
ser.value_counts()









    



a     1
a     1
b     2
c    10
d     3
e     5
dtype: int64
----






    Out[94]:





1     2
5     1
3     1
10    1
2     1
dtype: int64



In [95]:

    
# NA is a short alias for NumPy's floating-point NaN, used here to mark missing entries.
NA = np.nan
data = Series([1, NA, 3.5, NA, 7])
data









    Out[95]:





0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64



In [96]:

    
data.dropna()









    Out[96]:





0    1.0
2    3.5
4    7.0
dtype: float64



In [98]:

    
# By default, dropna drops any row that contains an NA.
# With DataFrames, the requirement might be different.

data = DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])
data



In [100]:

    
# Drop a row only if every value in that row is NA
data.dropna(axis=0, how='all')



In [101]:

    
data.fillna(0)



In [103]:

    
# fillna returns a new object and leaves the original untouched, so rebind the
# name to keep the filled result. This avoids inplace=True, whose return value
# is None; assigning that to `_` also overwrites IPython's last-output variable.

data = data.fillna(0)
data



In [104]:

    
# Hierarchical indexing: two parallel label lists (outer letters, inner integers)
# build a two-level index over ten random values.
data = Series(
    np.random.randn(10),
    index=[list('aaabbbccdd'),
           [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]],
)
data









    Out[104]:





a  1    0.239378
   2   -0.895084
   3   -0.266205
b  1    1.244437
   2   -0.091399
   3    1.618462
c  1    0.770874
   2    2.563309
d  2   -0.420505
   3   -1.692568
dtype: float64



In [105]:

    
# Shows that data has a multi index
data.index









    Out[105]:





MultiIndex
[(u'a', 1), (u'a', 2), (u'a', 3), (u'b', 1), (u'b', 2), (u'b', 3), (u'c', 1), (u'c', 2), (u'd', 2), (u'd', 3)]



In [107]:

    
# Select every entry under the outer label 'a'; the result is indexed by the inner level.
data['a']









    Out[107]:





1    0.239378
2   -0.895084
3   -0.266205
dtype: float64



In [109]:

    
# Cross-section on the inner level: the entries whose inner label is 1,
# one for each outer label that has such an entry.
data.xs(1, level=1)









    Out[109]:





a    0.239378
b    1.244437
c    0.770874
dtype: float64



In [110]:

    
data.unstack()



In [111]:

    
# Hierarchical indexing in DataFrames: both the rows and the columns carry two label levels.
df = DataFrame(
    np.arange(12).reshape((4, 3)),
    index=[list('aabb'), [1, 2] * 2],
    columns=[['Ohio', 'Ohio', 'Colorado'],
             ['Green', 'Red', 'Green']],
)
df



In [112]:

    
# The hierarchical levels can have names (as strings or any Python objects).
# If so, these will show up in the console output
# (don't confuse the index names with the axis labels!):

df.index.names = ['key1','key2']
df.columns.names = ['state', 'colour']
df



In [113]:

    
# Sort the rows by the second index level ('key2'). This reorders rows, not the levels.
# sort_index(level=...) replaces DataFrame.sortlevel, which has been removed from pandas.
df.sort_index(level=1)



In [114]:

    
df.swaplevel('key1','key2')



In [115]:

    
# Sum the rows that share the same 'key2' label.
# groupby(level=...) replaces the level= argument of DataFrame.sum, which has
# been removed from pandas; sort=False keeps groups in order of first appearance.
df.groupby(level='key2', sort=False).sum()



In [116]:

    
# Sum the columns that share the same 'colour' label.
# Grouping the transposed frame by level and transposing back avoids both the
# removed level= argument of DataFrame.sum and the deprecated axis=1 form of
# groupby; sort=False keeps groups in order of first appearance.
df.T.groupby(level='colour', sort=False).sum().T



In [118]:

    
obj



In [119]:

    
# Setting an index from a column name
obj.set_index('b')



In [120]:

    
obj.set_index('b', drop=False)



In [121]:

    
obj



In [122]:

    
# set_index returns a new frame; rebinding `obj` keeps that result without
# inplace=True (whose return value is None) and without overwriting IPython's
# last-output variable `_`.
obj = obj.set_index('b')



In [123]:

    
obj



In [124]:



In [ ]:

	b	d	e
count	4.000000	4.000000	4.000000
mean	0.166417	-0.016285	0.015820
std	1.523409	0.926959	0.454494
min	-1.982642	-1.098292	-0.526920
25%	-0.316730	-0.608535	-0.252942
50%	0.570929	0.046088	0.041652
75%	1.054076	0.638338	0.310414
max	1.506451	0.940977	0.506898

	b	d	e
Utah	1.506451	0.940977	-0.161616
Ohio	0.903284	0.537459	-0.526920
Texas	0.238575	-1.098292	0.244920
Oregon	-1.982642	-0.445283	0.506898

	b	d	e
Utah	1.51	0.94	-0.16
Ohio	0.90	0.54	-0.53
Texas	0.24	-1.10	0.24
Oregon	-1.98	-0.45	0.51

	1	2	3
a	0.239378	-0.895084	-0.266205
b	1.244437	-0.091399	1.618462
c	0.770874	2.563309	NaN
d	NaN	-0.420505	-1.692568