notebook.community

Edit and run



In [148]:

    
import pandas as pd
import numpy as np



In [4]:

    
# Integer Series 0..4 labelled 'a'..'e'. arange is qualified with np because
# the notebook only imports `numpy as np`; a bare `arange` is undefined on a
# fresh kernel.
obj2 = pd.Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])



In [5]:

    
obj2.values









    Out[5]:





array([0, 1, 2, 3, 4])



In [6]:

    
obj2.index









    Out[6]:





Index([u'a', u'b', u'c', u'd', u'e'], dtype='object')



In [7]:

    
obj2









    Out[7]:





a    0
b    1
c    2
d    3
e    4
dtype: int32



In [8]:

    
obj2[['a','c']]









    Out[8]:





a    0
c    2
dtype: int32



In [10]:

    
obj2[obj2<2]









    Out[10]:





a    0
b    1
dtype: int32



In [11]:

    
# State name -> population figure; fed to pd.Series in the following cells.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)



In [20]:

    
obj3=pd.Series(sdata)



In [21]:

    
states = ['California', 'Ohio', 'Oregon', 'Texas']



In [22]:

    
obj4 = pd.Series(sdata, index=states)



In [15]:

    
obj4









    Out[15]:





California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64



In [16]:

    
obj4.isnull()









    Out[16]:





California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool



In [17]:

    
pd.notnull(obj4)









    Out[17]:





California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool



In [18]:

    
pd.isnull(obj4)









    Out[18]:





California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool



In [24]:

    
obj4









    Out[24]:





California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64



In [25]:

    
obj3+ obj4









    Out[25]:





California       NaN
Ohio           70000
Oregon         32000
Texas         142000
Utah             NaN
dtype: float64



In [26]:

    
obj3









    Out[26]:





Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64



In [27]:

    
obj4









    Out[27]:





California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64



In [28]:

    
obj4['California']=40000



In [29]:

    
obj3+obj4









    Out[29]:





California       NaN
Ohio           70000
Oregon         32000
Texas         142000
Utah             NaN
dtype: float64



In [31]:

    
obj4.name = 'population'
obj4.index.name='state'



In [32]:

    
obj4









    Out[32]:





state
California    40000
Ohio          35000
Oregon        16000
Texas         71000
Name: population, dtype: float64



In [40]:

    
# Column-oriented records: one equal-length list per column name.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9],
)



In [41]:

    
data









    Out[41]:





{'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
 'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002]}



In [42]:

    
frame = pd.DataFrame(data)



In [36]:

    
frame









    Out[36]:






  
    
      
      pop
      state
      year
    
  
  
    
      0
       1.5
         Ohio
       2000
    
    
      1
       1.7
         Ohio
       2001
    
    
      2
       3.6
         Ohio
       2002
    
    
      3
       2.4
       Nevada
       2001
    
    
      4
       2.9
       Nevada
       2002
    
  

5 rows × 3 columns



In [50]:

    
pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'])









    Out[50]:






  
    
      
      year
      state
      pop
      debt
    
  
  
    
      0
       2000
         Ohio
       1.5
       NaN
    
    
      1
       2001
         Ohio
       1.7
       NaN
    
    
      2
       2002
         Ohio
       3.6
       NaN
    
    
      3
       2001
       Nevada
       2.4
       NaN
    
    
      4
       2002
       Nevada
       2.9
       NaN
    
  

5 rows × 4 columns



In [51]:

    
frame['year']









    Out[51]:





0    2000
1    2001
2    2002
3    2001
4    2002
Name: year, dtype: int64



In [52]:

    
frame.year









    Out[52]:





0    2000
1    2001
2    2002
3    2001
4    2002
Name: year, dtype: int64



In [53]:

    
# Row whose index label is 3. `.loc` is the label-based indexer; the old
# `.ix` indexer no longer exists in current pandas.
frame.loc[3]









    Out[53]:





pop         2.4
state    Nevada
year       2001
Name: 3, dtype: object



In [56]:

    
'pop' in frame.columns









    Out[56]:





True



In [59]:

    
3 in frame.index









    Out[59]:





True



In [64]:

    
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])



In [63]:

    
obj3.reindex(range(6), method='ffill')









    Out[63]:





0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object



In [65]:

    
obj3









    Out[65]:





0      blue
2    purple
4    yellow
dtype: object



In [67]:

    
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])



In [69]:

    
obj.drop('b')









    Out[69]:





a    0
c    2
d    3
e    4
dtype: float64



In [71]:

    
# 4x4 frame holding 0..15 row by row; rows are places, columns are number words.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
)



In [72]:

    
data









    Out[72]:






  
    
      
      one
      two
      three
      four
    
  
  
    
      Ohio
        0
        1
        2
        3
    
    
      Colorado
        4
        5
        6
        7
    
    
      Utah
        8
        9
       10
       11
    
    
      New York
       12
       13
       14
       15
    
  

4 rows × 4 columns



In [75]:

    
data.drop(['one'], axis=1)









    Out[75]:






  
    
      
      two
      three
      four
    
  
  
    
      Ohio
        1
        2
        3
    
    
      Colorado
        5
        6
        7
    
    
      Utah
        9
       10
       11
    
    
      New York
       13
       14
       15
    
  

4 rows × 3 columns



In [77]:

    
# Import only the pandas names that the visible cells use unqualified
# (DataFrame and Series) instead of a wildcard import, so it stays clear
# where each name comes from and nothing else gets shadowed.
from pandas import DataFrame, Series

# Float Series 0.0..3.0 labelled 'a'..'d'.
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])



In [81]:

    
obj[['a','c']]









    Out[81]:





a    0
c    2
dtype: float64



In [82]:

    
obj[2:4]









    Out[82]:





c    2
d    3
dtype: float64



In [83]:

    
obj









    Out[83]:





a    0
b    1
c    2
d    3
dtype: float64



In [84]:

    
obj[obj<2]









    Out[84]:





a    0
b    1
dtype: float64



In [85]:

    
# Rebuild the 4x4 example frame (values 0..15, row-major).
row_labels = ['Ohio', 'Colorado', 'Utah', 'New York']
column_labels = ['one', 'two', 'three', 'four']
data = DataFrame(np.arange(16).reshape(4, 4),
                 index=row_labels,
                 columns=column_labels)



In [86]:

    
data









    Out[86]:






  
    
      
      one
      two
      three
      four
    
  
  
    
      Ohio
        0
        1
        2
        3
    
    
      Colorado
        4
        5
        6
        7
    
    
      Utah
        8
        9
       10
       11
    
    
      New York
       12
       13
       14
       15
    
  

4 rows × 4 columns



In [88]:

    
data[['two','four']]









    Out[88]:






  
    
      
      two
      four
    
  
  
    
      Ohio
        1
        3
    
    
      Colorado
        5
        7
    
    
      Utah
        9
       11
    
    
      New York
       13
       15
    
  

4 rows × 2 columns



In [90]:

    
data[data['three']>5]









    Out[90]:






  
    
      
      one
      two
      three
      four
    
  
  
    
      Colorado
        4
        5
        6
        7
    
    
      Utah
        8
        9
       10
       11
    
    
      New York
       12
       13
       14
       15
    
  

3 rows × 4 columns



In [91]:

    
data <5









    Out[91]:






  
    
      
      one
      two
      three
      four
    
  
  
    
      Ohio
        True
        True
        True
        True
    
    
      Colorado
        True
       False
       False
       False
    
    
      Utah
       False
       False
       False
       False
    
    
      New York
       False
       False
       False
       False
    
  

4 rows × 4 columns



In [92]:

    
data [ data <5 ]









    Out[92]:






  
    
      
      one
      two
      three
      four
    
  
  
    
      Ohio
        0
        1
        2
        3
    
    
      Colorado
        4
      NaN
      NaN
      NaN
    
    
      Utah
      NaN
      NaN
      NaN
      NaN
    
    
      New York
      NaN
      NaN
      NaN
      NaN
    
  

4 rows × 4 columns



In [93]:

    
# Rows by label, columns by position 3, 0, 1. The column labels are string
# names, so the positions are first translated to labels via data.columns and
# the whole selection goes through the label-based `.loc` (replacing `.ix`).
data.loc[['Colorado', 'Utah'], data.columns[[3, 0, 1]]]









    Out[93]:






  
    
      
      four
      one
      two
    
  
  
    
      Colorado
        7
       4
       5
    
    
      Utah
       11
       8
       9
    
  

2 rows × 3 columns



In [94]:

    
data









    Out[94]:






  
    
      
      one
      two
      three
      four
    
  
  
    
      Ohio
        0
        1
        2
        3
    
    
      Colorado
        4
        5
        6
        7
    
    
      Utah
        8
        9
       10
       11
    
    
      New York
       12
       13
       14
       15
    
  

4 rows × 4 columns



In [95]:

    
# Column 'two' for every row up to and including the 'Utah' label
# (label slices are end-inclusive); `.loc` replaces the removed `.ix`.
data.loc[:'Utah', 'two']









    Out[95]:





Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int32



In [96]:

    
# Rows where column 'three' exceeds 5, restricted to the first three columns.
# The positional column slice is converted to labels so the boolean row mask
# and the column selection can both go through `.loc` (replacing `.ix`).
data.loc[data.three > 5, data.columns[:3]]









    Out[96]:






  
    
      
      one
      two
      three
    
  
  
    
      Colorado
        4
        5
        6
    
    
      Utah
        8
        9
       10
    
    
      New York
       12
       13
       14
    
  

3 rows × 3 columns



In [97]:

    
# First operand for the index-alignment example: labels a, c, d, e.
s1 = Series(data=[7.3, -2.5, 3.4, 1.5], index=list('acde'))



In [98]:

    
# Second operand for the index-alignment example: labels a, c, e, f, g.
s2 = Series(data=[-2.1, 3.6, -1.5, 4, 3.1], index=list('acefg'))



In [101]:

    
s2+s1









    Out[101]:





a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64



In [102]:

    
# 3x3 float frame (0.0..8.0, row-major) with columns b, c, d.
df1 = DataFrame(
    np.arange(9.).reshape(3, 3),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=['b', 'c', 'd'],
)



In [103]:

    
# 4x3 float frame (0.0..11.0, row-major) with columns b, d, e.
df2 = DataFrame(
    np.arange(12.).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)



In [104]:

    
df1+df2









    Out[104]:






  
    
      
      b
      c
      d
      e
    
  
  
    
      Colorado
      NaN
      NaN
      NaN
      NaN
    
    
      Ohio
        3
      NaN
        6
      NaN
    
    
      Oregon
      NaN
      NaN
      NaN
      NaN
    
    
      Texas
        9
      NaN
       12
      NaN
    
    
      Utah
      NaN
      NaN
      NaN
      NaN
    
  

5 rows × 4 columns



In [105]:

    
# Two frames with default integer row labels and overlapping columns:
# df1 is 3x4 (columns a-d), df2 is 4x5 (columns a-e).
df1 = DataFrame(np.arange(12.).reshape(3, 4),
                columns=['a', 'b', 'c', 'd'])
df2 = DataFrame(np.arange(20.).reshape(4, 5),
                columns=['a', 'b', 'c', 'd', 'e'])



In [106]:

    
df1









    Out[106]:






  
    
      
      a
      b
      c
      d
    
  
  
    
      0
       0
       1
        2
        3
    
    
      1
       4
       5
        6
        7
    
    
      2
       8
       9
       10
       11
    
  

3 rows × 4 columns



In [107]:

    
df2









    Out[107]:






  
    
      
      a
      b
      c
      d
      e
    
  
  
    
      0
        0
        1
        2
        3
        4
    
    
      1
        5
        6
        7
        8
        9
    
    
      2
       10
       11
       12
       13
       14
    
    
      3
       15
       16
       17
       18
       19
    
  

4 rows × 5 columns



In [108]:

    
df1+df2









    Out[108]:






  
    
      
      a
      b
      c
      d
      e
    
  
  
    
      0
        0
        2
        4
        6
      NaN
    
    
      1
        9
       11
       13
       15
      NaN
    
    
      2
       18
       20
       22
       24
      NaN
    
    
      3
      NaN
      NaN
      NaN
      NaN
      NaN
    
  

4 rows × 5 columns



In [109]:

    
df1.add(df2, fill_value=0)









    Out[109]:






  
    
      
      a
      b
      c
      d
      e
    
  
  
    
      0
        0
        2
        4
        6
        4
    
    
      1
        9
       11
       13
       15
        9
    
    
      2
       18
       20
       22
       24
       14
    
    
      3
       15
       16
       17
       18
       19
    
  

4 rows × 5 columns



In [112]:

    
df1.reindex(columns=df2.columns, fill_value=0)









    Out[112]:






  
    
      
      a
      b
      c
      d
      e
    
  
  
    
      0
       0
       1
        2
        3
       0
    
    
      1
       4
       5
        6
        7
       0
    
    
      2
       8
       9
       10
       11
       0
    
  

3 rows × 5 columns



In [111]:

    
df1









    Out[111]:






  
    
      
      a
      b
      c
      d
    
  
  
    
      0
       0
       1
        2
        3
    
    
      1
       4
       5
        6
        7
    
    
      2
       8
       9
       10
       11
    
  

3 rows × 4 columns



In [113]:

    
# 3x4 float array holding 0.0..11.0 in row-major order.
arr = np.arange(12, dtype=float).reshape(3, 4)



In [114]:

    
arr[0]









    Out[114]:





array([ 0.,  1.,  2.,  3.])



In [115]:

    
arr - arr[0]









    Out[115]:





array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])



In [116]:

    
arr









    Out[116]:





array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])



In [117]:

    
# 4x3 float frame (0.0..11.0, row-major); rows are states, columns b, d, e.
frame = DataFrame(
    np.arange(12.).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)



In [118]:

    
# First row by position. The row labels here are state names, so an integer
# lookup has to be positional: `.iloc` replaces the removed `.ix`.
frame.iloc[0]









    Out[118]:





b    0
d    1
e    2
Name: Utah, dtype: float64



In [119]:

    
frame









    Out[119]:






  
    
      
      b
      d
      e
    
  
  
    
      Utah
       0
        1
        2
    
    
      Ohio
       3
        4
        5
    
    
      Texas
       6
        7
        8
    
    
      Oregon
       9
       10
       11
    
  

4 rows × 3 columns



In [124]:

    
# Standard-normal 4x3 frame drawn from a seeded, private generator so the cell
# gives the same numbers on every run without touching NumPy's global random
# state. The saved outputs in this notebook came from an unseeded run, so the
# numbers shown there will differ from a re-execution.
frame = DataFrame(np.random.RandomState(0).randn(4, 3), columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'])



In [126]:

    
np.abs(frame)









    Out[126]:






  
    
      
      b
      d
      e
    
  
  
    
      Utah
       0.411465
       0.933316
       1.051063
    
    
      Ohio
       1.247883
       0.034203
       0.628053
    
    
      Texas
       1.347686
       0.963852
       0.791951
    
    
      Oregon
       0.422665
       1.932881
       0.418523
    
  

4 rows × 3 columns



In [127]:

    
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` must provide ``max()`` and ``min()`` methods; the next cell passes
    this function to ``frame.apply``.
    """
    return x.max() - x.min()



In [128]:

    
frame.apply(f)









    Out[128]:





b    1.770351
d    2.896733
e    1.679115
dtype: float64



In [129]:

    
frame









    Out[129]:






  
    
      
      b
      d
      e
    
  
  
    
      Utah
       0.411465
      -0.933316
      -1.051063
    
    
      Ohio
       1.247883
       0.034203
       0.628053
    
    
      Texas
       1.347686
      -0.963852
      -0.791951
    
    
      Oregon
      -0.422665
       1.932881
      -0.418523
    
  

4 rows × 3 columns



In [130]:

    
def format(x):
    """Render ``x`` as a string with exactly two digits after the decimal point.

    NOTE: this name shadows the built-in ``format``. It is kept unchanged
    because the following cell passes ``format`` to ``frame.applymap``.
    """
    return '%.2f' % x



In [131]:

    
frame.applymap(format)









    Out[131]:






  
    
      
      b
      d
      e
    
  
  
    
      Utah
        0.41
       -0.93
       -1.05
    
    
      Ohio
        1.25
        0.03
        0.63
    
    
      Texas
        1.35
       -0.96
       -0.79
    
    
      Oregon
       -0.42
        1.93
       -0.42
    
  

4 rows × 3 columns



In [132]:

    
# 2x4 integer frame with deliberately unsorted row and column labels,
# used by the sort_index cells that follow.
frame = DataFrame(
    np.arange(8).reshape(2, 4),
    columns=list('dabc'),
    index=['three', 'one'],
)



In [137]:

    
frame.sort_index(axis=1)









    Out[137]:






  
    
      
      a
      b
      c
      d
    
  
  
    
      three
       1
       2
       3
       0
    
    
      one
       5
       6
       7
       4
    
  

2 rows × 4 columns



In [134]:

    
frame









    Out[134]:






  
    
      
      d
      a
      b
      c
    
  
  
    
      three
       0
       1
       2
       3
    
    
      one
       4
       5
       6
       7
    
  

2 rows × 4 columns



In [138]:

    
frame.sort_index(axis=1, ascending=False)









    Out[138]:






  
    
      
      d
      c
      b
      a
    
  
  
    
      three
       0
       3
       2
       1
    
    
      one
       4
       7
       6
       5
    
  

2 rows × 4 columns



In [139]:

    
obj = Series([7, -5, 7, 4, 2, 0, 4])



In [140]:

    
obj.rank()









    Out[140]:





0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64



In [141]:

    
# Small frame with missing values: row 'c' is entirely NaN and row 'a' is
# missing its 'two' entry.
df = DataFrame(
    [
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    columns=['one', 'two'],
    index=list('abcd'),
)



In [144]:

    
# Order the columns by label. `DataFrame.sort` no longer exists in pandas;
# sorting along an axis by its labels is `sort_index`.
df.sort_index(axis=1)









    Out[144]:






  
    
      
      one
      two
    
  
  
    
      a
       1.40
       NaN
    
    
      b
       7.10
      -4.5
    
    
      c
        NaN
       NaN
    
    
      d
       0.75
      -1.3
    
  

4 rows × 2 columns



In [147]:

    
df.sum(axis=1)









    Out[147]:





a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64



In [149]:

    
# Three columns of small integer answers, five rows each.
data = DataFrame(dict(
    Qu1=[1, 3, 4, 3, 4],
    Qu2=[2, 3, 1, 2, 3],
    Qu3=[1, 5, 2, 4, 4],
))



In [150]:

    
data









    Out[150]:






  
    
      
      Qu1
      Qu2
      Qu3
    
  
  
    
      0
       1
       2
       1
    
    
      1
       3
       3
       5
    
    
      2
       4
       1
       2
    
    
      3
       3
       2
       4
    
    
      4
       4
       3
       4
    
  

5 rows × 3 columns



In [153]:

    
# Count how often each value occurs in every column; values absent from a
# column come back as NaN and are replaced by 0. The Series method is used
# because the top-level `pd.value_counts` function is deprecated.
data.apply(pd.Series.value_counts).fillna(0)









    Out[153]:






  
    
      
      Qu1
      Qu2
      Qu3
    
  
  
    
      1
       1
       1
       1
    
    
      2
       0
       2
       1
    
    
      3
       2
       2
       0
    
    
      4
       2
       0
       2
    
    
      5
       0
       0
       1
    
  

5 rows × 3 columns



In [155]:

    
# Same tally, but per row (axis=1): one output column per distinct value,
# with 0 where a row does not contain that value. The Series method is used
# because the top-level `pd.value_counts` function is deprecated.
data.apply(pd.Series.value_counts, axis=1).fillna(0)









    Out[155]:






  
    
      
      1
      2
      3
      4
      5
    
  
  
    
      0
       2
       1
       0
       0
       0
    
    
      1
       0
       0
       2
       0
       1
    
    
      2
       1
       1
       0
       1
       0
    
    
      3
       0
       1
       1
       1
       0
    
    
      4
       0
       0
       1
       2
       0
    
  

5 rows × 5 columns



In [156]:

    
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])



In [157]:

    
string_data.isnull()









    Out[157]:





0    False
1    False
2     True
3    False
dtype: bool



In [158]:

    
string_data.fillna('none')









    Out[158]:





0     aardvark
1    artichoke
2         none
3      avocado
dtype: object



In [159]:

    
# Alias NumPy's NaN as NA; the later cells of this notebook use NA as the
# missing-value marker.
from numpy import nan as NA
# Series with two missing entries (positions 1 and 3).
data = Series([1, NA, 3.5, NA, 7])



In [160]:

    
data.dropna()









    Out[160]:





0    1.0
2    3.5
4    7.0
dtype: float64



In [162]:

    
data[data.notnull()]









    Out[162]:





0    1.0
2    3.5
4    7.0
dtype: float64



In [163]:

    
# 4x3 frame mixing complete, partially missing and fully missing rows,
# used by the dropna cells that follow.
data = DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])



In [164]:

    
data









    Out[164]:






  
    
      
      0
      1
      2
    
  
  
    
      0
        1
       6.5
        3
    
    
      1
        1
       NaN
      NaN
    
    
      2
      NaN
       NaN
      NaN
    
    
      3
      NaN
       6.5
        3
    
  

4 rows × 3 columns



In [165]:

    
data.dropna()









    Out[165]:






  
    
      
      0
      1
      2
    
  
  
    
      0
       1
       6.5
       3
    
  

1 rows × 3 columns



In [166]:

    
data.dropna(how='all')









    Out[166]:






  
    
      
      0
      1
      2
    
  
  
    
      0
        1
       6.5
        3
    
    
      1
        1
       NaN
      NaN
    
    
      3
      NaN
       6.5
        3
    
  

3 rows × 3 columns



In [169]:

    
data.dropna(axis=1, how='all')









    Out[169]:






  
    
      
      0
      1
      2
    
  
  
    
      0
        1
       6.5
        3
    
    
      1
        1
       NaN
      NaN
    
    
      2
      NaN
       NaN
      NaN
    
    
      3
      NaN
       6.5
        3
    
  

4 rows × 3 columns



In [170]:

    
# Standard-normal 7x3 frame drawn from a seeded, private generator so the cell
# gives the same numbers on every run without touching NumPy's global random
# state. The saved outputs in this notebook came from an unseeded run, so the
# numbers shown there will differ from a re-execution.
df = DataFrame(np.random.RandomState(0).randn(7, 3))



In [176]:

    
# Blank out a block of cells to create missing data. df has default integer
# row and column labels, so these are label-based, end-inclusive selections
# and go through `.loc` (replacing the removed `.ix`).
df.loc[:4, 1] = NA   # rows 0..4 of column 1
df.loc[:2, 2] = NA   # rows 0..2 of column 2
df.loc[0, 0] = NA    # single cell, making row 0 entirely missing



In [177]:

    
df









    Out[177]:






  
    
      
      0
      1
      2
    
  
  
    
      0
            NaN
            NaN
            NaN
    
    
      1
      -1.117429
            NaN
            NaN
    
    
      2
      -1.097185
            NaN
            NaN
    
    
      3
       0.351192
            NaN
      -2.874015
    
    
      4
       0.768508
            NaN
       0.101283
    
    
      5
       0.109518
       1.036179
       0.215995
    
    
      6
      -0.865862
       0.459417
       0.180664
    
  

7 rows × 3 columns



In [179]:

    
df.dropna(axis=0, how='all')









    Out[179]:






  
    
      
      0
      1
      2
    
  
  
    
      1
      -1.117429
            NaN
            NaN
    
    
      2
      -1.097185
            NaN
            NaN
    
    
      3
       0.351192
            NaN
      -2.874015
    
    
      4
       0.768508
            NaN
       0.101283
    
    
      5
       0.109518
       1.036179
       0.215995
    
    
      6
      -0.865862
       0.459417
       0.180664
    
  

6 rows × 3 columns



In [182]:

    
df.dropna(thresh=1)









    Out[182]:






  
    
      
      0
      1
      2
    
  
  
    
      1
      -1.117429
            NaN
            NaN
    
    
      2
      -1.097185
            NaN
            NaN
    
    
      3
       0.351192
            NaN
      -2.874015
    
    
      4
       0.768508
            NaN
       0.101283
    
    
      5
       0.109518
       1.036179
       0.215995
    
    
      6
      -0.865862
       0.459417
       0.180664
    
  

6 rows × 3 columns



In [ ]:

	pop	state	year
0	1.5	Ohio	2000
1	1.7	Ohio	2001
2	3.6	Ohio	2002
3	2.4	Nevada	2001
4	2.9	Nevada	2002

	one	two	three	four
Ohio	True	True	True	True
Colorado	True	False	False	False
Utah	False	False	False	False
New York	False	False	False	False

	b	c	d	e
Colorado	NaN	NaN	NaN	NaN
Ohio	3	NaN	6	NaN
Oregon	NaN	NaN	NaN	NaN
Texas	9	NaN	12	NaN
Utah	NaN	NaN	NaN	NaN

	b	d	e
Utah	0.411465	0.933316	1.051063
Ohio	1.247883	0.034203	0.628053
Texas	1.347686	0.963852	0.791951
Oregon	0.422665	1.932881	0.418523

	b	d	e
Utah	0.411465	-0.933316	-1.051063
Ohio	1.247883	0.034203	0.628053
Texas	1.347686	-0.963852	-0.791951
Oregon	-0.422665	1.932881	-0.418523

	b	d	e
Utah	0.41	-0.93	-1.05
Ohio	1.25	0.03	0.63
Texas	1.35	-0.96	-0.79
Oregon	-0.42	1.93	-0.42

	0	1	2
0	NaN	NaN	NaN
1	-1.117429	NaN	NaN
2	-1.097185	NaN	NaN
3	0.351192	NaN	-2.874015
4	0.768508	NaN	0.101283
5	0.109518	1.036179	0.215995
6	-0.865862	0.459417	0.180664