In [7]:
import pandas as pd
import numpy as np

In [8]:
alcohols = pd.read_csv('world_alcohol.csv')
print(alcohols)


   Year       WHO region          Country Beverage Types  Display Value
0  1986  Western Pacific         Viet Nam           Wine           0.00
1  1986         Americas          Uruguay          Other           0.50
2  1985           Africa     Cet d'Ivoire           Wine           1.62
3  1986         Americas         Columbia           Beer           4.27
4  1987         Americas  Saint Kitts and           Beer           1.98

In [9]:
alcohols.head()


Out[9]:
Year WHO region Country Beverage Types Display Value
0 1986 Western Pacific Viet Nam Wine 0.00
1 1986 Americas Uruguay Other 0.50
2 1985 Africa Cet d'Ivoire Wine 1.62
3 1986 Americas Columbia Beer 4.27
4 1987 Americas Saint Kitts and Beer 1.98

In [10]:
alcohols.loc[1:4]


Out[10]:
Year WHO region Country Beverage Types Display Value
1 1986 Americas Uruguay Other 0.50
2 1985 Africa Cet d'Ivoire Wine 1.62
3 1986 Americas Columbia Beer 4.27
4 1987 Americas Saint Kitts and Beer 1.98

In [11]:
alcohols['Year']


Out[11]:
0    1986
1    1986
2    1985
3    1986
4    1987
Name: Year, dtype: int64

In [12]:
alcohols.Year


Out[12]:
0    1986
1    1986
2    1985
3    1986
4    1987
Name: Year, dtype: int64

Multi-index


In [13]:
populations = [123, 124, 125, 126, 127, 128]
index = pd.MultiIndex.from_tuples([('China', 2000), ('China', 2001), ('US', 2000), ('US', 2001), ('Canada', 2001), ('Canada', 2004)])
pop = pd.Series(populations, index=index)
pop


Out[13]:
China   2000    123
        2001    124
US      2000    125
        2001    126
Canada  2001    127
        2004    128
dtype: int64

In [14]:
df = pd.DataFrame(np.random.rand(4, 2), index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]], columns=['data1', 'data2'])
df


Out[14]:
data1 data2
a 1 0.040243 0.026031
2 0.448145 0.356229
b 1 0.491886 0.105252
2 0.516016 0.271970

In [15]:
df = pd.Series({('CA', 2000): 1, ('CA', 2001): 2, ('TX', 2000): 3, ('TX', 2002): 4, ('NYC', 2005): 8})
df


Out[15]:
CA   2000    1
     2001    2
NYC  2005    8
TX   2000    3
     2002    4
dtype: int64

In [27]:
df = pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])
print(df)
df = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])
print(df)
df = pd.MultiIndex.from_product([['a', 'b'], [1, 2]])
print(df)
data = pd.Series(np.random.rand(4), index=df)
print(data)
print(data['a'])


MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
a  1    0.527758
   2    0.186724
b  1    0.932946
   2    0.939352
dtype: float64
1    0.527758
2    0.186724
dtype: float64

concat, join, merge, etc.


In [33]:
df1 = pd.DataFrame({1: 'a', 2: 'b', 3: 'c'}, index=[1, 2, 3])
print(df1)
df2 = pd.DataFrame({1: 'a1', 4: 'd'}, index=[4])
print(df2)
print(pd.concat([df1, df2]))
print(df1.append(df2))


   1  2  3
1  a  b  c
2  a  b  c
3  a  b  c
    1  4
4  a1  d
    1    2    3    4
1   a    b    c  NaN
2   a    b    c  NaN
3   a    b    c  NaN
4  a1  NaN  NaN    d
    1    2    3    4
1   a    b    c  NaN
2   a    b    c  NaN
3   a    b    c  NaN
4  a1  NaN  NaN    d

In [35]:
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group': ['Accounting', 'Engineering', 'Platform', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
                    'hire_year': [2004, 2008, 2012, 2014]})
print(df1)
print(df2)
print(pd.merge(df1, df2))


  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa     Platform
3      Sue           HR
  employee  hire_year
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014
  employee        group  hire_year
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa     Platform       2004
3      Sue           HR       2014

In [40]:
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group': ['Accounting', 'Engineering', 'Platform', 'HR']})
df2 = pd.DataFrame({'name': ['Lisa', 'Bob', 'Jake', 'Sue'],
                    'hire_year': [2004, 2008, 2012, 2014]})
print(df1)
print(df2)
print(pd.merge(df1, df2, left_on='employee', right_on='name'))
df1.set_index('employee')
df2.set_index('name')
print(pd.merge(df1, df2, left_index=True, right_index=True))
print(df1.join(df2))


  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa     Platform
3      Sue           HR
   hire_year  name
0       2004  Lisa
1       2008   Bob
2       2012  Jake
3       2014   Sue
  employee        group  hire_year  name
0      Bob   Accounting       2008   Bob
1     Jake  Engineering       2012  Jake
2     Lisa     Platform       2004  Lisa
3      Sue           HR       2014   Sue
  employee        group  hire_year  name
0      Bob   Accounting       2004  Lisa
1     Jake  Engineering       2008   Bob
2     Lisa     Platform       2012  Jake
3      Sue           HR       2014   Sue
  employee        group  hire_year  name
0      Bob   Accounting       2004  Lisa
1     Jake  Engineering       2008   Bob
2     Lisa     Platform       2012  Jake
3      Sue           HR       2014   Sue

Group by, aggregation, filter, etc.


In [42]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['a', 'b', 'c', 'a', 'b', 'c'], 'data1': range(6), 'data2': rng.randint(0, 10, 6)}, columns=['key', 'data1', 'data2'])
print(df)


  key  data1  data2
0   a      0      5
1   b      1      0
2   c      2      3
3   a      3      3
4   b      4      7
5   c      5      9

In [43]:
df.groupby('key').aggregate(['min', np.median, max])


Out[43]:
data1 data2
min median max min median max
key
a 0 1.5 3 3 4.0 5
b 1 2.5 4 0 3.5 7
c 2 3.5 5 3 6.0 9

In [44]:
df.groupby('key').transform(lambda x: x - x.mean())


Out[44]:
data1 data2
0 -1.5 1.0
1 -1.5 -3.5
2 -1.5 -3.0
3 1.5 -1.0
4 1.5 3.5
5 1.5 3.0

In [ ]: