In [7]:
import pandas as pd
import numpy as np
In [8]:
# Read the alcohol-consumption table from a CSV in the working directory.
# No index_col is given, so rows get the default 0-based integer labels.
alcohols = pd.read_csv(filepath_or_buffer='world_alcohol.csv')
# Plain-text dump of the frame (pandas truncates long frames when printing).
print(alcohols)
In [9]:
# Preview the top of the table; head() with no argument shows the first 5 rows.
alcohols.head()
Out[9]:
In [10]:
# .loc slices by *label* and includes both endpoints. The frame was read
# without index_col, so labels are the default integers and this selects the
# rows labelled 1, 2, 3 and 4 (four rows, unlike positional slicing 1:4).
alcohols.loc[1:4]
Out[10]:
In [11]:
# Bracket access: pull the column labelled 'Year' out as a Series.
# This form works for any column label.
alcohols['Year']
Out[11]:
In [12]:
# Attribute access to the same column as alcohols['Year']. Only usable when the
# label is a valid Python identifier that does not clash with an existing
# DataFrame attribute or method.
alcohols.Year
Out[12]:
In [13]:
# Toy values, one per (country, year) pair in the index built below.
populations = [123, 124, 125, 126, 127, 128]
# Two-level row index: first level is the country, second level is the year.
index = pd.MultiIndex.from_tuples(
    [
        ('China', 2000), ('China', 2001),
        ('US', 2000), ('US', 2001),
        ('Canada', 2001), ('Canada', 2004),
    ]
)
# Series whose rows are addressed by (country, year).
pop = pd.Series(data=populations, index=index)
pop
Out[13]:
In [14]:
# Passing a list of two arrays as `index` makes pandas build a two-level
# MultiIndex: ('a', 1), ('a', 2), ('b', 1), ('b', 2).
# Values are unseeded uniform randoms, so they differ on every run.
df = pd.DataFrame(
    data=np.random.rand(4, 2),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=['data1', 'data2'],
)
df
Out[14]:
In [15]:
# A dict keyed by tuples is turned into a Series with a MultiIndex:
# each tuple becomes one (state, year) row label.
# NOTE: despite the name, `df` holds a Series here.
df = pd.Series(
    data={
        ('CA', 2000): 1,
        ('CA', 2001): 2,
        ('TX', 2000): 3,
        ('TX', 2002): 4,
        ('NYC', 2005): 8,
    }
)
df
Out[15]:
In [27]:
# Three equivalent ways of building the same two-level index
# ('a', 1), ('a', 2), ('b', 1), ('b', 2). `df` is rebound each time and ends
# up holding the from_product result (a MultiIndex, not a DataFrame).

# 1) parallel arrays, one per level
df = pd.MultiIndex.from_arrays(arrays=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]])
print(df)

# 2) explicit list of row-label tuples
df = pd.MultiIndex.from_tuples(tuples=[('a', 1), ('a', 2), ('b', 1), ('b', 2)])
print(df)

# 3) Cartesian product of the per-level values
df = pd.MultiIndex.from_product(iterables=[['a', 'b'], [1, 2]])
print(df)

# Attach four (unseeded) random values to that index.
data = pd.Series(data=np.random.rand(4), index=df)
print(data)
# Partial indexing on the outer level: the 'a' rows, indexed by the inner level.
print(data.loc['a'])
In [33]:
# df1: rows labelled 1..3, integer column labels 1, 2, 3; each scalar is
# broadcast down its column.
df1 = pd.DataFrame({1: 'a', 2: 'b', 3: 'c'}, index=[1, 2, 3])
print(df1)
# df2: a single row labelled 4 that shares column 1 and adds column 4.
df2 = pd.DataFrame({1: 'a1', 4: 'd'}, index=[4])
print(df2)
# Stack the rows; columns are aligned on the union of labels and the cells
# a frame does not have are filled with NaN.
print(pd.concat([df1, df2]))
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so
# df1.append(df2) now raises AttributeError. pd.concat is its replacement;
# sort=False spells out the column handling append used.
print(pd.concat([df1, df2], sort=False))
In [35]:
# Two small tables that share an 'employee' column, listed in different orders.
df1 = pd.DataFrame(
    data={
        'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
        'group': ['Accounting', 'Engineering', 'Platform', 'HR'],
    }
)
df2 = pd.DataFrame(
    data={
        'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
        'hire_year': [2004, 2008, 2012, 2014],
    }
)
print(df1, df2, sep='\n')
# With no `on=` given, merge joins on the columns the frames have in common
# (here just 'employee'), matching rows by value rather than by position.
print(df1.merge(df2))
In [40]:
# Two tables describing the same people, but the key column is called
# 'employee' in one and 'name' in the other (and the rows are in a different order).
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group': ['Accounting', 'Engineering', 'Platform', 'HR']})
df2 = pd.DataFrame({'name': ['Lisa', 'Bob', 'Jake', 'Sue'],
                    'hire_year': [2004, 2008, 2012, 2014]})
print(df1)
print(df2)
# Column-based join with differently named keys; the result keeps both
# key columns ('employee' and 'name').
print(pd.merge(df1, df2, left_on='employee', right_on='name'))
# set_index returns a NEW frame and leaves the original untouched, so the
# result must be kept. Without this, the index-based joins below align on the
# default 0..3 row numbers and pair each employee with the wrong hire year
# (e.g. Bob with Lisa's 2004).
df1_indexed = df1.set_index('employee')
df2_indexed = df2.set_index('name')
# Join on the row labels (the person's name) of both frames.
print(pd.merge(df1_indexed, df2_indexed, left_index=True, right_index=True))
# DataFrame.join is the index-on-index shorthand (left join by default).
print(df1_indexed.join(df2_indexed))
In [42]:
# Legacy seeded generator: seed 0 makes the 'data2' draws repeatable.
rng = np.random.RandomState(0)
# Six rows in three groups ('a', 'b', 'c', each appearing twice);
# data1 is 0..5 and data2 holds six random integers drawn from [0, 10).
df = pd.DataFrame(
    data={
        'key': ['a', 'b', 'c', 'a', 'b', 'c'],
        'data1': range(6),
        'data2': rng.randint(0, 10, 6),
    },
    columns=['key', 'data1', 'data2'],
)
print(df)
In [43]:
# Per-'key' minimum, median and maximum of every remaining column.
# The reductions are named by string so pandas uses its built-in groupby
# implementations; passing callables such as np.median or the builtin max is
# deprecated for this (FutureWarning in pandas 2.1+) and is slated to change
# behaviour. The resulting column labels are the same: min / median / max.
df.groupby('key').aggregate(['min', 'median', 'max'])
Out[43]:
In [44]:
# Centre each column within its 'key' group: subtract the group's mean from
# every value. transform returns a result with the same row labels as df.
(
    df.groupby('key')
      .transform(lambda column: column - column.mean())
)
Out[44]:
In [ ]: