You are currently looking at version 1.0 of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the Jupyter Notebook FAQ course resource.
In [1]:
import pandas as pd
pd.Series?
In [2]:
# Build a Series from a plain list of strings; no index is passed to the constructor.
animals = ['Tiger', 'Bear', 'Moose']
pd.Series(animals)
Out[2]:
In [3]:
# Build a Series from a plain list of integers; no index is passed to the constructor.
numbers = [1, 2, 3]
pd.Series(numbers)
Out[3]:
In [4]:
# Same idea as before, but the last entry of the string list is missing (None).
animals = ['Tiger', 'Bear', None]
pd.Series(animals)
Out[4]:
In [5]:
# A numeric list whose last entry is missing (None); compare the displayed result
# with the string case to see how the missing value is represented.
numbers = [1, 2, None]
pd.Series(numbers)
Out[5]:
In [6]:
import numpy as np
# Compare NaN with None using ==. They are different values, so equality is not
# a way to detect missing data.
np.nan == None
Out[6]:
In [7]:
# Compare NaN with itself using ==. Under IEEE 754 rules NaN is not equal to anything,
# including itself, which is why a dedicated test (np.isnan) is needed.
np.nan == np.nan
Out[7]:
In [8]:
# Test for NaN with np.isnan instead of ==.
np.isnan(np.nan)
Out[8]:
In [9]:
# Sport -> country of origin. Passing a dict to pd.Series uses the keys as the index.
sports = dict([
    ('Archery', 'Bhutan'),
    ('Golf', 'Scotland'),
    ('Sumo', 'Japan'),
    ('Taekwondo', 'South Korea'),
])
s = pd.Series(data=sports)
s
Out[9]:
In [10]:
# Show the index labels of s.
s.index
Out[10]:
In [11]:
# Values and index labels are supplied separately and paired up by position.
s = pd.Series(
    data=['Tiger', 'Bear', 'Moose'],
    index=['India', 'America', 'Canada'],
)
s
Out[11]:
In [12]:
# Build the Series from the dict but pass an explicit index list. 'Hockey' is in the
# index list but is not a key of the dict, while 'Archery' and 'Taekwondo' are dict
# keys that are not in the index list.
sports = {'Archery': 'Bhutan',
'Golf': 'Scotland',
'Sumo': 'Japan',
'Taekwondo': 'South Korea'}
s = pd.Series(sports, index=['Golf', 'Sumo', 'Hockey'])
s
Out[12]:
In [13]:
# Rebuild s from the full sports dict (string labels) for the lookup examples that follow.
sports = {'Archery': 'Bhutan',
'Golf': 'Scotland',
'Sumo': 'Japan',
'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s
Out[13]:
In [14]:
# Positional lookup: the element at integer position 3.
s.iloc[3]
Out[14]:
In [15]:
# Label lookup: the element whose index label is 'Golf'.
s.loc['Golf']
Out[15]:
In [16]:
# The original `s[3]` used an integer key on a string-labelled Series and relied on
# pandas falling back to positional lookup. That fallback is deprecated in recent
# pandas, so the position is requested explicitly with .iloc.
s.iloc[3]
Out[16]:
In [17]:
# Plain [] with a string key: looks up the label 'Golf'.
s['Golf']
Out[17]:
In [18]:
# Same countries as before, but keyed by consecutive integers 99..102, so the
# resulting Series has an integer index.
sports = dict(zip(range(99, 103), ['Bhutan', 'Scotland', 'Japan', 'South Korea']))
s = pd.Series(data=sports)
In [19]:
# With an integer index, s[0] is a *label* lookup, not s.iloc[0]. The labels here are
# 99..102, so label 0 does not exist and pandas raises KeyError. The error is the point
# of this cell, so it is caught and shown instead of aborting "Run All".
try:
    s[0]
except KeyError as err:
    print('KeyError:', err)
In [20]:
# Explicit positional lookup: the first element, regardless of what the labels are.
s.iloc[0]
Out[20]:
In [21]:
# A small Series of floats used by the summing examples below.
s = pd.Series([100.00, 120.00, 101.00, 3.00])
s
Out[21]:
In [22]:
# Sum the Series the slow way: walk over its values in a plain Python loop.
total = 0
for item in s:
    total = total + item
print(total)
In [23]:
import numpy as np
# Same sum as the loop version, computed by a single vectorised NumPy call.
total = np.sum(s)
print(total)
In [24]:
# Create a large Series (10,000 entries) of random integers in [0, 1000).
# The legacy global NumPy RNG is seeded first so that the values shown by head()
# are the same on every run of the notebook.
np.random.seed(42)
s = pd.Series(np.random.randint(0,1000,10000))
s.head()
Out[24]:
In [25]:
# Number of elements in s.
len(s)
Out[25]:
In [26]:
%%timeit -n 100
summary = 0
for item in s:
summary+=item
In [27]:
%%timeit -n 100
# Time the vectorised equivalent of the loop above for comparison.
summary = np.sum(s)
In [28]:
# NOTE: this modifies s in place, so re-running the cell adds another 2 each time.
s+=2 #adds two to each item in s using broadcasting
s.head()
Out[28]:
In [29]:
# Add 2 to every element one label at a time (the slow, non-vectorised way).
# Series.items() replaces iteritems() and .at[...] replaces set_value(); both of the
# older methods have been removed from pandas.
for label, value in s.items():
    s.at[label] = value + 2
s.head()
Out[29]:
In [30]:
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
for label, value in s.iteritems():
s.loc[label]= value+2
In [31]:
%%timeit -n 10
# Time the broadcast version of the same update on a freshly built Series.
s = pd.Series(np.random.randint(0,1000,10000))
s+=2
In [32]:
# Start from three integers (no index given), then assign a string value through .loc
# under the label 'Animal', which is not one of the existing labels.
s = pd.Series([1, 2, 3])
s.loc['Animal'] = 'Bears'
s
Out[32]:
In [33]:
# Sport -> country Series with unique labels.
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
# Four countries that all share the same (duplicate) index label 'Cricket'.
cricket_loving_countries = pd.Series(['Australia',
                                      'Barbados',
                                      'Pakistan',
                                      'England'],
                                     index=['Cricket',
                                            'Cricket',
                                            'Cricket',
                                            'Cricket'])
# Series.append has been removed from pandas; pd.concat is the replacement. It returns
# a new Series and leaves both inputs unchanged.
all_countries = pd.concat([original_sports, cricket_loving_countries])
In [34]:
original_sports
Out[34]:
In [35]:
cricket_loving_countries
Out[35]:
In [36]:
all_countries
Out[36]:
In [37]:
# Look up the label 'Cricket', which was used for several entries of cricket_loving_countries.
all_countries.loc['Cricket']
Out[37]:
In [38]:
import pandas as pd

# Each purchase is one record with the same three fields.
purchase_fields = ['Name', 'Item Purchased', 'Cost']
purchase_1 = pd.Series(['Chris', 'Dog Food', 22.50], index=purchase_fields)
purchase_2 = pd.Series(['Kevyn', 'Kitty Litter', 2.50], index=purchase_fields)
purchase_3 = pd.Series(['Vinod', 'Bird Seed', 5.00], index=purchase_fields)

# One row per purchase; the row labels name the store (note 'Store 1' appears twice).
store_labels = ['Store 1', 'Store 1', 'Store 2']
df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=store_labels)
df.head()
Out[38]:
In [39]:
# Select by row label; 'Store 2' labels exactly one row of df.
df.loc['Store 2']
Out[39]:
In [40]:
# Inspect the type returned when the label matches a single row.
type(df.loc['Store 2'])
Out[40]:
In [41]:
# 'Store 1' labels two rows of df, so this selects both of them.
df.loc['Store 1']
Out[41]:
In [42]:
# Row label and column label in a single .loc call.
df.loc['Store 1', 'Cost']
Out[42]:
In [43]:
# Transpose: rows become columns and columns become rows.
df.T
Out[43]:
In [47]:
# After transposing, 'Cost' is a row label, so .loc can select it.
df.T.loc['Cost']
Out[47]:
In [45]:
# Plain [] with a column name selects that column.
df['Cost'] # the right and simple way to select a column
Out[45]:
In [50]:
# Chained indexing: .loc[...] followed by [...] is two separate lookups, and the
# intermediate result may be a copy. Prefer a single call: df.loc['Store 1', 'Cost'].
df.loc['Store 1']['Cost'] # Chaining will return copy!! Do not use chaining!!! Use .loc to slice (see below)
Out[50]:
In [49]:
# All rows (:) and only the 'Name' and 'Cost' columns, in one .loc call.
df.loc[:,['Name', 'Cost']]
Out[49]:
In [51]:
# The result is only displayed, not assigned, so df itself is left as it was.
df.drop('Store 1') # returns a copy
Out[51]:
In [52]:
df
Out[52]:
In [58]:
# drop() already returns a new frame, so neither an explicit copy nor inplace=True is
# needed; df keeps all of its rows.
# The previous version assigned the result of drop(..., inplace=True), which is always
# None, so copy_df1 was None and the cell displayed nothing.
copy_df = df.drop('Store 1')
copy_df1 = copy_df
copy_df1
In [59]:
copy_df
Out[59]:
In [61]:
df
Out[61]:
In [60]:
# IPython help lookup for DataFrame.drop (interactive aid only).
copy_df.drop?
In [62]:
# Remove the 'Name' column from copy_df in place using the del statement.
# NOTE: not idempotent; running this cell a second time fails because the column is gone.
del copy_df['Name']
copy_df
Out[62]:
In [63]:
# Assigning a scalar to a column name that does not exist yet adds the column,
# with that value (None here) in every row.
df['Location'] = None
df
Out[63]:
In [65]:
# Take the 'Cost' column out of df under its own name.
# NOTE(review): whether `costs` shares data with df depends on the pandas version and
# copy-on-write setting; confirm before relying on it.
costs = df['Cost']
costs
Out[65]:
In [66]:
# In-place broadcast add on the extracted column. Display df afterwards to see whether
# the change is visible there too (version-dependent; see the note where costs is created).
costs+=2
costs
Out[66]:
In [67]:
df
Out[67]:
In [68]:
# Shell escape: print the raw contents of olympics.csv (requires a `cat` command,
# and prints the whole file).
!cat olympics.csv
In [69]:
# First attempt: read the CSV with all default options and look at the first rows.
df = pd.read_csv('olympics.csv')
df.head()
Out[69]:
In [70]:
# Second attempt: skip the first line of the file and use the first column as the index.
df = pd.read_csv('olympics.csv', index_col = 0, skiprows=1)
df.head()
Out[70]:
In [71]:
# Inspect the column labels before renaming them.
df.columns
Out[71]:
In [72]:
def _clean_medal_column(col):
    """Return a readable name for one raw olympics.csv column label.

    Labels starting with '01', '02' or '03' become 'Gold', 'Silver' or 'Bronze'
    followed by whatever comes after the first four characters of the label.
    Labels starting with '№' have that character replaced by '#'.
    Any other label is returned unchanged.
    """
    medal_by_prefix = {'01': 'Gold', '02': 'Silver', '03': 'Bronze'}
    medal = medal_by_prefix.get(col[:2])
    if medal is not None:
        return medal + col[4:]
    if col[:1] == '№':
        return '#' + col[1:]
    return col


# One rename pass over all columns instead of one in-place rename per column.
df = df.rename(columns=_clean_medal_column)
df.head()
Out[72]:
In [73]:
# Element-wise comparison: produces one boolean per row (a boolean mask).
df['Gold'] > 0
Out[73]:
In [79]:
# where() keeps the frame's shape: rows that fail the condition stay in the result,
# with their values replaced by NaN rather than being removed.
only_gold = df.where(df['Gold'] > 0) # this won't drop the na data...
only_gold.head()
Out[79]:
In [75]:
# count() counts only the non-missing entries of the column.
only_gold['Gold'].count()
Out[75]:
In [76]:
# Same count on the unfiltered frame, for comparison with only_gold.
df['Gold'].count()
Out[76]:
In [77]:
# Remove the rows that contain missing values (the ones where() blanked out).
only_gold = only_gold.dropna()
only_gold.head()
Out[77]:
In [80]:
# Indexing with the boolean mask keeps only the rows where the mask is True,
# so no separate dropna() step is needed.
only_gold = df[df['Gold'] > 0] # this will automatically drop the NaN in Gold column, nice...
only_gold.head()
Out[80]:
In [81]:
# Number of rows where 'Gold' OR 'Gold.1' is positive. Masks are combined with |,
# and each comparison needs its own parentheses.
len(df[(df['Gold'] > 0) | (df['Gold.1'] > 0)])
Out[81]:
In [82]:
# Rows where 'Gold.1' is positive AND 'Gold' is exactly zero (masks combined with &).
df[(df['Gold.1'] > 0) & (df['Gold'] == 0)]
Out[82]:
In [83]:
df.head()
Out[83]:
In [84]:
# Copy the current index into a 'country' column first, so those labels are still
# available as a column after 'Gold' becomes the index.
df['country'] = df.index
df = df.set_index('Gold')
df.head()
Out[84]:
In [85]:
# Move the current index ('Gold') back into an ordinary column.
df = df.reset_index()
df.head()
Out[85]:
In [86]:
# Switch datasets: from here on df holds the contents of census.csv.
df = pd.read_csv('census.csv')
df.head()
Out[86]:
In [87]:
# List the distinct values present in the SUMLEV column.
df['SUMLEV'].unique()
Out[87]:
In [88]:
# Keep only the rows whose SUMLEV equals 50.
# NOTE(review): presumably 50 marks county-level rows; confirm against the census file documentation.
df=df[df['SUMLEV'] == 50]
df.head()
Out[88]:
In [89]:
# Keep the two name columns plus the yearly BIRTHS and POPESTIMATE columns
# for 2010 through 2015; everything else is dropped.
census_years = range(2010, 2016)
columns_to_keep = (
    ['STNAME', 'CTYNAME']
    + ['BIRTHS' + str(year) for year in census_years]
    + ['POPESTIMATE' + str(year) for year in census_years]
)
df = df[columns_to_keep]
df.head()
Out[89]:
In [90]:
# Use two columns as the index: STNAME as the outer level and CTYNAME as the inner level.
df = df.set_index(['STNAME', 'CTYNAME'])
df.head()
Out[90]:
In [91]:
# Look up one entry by giving a label for each of the two index levels.
df.loc['Michigan', 'Washtenaw County']
Out[91]:
In [92]:
# Look up several entries at once: a list of (STNAME, CTYNAME) tuples, one per row wanted.
df.loc[ [('Michigan', 'Washtenaw County'),
('Michigan', 'Wayne County')] ]
Out[92]:
In [93]:
# Homework: index the purchases by (Location, Name) and add one more purchase.
purchase_1 = pd.Series({'Name': 'Chris',
                        'Item Purchased': 'Dog Food',
                        'Cost': 22.50})
purchase_2 = pd.Series({'Name': 'Kevyn',
                        'Item Purchased': 'Kitty Litter',
                        'Cost': 2.50})
purchase_3 = pd.Series({'Name': 'Vinod',
                        'Item Purchased': 'Bird Seed',
                        'Cost': 5.00})
df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=['Store 1', 'Store 1', 'Store 2'])

# Two-level index: the existing store labels plus the 'Name' column.
df = df.set_index([df.index, 'Name'])
df.index.names = ['Location', 'Name']

# DataFrame.append has been removed from pandas. Build the new purchase as a one-row
# frame carrying the same two-level index, then concatenate it onto df.
new_purchase = pd.DataFrame(
    [{'Item Purchased': 'Kitty Food', 'Cost': 3.00}],
    index=pd.MultiIndex.from_tuples([('Store 2', 'Kevyn')], names=['Location', 'Name']),
)
df = pd.concat([df, new_purchase])
df
Out[93]:
In [101]:
# Switch datasets again: df now holds the contents of log.csv.
df = pd.read_csv('log.csv')
df
Out[101]:
In [99]:
# IPython help lookup for DataFrame.fillna (interactive aid only).
df.fillna?
In [102]:
# Make 'time' the index and order the rows by it, in one chained expression.
df = df.set_index('time').sort_index()
df
Out[102]:
In [104]:
# Turn the current index back into a column, then index by the ('time', 'user') pair.
df = df.reset_index().set_index(['time', 'user'])
df
Out[104]:
In [105]:
# Forward-fill: each missing value is replaced by the last non-missing value above it
# in the same column. DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df = df.ffill()
df
Out[105]:
In [ ]: