You are currently looking at version 1.0 of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the Jupyter Notebook FAQ course resource.


The Series Data Structure


In [1]:
!pip freeze > requirements.txt

In [2]:
import pandas as pd
pd.Series?

In [3]:
animals = ['Tiger', 'Bear', 'Moose']
pd.Series(animals)


Out[3]:
0    Tiger
1     Bear
2    Moose
dtype: object

In [4]:
numbers = [1, 2, 3]
pd.Series(numbers)


Out[4]:
0    1
1    2
2    3
dtype: int64

In [5]:
animals = ['Tiger', 'Bear', None]
pd.Series(animals)


Out[5]:
0    Tiger
1     Bear
2     None
dtype: object

In [6]:
numbers = [1, 2, None]
pd.Series(numbers)


Out[6]:
0    1.0
1    2.0
2    NaN
dtype: float64

In [7]:
import numpy as np
np.nan == None


Out[7]:
False

In [8]:
np.nan == np.nan


Out[8]:
False

In [9]:
np.isnan(np.nan)


Out[9]:
True

In [10]:
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s


Out[10]:
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [11]:
s.index


Out[11]:
Index(['Archery', 'Golf', 'Sumo', 'Taekwondo'], dtype='object')

In [12]:
s = pd.Series(['Tiger', 'Bear', 'Moose'], index=['India', 'America', 'Canada'])
s


Out[12]:
India      Tiger
America     Bear
Canada     Moose
dtype: object

In [13]:
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports, index=['Golf', 'Sumo', 'Hockey'])
s


Out[13]:
Golf      Scotland
Sumo         Japan
Hockey         NaN
dtype: object

Querying a Series


In [14]:
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s


Out[14]:
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [15]:
s.iloc[3]


Out[15]:
'South Korea'

In [16]:
s.loc['Golf']


Out[16]:
'Scotland'

In [17]:
s[3]


Out[17]:
'South Korea'

In [18]:
s['Golf']


Out[18]:
'Scotland'

In [19]:
sports = {99: 'Bhutan',
          100: 'Scotland',
          101: 'Japan',
          102: 'South Korea'}
s = pd.Series(sports)

In [20]:
s[0] #This won't call s.iloc[0] as one might expect, it generates an error instead


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-20-a5f43d492595> in <module>()
----> 1 s[0] #This won't call s.iloc[0] as one might expect, it generates an error instead

/home/joao/Projects/axado/venv_ctools/lib/python3.5/site-packages/pandas/core/series.py in __getitem__(self, key)
    599         key = com._apply_if_callable(key, self)
    600         try:
--> 601             result = self.index.get_value(self, key)
    602 
    603             if not is_scalar(result):

/home/joao/Projects/axado/venv_ctools/lib/python3.5/site-packages/pandas/indexes/base.py in get_value(self, series, key)
   2137         try:
   2138             return self._engine.get_value(s, k,
-> 2139                                           tz=getattr(series.dtype, 'tz', None))
   2140         except KeyError as e1:
   2141             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:

pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3338)()

pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3041)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8141)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8085)()

KeyError: 0

In [21]:
s = pd.Series([100.00, 120.00, 101.00, 3.00])
s


Out[21]:
0    100.0
1    120.0
2    101.0
3      3.0
dtype: float64

In [22]:
total = 0
for item in s:
    total+=item
print(total)


324.0

In [23]:
import numpy as np

total = np.sum(s)
print(total)


324.0

In [24]:
#this creates a big series of random numbers
s = pd.Series(np.random.randint(0,1000,10000))
s.head()


Out[24]:
0    503
1    183
2    958
3    792
4    364
dtype: int64

In [25]:
len(s)


Out[25]:
10000

In [26]:
%%timeit -n 100
summary = 0
for item in s:
    summary+=item


100 loops, best of 3: 977 µs per loop

In [27]:
%%timeit -n 100
summary = np.sum(s)


100 loops, best of 3: 115 µs per loop

In [28]:
s+=2 #adds two to each item in s using broadcasting
s.head()


Out[28]:
0    505
1    185
2    960
3    794
4    366
dtype: int64

In [29]:
for label, value in s.iteritems():
    s.set_value(label, value+2)
s.head()


Out[29]:
0    507
1    187
2    962
3    796
4    368
dtype: int64

In [30]:
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
for label, value in s.iteritems():
    s.loc[label]= value+2


10 loops, best of 3: 723 ms per loop

In [31]:
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
s+=2


10 loops, best of 3: 655 µs per loop

In [32]:
s = pd.Series([1, 2, 3])
s.loc['Animal'] = 'Bears'
s


Out[32]:
0             1
1             2
2             3
Animal    Bears
dtype: object

In [33]:
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
cricket_loving_countries = pd.Series(['Australia',
                                      'Barbados',
                                      'Pakistan',
                                      'England'], 
                                   index=['Cricket',
                                          'Cricket',
                                          'Cricket',
                                          'Cricket'])
all_countries = original_sports.append(cricket_loving_countries)

In [34]:
original_sports


Out[34]:
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [35]:
cricket_loving_countries


Out[35]:
Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

In [36]:
all_countries


Out[36]:
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
Cricket        Australia
Cricket         Barbados
Cricket         Pakistan
Cricket          England
dtype: object

In [37]:
all_countries.loc['Cricket']


Out[37]:
Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

The DataFrame Data Structure


In [38]:
import pandas as pd
purchase_1 = pd.Series({'Name': 'Chris',
                        'Item Purchased': 'Dog Food',
                        'Cost': 22.50})
purchase_2 = pd.Series({'Name': 'Kevyn',
                        'Item Purchased': 'Kitty Litter',
                        'Cost': 2.50})
purchase_3 = pd.Series({'Name': 'Vinod',
                        'Item Purchased': 'Bird Seed',
                        'Cost': 5.00})
df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=['Store 1', 'Store 1', 'Store 2'])
df.head()


Out[38]:
Cost Item Purchased Name
Store 1 22.5 Dog Food Chris
Store 1 2.5 Kitty Litter Kevyn
Store 2 5.0 Bird Seed Vinod

In [39]:
df.loc['Store 2']


Out[39]:
Cost                      5
Item Purchased    Bird Seed
Name                  Vinod
Name: Store 2, dtype: object

In [40]:
type(df.loc['Store 2'])


Out[40]:
pandas.core.series.Series

In [41]:
df.loc['Store 1']


Out[41]:
Cost Item Purchased Name
Store 1 22.5 Dog Food Chris
Store 1 2.5 Kitty Litter Kevyn

In [42]:
df.loc['Store 1', 'Cost']


Out[42]:
Store 1    22.5
Store 1     2.5
Name: Cost, dtype: float64

In [43]:
df.T


Out[43]:
Store 1 Store 1 Store 2
Cost 22.5 2.5 5
Item Purchased Dog Food Kitty Litter Bird Seed
Name Chris Kevyn Vinod

In [44]:
df.T.loc['Cost']


Out[44]:
Store 1    22.5
Store 1     2.5
Store 2       5
Name: Cost, dtype: object

In [45]:
df['Cost']


Out[45]:
Store 1    22.5
Store 1     2.5
Store 2     5.0
Name: Cost, dtype: float64

In [46]:
df.loc['Store 1']['Cost']


Out[46]:
Store 1    22.5
Store 1     2.5
Name: Cost, dtype: float64

In [47]:
df.loc[:,['Name', 'Cost']]


Out[47]:
Name Cost
Store 1 Chris 22.5
Store 1 Kevyn 2.5
Store 2 Vinod 5.0

In [48]:
df.drop('Store 1')


Out[48]:
Cost Item Purchased Name
Store 2 5.0 Bird Seed Vinod

In [49]:
df


Out[49]:
Cost Item Purchased Name
Store 1 22.5 Dog Food Chris
Store 1 2.5 Kitty Litter Kevyn
Store 2 5.0 Bird Seed Vinod

In [50]:
copy_df = df.copy()
copy_df = copy_df.drop('Store 1')
copy_df


Out[50]:
Cost Item Purchased Name
Store 2 5.0 Bird Seed Vinod

In [51]:
copy_df.drop?

In [52]:
del copy_df['Name']
copy_df


Out[52]:
Cost Item Purchased
Store 2 5.0 Bird Seed

In [53]:
df['Location'] = None
df


Out[53]:
Cost Item Purchased Name Location
Store 1 22.5 Dog Food Chris None
Store 1 2.5 Kitty Litter Kevyn None
Store 2 5.0 Bird Seed Vinod None

Dataframe Indexing and Loading


In [54]:
costs = df['Cost']
costs


Out[54]:
Store 1    22.5
Store 1     2.5
Store 2     5.0
Name: Cost, dtype: float64

In [55]:
costs+=2
costs


Out[55]:
Store 1    24.5
Store 1     4.5
Store 2     7.0
Name: Cost, dtype: float64

In [56]:
df


Out[56]:
Cost Item Purchased Name Location
Store 1 24.5 Dog Food Chris None
Store 1 4.5 Kitty Litter Kevyn None
Store 2 7.0 Bird Seed Vinod None

In [57]:
!cat olympics.csv


cat: olympics.csv: Arquivo ou diretório não encontrado

In [58]:
df = pd.read_csv('olympics.csv')
df.head()


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-58-b50ace67200d> in <module>()
----> 1 df = pd.read_csv('olympics.csv')
      2 df.head()

/home/joao/Projects/axado/venv_ctools/lib/python3.5/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    643                     skip_blank_lines=skip_blank_lines)
    644 
--> 645         return _read(filepath_or_buffer, kwds)
    646 
    647     parser_f.__name__ = name

/home/joao/Projects/axado/venv_ctools/lib/python3.5/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    386 
    387     # Create the parser.
--> 388     parser = TextFileReader(filepath_or_buffer, **kwds)
    389 
    390     if (nrows is not None) and (chunksize is not None):

/home/joao/Projects/axado/venv_ctools/lib/python3.5/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    727             self.options['has_index_names'] = kwds['has_index_names']
    728 
--> 729         self._make_engine(self.engine)
    730 
    731     def close(self):

/home/joao/Projects/axado/venv_ctools/lib/python3.5/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
    920     def _make_engine(self, engine='c'):
    921         if engine == 'c':
--> 922             self._engine = CParserWrapper(self.f, **self.options)
    923         else:
    924             if engine == 'python':

/home/joao/Projects/axado/venv_ctools/lib/python3.5/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1387         kwds['allow_leading_cols'] = self.index_col is not False
   1388 
-> 1389         self._reader = _parser.TextReader(src, **kwds)
   1390 
   1391         # XXX

pandas/parser.pyx in pandas.parser.TextReader.__cinit__ (pandas/parser.c:4019)()

pandas/parser.pyx in pandas.parser.TextReader._setup_parser_source (pandas/parser.c:7967)()

FileNotFoundError: File b'olympics.csv' does not exist

In [ ]:
df = pd.read_csv('olympics.csv', index_col = 0, skiprows=1)
df.head()

In [ ]:
df.columns

In [ ]:
for col in df.columns:
    if col[:2]=='01':
        df.rename(columns={col:'Gold' + col[4:]}, inplace=True)
    if col[:2]=='02':
        df.rename(columns={col:'Silver' + col[4:]}, inplace=True)
    if col[:2]=='03':
        df.rename(columns={col:'Bronze' + col[4:]}, inplace=True)
    if col[:1]=='№':
        df.rename(columns={col:'#' + col[1:]}, inplace=True) 

df.head()

Querying a DataFrame


In [ ]:
df['Gold'] > 0

In [ ]:
only_gold = df.where(df['Gold'] > 0)
only_gold.head()

In [ ]:
only_gold['Gold'].count()

In [ ]:
df['Gold'].count()

In [ ]:
only_gold = only_gold.dropna()
only_gold.head()

In [ ]:
only_gold = df[df['Gold'] > 0]
only_gold.head()

In [ ]:
len(df[(df['Gold'] > 0) | (df['Gold.1'] > 0)])

In [ ]:
df[(df['Gold.1'] > 0) & (df['Gold'] == 0)]

Indexing Dataframes


In [ ]:
df.head()

In [ ]:
df['country'] = df.index
df = df.set_index('Gold')
df.head()

In [ ]:
df = df.reset_index()
df.head()

In [ ]:
df = pd.read_csv('census.csv')
df.head()

In [ ]:
df['SUMLEV'].unique()

In [ ]:
df=df[df['SUMLEV'] == 50]
df.head()

In [ ]:
columns_to_keep = ['STNAME',
                   'CTYNAME',
                   'BIRTHS2010',
                   'BIRTHS2011',
                   'BIRTHS2012',
                   'BIRTHS2013',
                   'BIRTHS2014',
                   'BIRTHS2015',
                   'POPESTIMATE2010',
                   'POPESTIMATE2011',
                   'POPESTIMATE2012',
                   'POPESTIMATE2013',
                   'POPESTIMATE2014',
                   'POPESTIMATE2015']
df = df[columns_to_keep]
df.head()

In [ ]:
df = df.set_index(['STNAME', 'CTYNAME'])
df.head()

In [ ]:
df.loc['Michigan', 'Washtenaw County']

In [ ]:
df.loc[ [('Michigan', 'Washtenaw County'),
         ('Michigan', 'Wayne County')] ]

Missing values


In [ ]:
df = pd.read_csv('log.csv')
df

In [ ]:
df.fillna?

In [ ]:
df = df.set_index('time')
df = df.sort_index()
df

In [ ]:
df = df.reset_index()
df = df.set_index(['time', 'user'])
df

In [ ]:
df = df.fillna(method='ffill')
df.head()