You are currently looking at version 1.0 of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the Jupyter Notebook FAQ course resource.

The Series Data Structure



In [1]:

    
!pip freeze > requirements.txt



In [2]:

    
import pandas as pd
pd.Series?



In [3]:

    
animals = ['Tiger', 'Bear', 'Moose']
pd.Series(animals)









    Out[3]:





0    Tiger
1     Bear
2    Moose
dtype: object



In [4]:

    
numbers = [1, 2, 3]
pd.Series(numbers)









    Out[4]:





0    1
1    2
2    3
dtype: int64



In [5]:

    
animals = ['Tiger', 'Bear', None]
pd.Series(animals)









    Out[5]:





0    Tiger
1     Bear
2     None
dtype: object



In [6]:

    
numbers = [1, 2, None]
pd.Series(numbers)









    Out[6]:





0    1.0
1    2.0
2    NaN
dtype: float64



In [7]:

    
import numpy as np
np.nan == None









    Out[7]:





False



In [8]:

    
np.nan == np.nan









    Out[8]:





False



In [9]:

    
np.isnan(np.nan)









    Out[9]:





True



In [10]:

    
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s









    Out[10]:





Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object



In [11]:

    
s.index









    Out[11]:





Index(['Archery', 'Golf', 'Sumo', 'Taekwondo'], dtype='object')



In [12]:

    
s = pd.Series(['Tiger', 'Bear', 'Moose'], index=['India', 'America', 'Canada'])
s









    Out[12]:





India      Tiger
America     Bear
Canada     Moose
dtype: object



In [13]:

    
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports, index=['Golf', 'Sumo', 'Hockey'])
s









    Out[13]:





Golf      Scotland
Sumo         Japan
Hockey         NaN
dtype: object

Querying a Series



In [14]:

    
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s









    Out[14]:





Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object



In [15]:

    
s.iloc[3]









    Out[15]:





'South Korea'



In [16]:

    
s.loc['Golf']









    Out[16]:





'Scotland'



In [17]:

    
s[3]









    Out[17]:





'South Korea'



In [18]:

    
s['Golf']









    Out[18]:





'Scotland'



In [19]:

    
sports = {99: 'Bhutan',
          100: 'Scotland',
          101: 'Japan',
          102: 'South Korea'}
s = pd.Series(sports)



In [20]:

    
s[0] #This won't call s.iloc[0] as one might expect, it generates an error instead









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-20-a5f43d492595> in <module>()
----> 1 s[0] #This won't call s.iloc[0] as one might expect, it generates an error instead

/home/joao/Projects/axado/venv_ctools/lib/python3.5/site-packages/pandas/core/series.py in __getitem__(self, key)
    599         key = com._apply_if_callable(key, self)
    600         try:
--> 601             result = self.index.get_value(self, key)
    602 
    603             if not is_scalar(result):

/home/joao/Projects/axado/venv_ctools/lib/python3.5/site-packages/pandas/indexes/base.py in get_value(self, series, key)
   2137         try:
   2138             return self._engine.get_value(s, k,
-> 2139                                           tz=getattr(series.dtype, 'tz', None))
   2140         except KeyError as e1:
   2141             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:

pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3338)()

pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3041)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4024)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8141)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8085)()

KeyError: 0



In [21]:

    
s = pd.Series([100.00, 120.00, 101.00, 3.00])
s









    Out[21]:





0    100.0
1    120.0
2    101.0
3      3.0
dtype: float64



In [22]:

    
total = 0
for item in s:
    total+=item
print(total)



In [23]:

    
import numpy as np

total = np.sum(s)
print(total)



In [24]:

    
#this creates a big series of random numbers
s = pd.Series(np.random.randint(0,1000,10000))
s.head()









    Out[24]:





0    503
1    183
2    958
3    792
4    364
dtype: int64



In [25]:

    
len(s)









    Out[25]:





10000



In [26]:

    
%%timeit -n 100
summary = 0
for item in s:
    summary+=item









    



100 loops, best of 3: 977 µs per loop



In [27]:

    
%%timeit -n 100
summary = np.sum(s)









    



100 loops, best of 3: 115 µs per loop



In [28]:

    
s+=2 #adds two to each item in s using broadcasting
s.head()









    Out[28]:





0    505
1    185
2    960
3    794
4    366
dtype: int64



In [29]:

    
for label, value in s.iteritems():
    s.set_value(label, value+2)
s.head()









    Out[29]:





0    507
1    187
2    962
3    796
4    368
dtype: int64



In [30]:

    
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
for label, value in s.iteritems():
    s.loc[label]= value+2









    



10 loops, best of 3: 723 ms per loop



In [31]:

    
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
s+=2









    



10 loops, best of 3: 655 µs per loop



In [32]:

    
s = pd.Series([1, 2, 3])
s.loc['Animal'] = 'Bears'
s









    Out[32]:





0             1
1             2
2             3
Animal    Bears
dtype: object



In [33]:

    
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
cricket_loving_countries = pd.Series(['Australia',
                                      'Barbados',
                                      'Pakistan',
                                      'England'], 
                                   index=['Cricket',
                                          'Cricket',
                                          'Cricket',
                                          'Cricket'])
all_countries = original_sports.append(cricket_loving_countries)



In [34]:

    
original_sports









    Out[34]:





Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object



In [35]:

    
cricket_loving_countries









    Out[35]:





Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object



In [36]:

    
all_countries









    Out[36]:





Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
Cricket        Australia
Cricket         Barbados
Cricket         Pakistan
Cricket          England
dtype: object



In [37]:

    
all_countries.loc['Cricket']









    Out[37]:





Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

The DataFrame Data Structure



In [38]:

    
import pandas as pd
purchase_1 = pd.Series({'Name': 'Chris',
                        'Item Purchased': 'Dog Food',
                        'Cost': 22.50})
purchase_2 = pd.Series({'Name': 'Kevyn',
                        'Item Purchased': 'Kitty Litter',
                        'Cost': 2.50})
purchase_3 = pd.Series({'Name': 'Vinod',
                        'Item Purchased': 'Bird Seed',
                        'Cost': 5.00})
df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=['Store 1', 'Store 1', 'Store 2'])
df.head()









    Out[38]:






  
    
      
      Cost
      Item Purchased
      Name
    
  
  
    
      Store 1
      22.5
      Dog Food
      Chris
    
    
      Store 1
      2.5
      Kitty Litter
      Kevyn
    
    
      Store 2
      5.0
      Bird Seed
      Vinod



In [39]:

    
df.loc['Store 2']









    Out[39]:





Cost                      5
Item Purchased    Bird Seed
Name                  Vinod
Name: Store 2, dtype: object



In [40]:

    
type(df.loc['Store 2'])









    Out[40]:





pandas.core.series.Series



In [41]:

    
df.loc['Store 1']









    Out[41]:






  
    
      
      Cost
      Item Purchased
      Name
    
  
  
    
      Store 1
      22.5
      Dog Food
      Chris
    
    
      Store 1
      2.5
      Kitty Litter
      Kevyn



In [42]:

    
df.loc['Store 1', 'Cost']









    Out[42]:





Store 1    22.5
Store 1     2.5
Name: Cost, dtype: float64



In [43]:

    
df.T









    Out[43]:






  
    
      
      Store 1
      Store 1
      Store 2
    
  
  
    
      Cost
      22.5
      2.5
      5
    
    
      Item Purchased
      Dog Food
      Kitty Litter
      Bird Seed
    
    
      Name
      Chris
      Kevyn
      Vinod



In [44]:

    
df.T.loc['Cost']









    Out[44]:





Store 1    22.5
Store 1     2.5
Store 2       5
Name: Cost, dtype: object



In [45]:

    
df['Cost']









    Out[45]:





Store 1    22.5
Store 1     2.5
Store 2     5.0
Name: Cost, dtype: float64



In [46]:

    
df.loc['Store 1']['Cost']









    Out[46]:





Store 1    22.5
Store 1     2.5
Name: Cost, dtype: float64



In [47]:

    
df.loc[:,['Name', 'Cost']]









    Out[47]:






  
    
      
      Name
      Cost
    
  
  
    
      Store 1
      Chris
      22.5
    
    
      Store 1
      Kevyn
      2.5
    
    
      Store 2
      Vinod
      5.0



In [48]:

    
df.drop('Store 1')









    Out[48]:






  
    
      
      Cost
      Item Purchased
      Name
    
  
  
    
      Store 2
      5.0
      Bird Seed
      Vinod



In [49]:

    
df









    Out[49]:






  
    
      
      Cost
      Item Purchased
      Name
    
  
  
    
      Store 1
      22.5
      Dog Food
      Chris
    
    
      Store 1
      2.5
      Kitty Litter
      Kevyn
    
    
      Store 2
      5.0
      Bird Seed
      Vinod



In [50]:

    
copy_df = df.copy()
copy_df = copy_df.drop('Store 1')
copy_df









    Out[50]:






  
    
      
      Cost
      Item Purchased
      Name
    
  
  
    
      Store 2
      5.0
      Bird Seed
      Vinod



In [51]:

    
copy_df.drop?



In [52]:

    
del copy_df['Name']
copy_df









    Out[52]:






  
    
      
      Cost
      Item Purchased
    
  
  
    
      Store 2
      5.0
      Bird Seed



In [53]:

    
df['Location'] = None
df









    Out[53]:






  
    
      
      Cost
      Item Purchased
      Name
      Location
    
  
  
    
      Store 1
      22.5
      Dog Food
      Chris
      None
    
    
      Store 1
      2.5
      Kitty Litter
      Kevyn
      None
    
    
      Store 2
      5.0
      Bird Seed
      Vinod
      None

Dataframe Indexing and Loading



In [54]:

    
costs = df['Cost']
costs









    Out[54]:





Store 1    22.5
Store 1     2.5
Store 2     5.0
Name: Cost, dtype: float64



In [55]:

    
costs+=2
costs









    Out[55]:





Store 1    24.5
Store 1     4.5
Store 2     7.0
Name: Cost, dtype: float64



In [56]:

    
df









    Out[56]:






  
    
      
      Cost
      Item Purchased
      Name
      Location
    
  
  
    
      Store 1
      24.5
      Dog Food
      Chris
      None
    
    
      Store 1
      4.5
      Kitty Litter
      Kevyn
      None
    
    
      Store 2
      7.0
      Bird Seed
      Vinod
      None



In [57]:

    
!cat olympics.csv









    



cat: olympics.csv: Arquivo ou diretório não encontrado



In [58]:

    
df = pd.read_csv('olympics.csv')
df.head()









    



---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-58-b50ace67200d> in <module>()
----> 1 df = pd.read_csv('olympics.csv')
      2 df.head()

/home/joao/Projects/axado/venv_ctools/lib/python3.5/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    643                     skip_blank_lines=skip_blank_lines)
    644 
--> 645         return _read(filepath_or_buffer, kwds)
    646 
    647     parser_f.__name__ = name

/home/joao/Projects/axado/venv_ctools/lib/python3.5/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    386 
    387     # Create the parser.
--> 388     parser = TextFileReader(filepath_or_buffer, **kwds)
    389 
    390     if (nrows is not None) and (chunksize is not None):

/home/joao/Projects/axado/venv_ctools/lib/python3.5/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    727             self.options['has_index_names'] = kwds['has_index_names']
    728 
--> 729         self._make_engine(self.engine)
    730 
    731     def close(self):

/home/joao/Projects/axado/venv_ctools/lib/python3.5/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
    920     def _make_engine(self, engine='c'):
    921         if engine == 'c':
--> 922             self._engine = CParserWrapper(self.f, **self.options)
    923         else:
    924             if engine == 'python':

/home/joao/Projects/axado/venv_ctools/lib/python3.5/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1387         kwds['allow_leading_cols'] = self.index_col is not False
   1388 
-> 1389         self._reader = _parser.TextReader(src, **kwds)
   1390 
   1391         # XXX

pandas/parser.pyx in pandas.parser.TextReader.__cinit__ (pandas/parser.c:4019)()

pandas/parser.pyx in pandas.parser.TextReader._setup_parser_source (pandas/parser.c:7967)()

FileNotFoundError: File b'olympics.csv' does not exist



In [ ]:

    
df = pd.read_csv('olympics.csv', index_col = 0, skiprows=1)
df.head()



In [ ]:

    
df.columns



In [ ]:

    
for col in df.columns:
    if col[:2]=='01':
        df.rename(columns={col:'Gold' + col[4:]}, inplace=True)
    if col[:2]=='02':
        df.rename(columns={col:'Silver' + col[4:]}, inplace=True)
    if col[:2]=='03':
        df.rename(columns={col:'Bronze' + col[4:]}, inplace=True)
    if col[:1]=='№':
        df.rename(columns={col:'#' + col[1:]}, inplace=True) 

df.head()

Querying a DataFrame



In [ ]:

    
df['Gold'] > 0



In [ ]:

    
only_gold = df.where(df['Gold'] > 0)
only_gold.head()



In [ ]:

    
only_gold['Gold'].count()



In [ ]:

    
df['Gold'].count()



In [ ]:

    
only_gold = only_gold.dropna()
only_gold.head()



In [ ]:

    
only_gold = df[df['Gold'] > 0]
only_gold.head()



In [ ]:

    
len(df[(df['Gold'] > 0) | (df['Gold.1'] > 0)])



In [ ]:

    
df[(df['Gold.1'] > 0) & (df['Gold'] == 0)]

Indexing Dataframes



In [ ]:

    
df.head()



In [ ]:

    
df['country'] = df.index
df = df.set_index('Gold')
df.head()



In [ ]:

    
df = df.reset_index()
df.head()



In [ ]:

    
df = pd.read_csv('census.csv')
df.head()



In [ ]:

    
df['SUMLEV'].unique()



In [ ]:

    
df=df[df['SUMLEV'] == 50]
df.head()



In [ ]:

    
columns_to_keep = ['STNAME',
                   'CTYNAME',
                   'BIRTHS2010',
                   'BIRTHS2011',
                   'BIRTHS2012',
                   'BIRTHS2013',
                   'BIRTHS2014',
                   'BIRTHS2015',
                   'POPESTIMATE2010',
                   'POPESTIMATE2011',
                   'POPESTIMATE2012',
                   'POPESTIMATE2013',
                   'POPESTIMATE2014',
                   'POPESTIMATE2015']
df = df[columns_to_keep]
df.head()



In [ ]:

    
df = df.set_index(['STNAME', 'CTYNAME'])
df.head()



In [ ]:

    
df.loc['Michigan', 'Washtenaw County']



In [ ]:

    
df.loc[ [('Michigan', 'Washtenaw County'),
         ('Michigan', 'Wayne County')] ]

Missing values



In [ ]:

    
df = pd.read_csv('log.csv')
df



In [ ]:

    
df.fillna?



In [ ]:

    
df = df.set_index('time')
df = df.sort_index()
df



In [ ]:

    
df = df.reset_index()
df = df.set_index(['time', 'user'])
df



In [ ]:

    
df = df.fillna(method='ffill')
df.head()

	Cost	Item Purchased	Name
Store 1	22.5	Dog Food	Chris
Store 1	2.5	Kitty Litter	Kevyn
Store 2	5.0	Bird Seed	Vinod

	Store 1	Store 1	Store 2
Cost	22.5	2.5	5
Item Purchased	Dog Food	Kitty Litter	Bird Seed
Name	Chris	Kevyn	Vinod

	Cost	Item Purchased	Name	Location
Store 1	24.5	Dog Food	Chris	None
Store 1	4.5	Kitty Litter	Kevyn	None
Store 2	7.0	Bird Seed	Vinod	None