In [11]:
import pandas as pd
surveys_df = pd.read_csv("surveys.csv",
                         keep_default_na=False, na_values=[""])
surveys_df


Out[11]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
5 6 7 16 1977 1 PF M 14.0 NaN
6 7 7 16 1977 2 PE F NaN NaN
7 8 7 16 1977 1 DM M 37.0 NaN
8 9 7 16 1977 1 DM F 34.0 NaN
9 10 7 16 1977 6 PF F 20.0 NaN
10 11 7 16 1977 5 DS F 53.0 NaN
11 12 7 16 1977 7 DM M 38.0 NaN
12 13 7 16 1977 3 DM M 35.0 NaN
13 14 7 16 1977 8 DM NaN NaN NaN
14 15 7 16 1977 6 DM F 36.0 NaN
15 16 7 16 1977 4 DM F 36.0 NaN
16 17 7 16 1977 3 DS F 48.0 NaN
17 18 7 16 1977 2 PP M 22.0 NaN
18 19 7 16 1977 4 PF NaN NaN NaN
19 20 7 17 1977 11 DS F 48.0 NaN
20 21 7 17 1977 14 DM F 34.0 NaN
21 22 7 17 1977 15 NL F 31.0 NaN
22 23 7 17 1977 13 DM M 36.0 NaN
23 24 7 17 1977 13 SH M 21.0 NaN
24 25 7 17 1977 9 DM M 35.0 NaN
25 26 7 17 1977 15 DM M 31.0 NaN
26 27 7 17 1977 15 DM M 36.0 NaN
27 28 7 17 1977 11 DM M 38.0 NaN
28 29 7 17 1977 11 PP M NaN NaN
29 30 7 17 1977 10 DS F 52.0 NaN
... ... ... ... ... ... ... ... ... ...
35519 35520 12 31 2002 9 SF NaN 24.0 36.0
35520 35521 12 31 2002 9 DM M 37.0 48.0
35521 35522 12 31 2002 9 DM F 35.0 45.0
35522 35523 12 31 2002 9 DM F 36.0 44.0
35523 35524 12 31 2002 9 PB F 25.0 27.0
35524 35525 12 31 2002 9 OL M 21.0 26.0
35525 35526 12 31 2002 8 OT F 20.0 24.0
35526 35527 12 31 2002 13 DO F 33.0 43.0
35527 35528 12 31 2002 13 US NaN NaN NaN
35528 35529 12 31 2002 13 PB F 25.0 25.0
35529 35530 12 31 2002 13 OT F 20.0 NaN
35530 35531 12 31 2002 13 PB F 27.0 NaN
35531 35532 12 31 2002 14 DM F 34.0 43.0
35532 35533 12 31 2002 14 DM F 36.0 48.0
35533 35534 12 31 2002 14 DM M 37.0 56.0
35534 35535 12 31 2002 14 DM M 37.0 53.0
35535 35536 12 31 2002 14 DM F 35.0 42.0
35536 35537 12 31 2002 14 DM F 36.0 46.0
35537 35538 12 31 2002 15 PB F 26.0 31.0
35538 35539 12 31 2002 15 SF M 26.0 68.0
35539 35540 12 31 2002 15 PB F 26.0 23.0
35540 35541 12 31 2002 15 PB F 24.0 31.0
35541 35542 12 31 2002 15 PB F 26.0 29.0
35542 35543 12 31 2002 15 PB F 27.0 34.0
35543 35544 12 31 2002 15 US NaN NaN NaN
35544 35545 12 31 2002 15 AH NaN NaN NaN
35545 35546 12 31 2002 15 AH NaN NaN NaN
35546 35547 12 31 2002 10 RM F 15.0 14.0
35547 35548 12 31 2002 7 DO M 36.0 51.0
35548 35549 12 31 2002 5 NaN NaN NaN NaN

35549 rows × 9 columns


In [4]:
species_df = pd.read_csv("species.csv",
                         keep_default_na=False, na_values=[""])
species_df


Out[4]:
species_id genus species taxa
0 AB Amphispiza bilineata Bird
1 AH Ammospermophilus harrisi Rodent
2 AS Ammodramus savannarum Bird
3 BA Baiomys taylori Rodent
4 CB Campylorhynchus brunneicapillus Bird
5 CM Calamospiza melanocorys Bird
6 CQ Callipepla squamata Bird
7 CS Crotalus scutalatus Reptile
8 CT Cnemidophorus tigris Reptile
9 CU Cnemidophorus uniparens Reptile
10 CV Crotalus viridis Reptile
11 DM Dipodomys merriami Rodent
12 DO Dipodomys ordii Rodent
13 DS Dipodomys spectabilis Rodent
14 DX Dipodomys sp. Rodent
15 EO Eumeces obsoletus Reptile
16 GS Gambelia silus Reptile
17 NL Neotoma albigula Rodent
18 NX Neotoma sp. Rodent
19 OL Onychomys leucogaster Rodent
20 OT Onychomys torridus Rodent
21 OX Onychomys sp. Rodent
22 PB Chaetodipus baileyi Rodent
23 PC Pipilo chlorurus Bird
24 PE Peromyscus eremicus Rodent
25 PF Perognathus flavus Rodent
26 PG Pooecetes gramineus Bird
27 PH Perognathus hispidus Rodent
28 PI Chaetodipus intermedius Rodent
29 PL Peromyscus leucopus Rodent
30 PM Peromyscus maniculatus Rodent
31 PP Chaetodipus penicillatus Rodent
32 PU Pipilo fuscus Bird
33 PX Chaetodipus sp. Rodent
34 RF Reithrodontomys fulvescens Rodent
35 RM Reithrodontomys megalotis Rodent
36 RO Reithrodontomys montanus Rodent
37 RX Reithrodontomys sp. Rodent
38 SA Sylvilagus audubonii Rabbit
39 SB Spizella breweri Bird
40 SC Sceloporus clarki Reptile
41 SF Sigmodon fulviventer Rodent
42 SH Sigmodon hispidus Rodent
43 SO Sigmodon ochrognathus Rodent
44 SS Spermophilus spilosoma Rodent
45 ST Spermophilus tereticaudus Rodent
46 SU Sceloporus undulatus Reptile
47 SX Sigmodon sp. Rodent
48 UL Lizard sp. Reptile
49 UP Pipilo sp. Bird
50 UR Rodent sp. Rodent
51 US Sparrow sp. Bird
52 ZL Zonotrichia leucophrys Bird
53 ZM Zenaida macroura Bird

In [14]:
merged_inner = pd.merge(left=surveys_df,right=species_df, left_on='species_id', right_on='species_id')
# in this case `species_id` is the only column name in  both dataframes, so if we skippd `left_on`
# and `right_on` arguments we would still get the same result

# what's the size of the output data?
merged_inner.shape
merged_inner


Out[14]:
record_id month day year plot_id species_id sex hindfoot_length weight genus species taxa
0 1 7 16 1977 2 NL M 32.0 NaN Neotoma albigula Rodent
1 2 7 16 1977 3 NL M 33.0 NaN Neotoma albigula Rodent
2 22 7 17 1977 15 NL F 31.0 NaN Neotoma albigula Rodent
3 38 7 17 1977 17 NL M 33.0 NaN Neotoma albigula Rodent
4 72 8 19 1977 2 NL M 31.0 NaN Neotoma albigula Rodent
5 106 8 20 1977 12 NL NaN NaN NaN Neotoma albigula Rodent
6 107 8 20 1977 18 NL NaN NaN NaN Neotoma albigula Rodent
7 121 8 21 1977 15 NL NaN NaN NaN Neotoma albigula Rodent
8 171 9 11 1977 12 NL NaN NaN NaN Neotoma albigula Rodent
9 194 9 12 1977 11 NL NaN NaN NaN Neotoma albigula Rodent
10 214 9 12 1977 17 NL NaN NaN NaN Neotoma albigula Rodent
11 224 9 13 1977 2 NL NaN NaN NaN Neotoma albigula Rodent
12 266 10 16 1977 2 NL NaN NaN NaN Neotoma albigula Rodent
13 272 10 16 1977 18 NL NaN NaN NaN Neotoma albigula Rodent
14 281 10 16 1977 20 NL NaN NaN NaN Neotoma albigula Rodent
15 282 10 16 1977 22 NL NaN NaN NaN Neotoma albigula Rodent
16 297 10 17 1977 11 NL NaN NaN NaN Neotoma albigula Rodent
17 314 10 17 1977 14 NL NaN NaN NaN Neotoma albigula Rodent
18 344 10 18 1977 3 NL NaN NaN NaN Neotoma albigula Rodent
19 349 11 12 1977 2 NL NaN NaN NaN Neotoma albigula Rodent
20 353 11 12 1977 18 NL NaN NaN NaN Neotoma albigula Rodent
21 363 11 12 1977 2 NL NaN NaN NaN Neotoma albigula Rodent
22 390 11 13 1977 11 NL NaN NaN NaN Neotoma albigula Rodent
23 393 11 13 1977 14 NL NaN NaN NaN Neotoma albigula Rodent
24 406 11 14 1977 12 NL NaN NaN NaN Neotoma albigula Rodent
25 435 12 10 1977 2 NL NaN NaN NaN Neotoma albigula Rodent
26 465 12 11 1977 5 NL NaN NaN NaN Neotoma albigula Rodent
27 469 12 11 1977 12 NL NaN NaN NaN Neotoma albigula Rodent
28 475 12 11 1977 11 NL NaN NaN NaN Neotoma albigula Rodent
29 486 12 11 1977 14 NL NaN NaN NaN Neotoma albigula Rodent
... ... ... ... ... ... ... ... ... ... ... ... ...
34756 26557 7 29 1997 7 PL F 20.0 22.0 Peromyscus leucopus Rodent
34757 26701 7 30 1997 3 PL M 21.0 19.0 Peromyscus leucopus Rodent
34758 26787 9 27 1997 7 PL F 21.0 16.0 Peromyscus leucopus Rodent
34759 26889 9 28 1997 16 PL F 21.0 10.0 Peromyscus leucopus Rodent
34760 26921 9 28 1997 5 PL M 22.0 20.0 Peromyscus leucopus Rodent
34761 26931 9 28 1997 10 PL F 10.0 19.0 Peromyscus leucopus Rodent
34762 26934 9 28 1997 10 PL M 21.0 22.0 Peromyscus leucopus Rodent
34763 26966 10 25 1997 7 PL M 20.0 16.0 Peromyscus leucopus Rodent
34764 27185 11 22 1997 7 PL F 21.0 22.0 Peromyscus leucopus Rodent
34765 27386 12 29 1997 3 PL M 20.0 17.0 Peromyscus leucopus Rodent
34766 27393 12 29 1997 15 PL M 19.0 16.0 Peromyscus leucopus Rodent
34767 27481 2 1 1998 15 PL M 19.0 18.0 Peromyscus leucopus Rodent
34768 27538 2 1 1998 10 PL F 20.0 20.0 Peromyscus leucopus Rodent
34769 27792 5 2 1998 7 PL F 20.0 8.0 Peromyscus leucopus Rodent
34770 27801 5 3 1998 15 PL M 21.0 22.0 Peromyscus leucopus Rodent
34771 27898 5 28 1998 23 PL M 20.0 20.0 Peromyscus leucopus Rodent
34772 27957 5 29 1998 9 PL M 21.0 16.0 Peromyscus leucopus Rodent
34773 27975 5 29 1998 10 PL M 20.0 13.0 Peromyscus leucopus Rodent
34774 30499 3 4 2000 24 PL M 20.0 21.0 Peromyscus leucopus Rodent
34775 23435 2 25 1996 13 PX NaN NaN NaN Chaetodipus sp. Rodent
34776 24669 10 13 1996 3 PX NaN NaN NaN Chaetodipus sp. Rodent
34777 24991 2 8 1997 23 PX F 19.0 20.0 Chaetodipus sp. Rodent
34778 25137 2 9 1997 13 PX M 20.0 18.0 Chaetodipus sp. Rodent
34779 28806 11 21 1998 7 PX NaN NaN NaN Chaetodipus sp. Rodent
34780 30986 7 1 2000 7 PX NaN NaN NaN Chaetodipus sp. Rodent
34781 28988 12 23 1998 6 CT NaN NaN NaN Cnemidophorus tigris Reptile
34782 35512 12 31 2002 11 US NaN NaN NaN Sparrow sp. Bird
34783 35513 12 31 2002 11 US NaN NaN NaN Sparrow sp. Bird
34784 35528 12 31 2002 13 US NaN NaN NaN Sparrow sp. Bird
34785 35544 12 31 2002 15 US NaN NaN NaN Sparrow sp. Bird

34786 rows × 12 columns


In [15]:
merged_inner.columns


Out[15]:
Index(['record_id', 'month', 'day', 'year', 'plot_id', 'species_id', 'sex',
       'hindfoot_length', 'weight', 'genus', 'species', 'taxa'],
      dtype='object')

In [28]:
merged_counts = merged_inner.groupby('sex')['taxa'].count()
%matplotlib inline
merged_counts.plot(kind='bar')


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x119691ba8>

In [37]:
merged_counts = merged_inner.groupby('taxa').count()
merged_counts.plot(kind='bar')


Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x115fd03c8>

In [10]:



---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-10-cbea059e1783> in <module>()
      4 # import a small subset of the species data designed for this part of the lesson.
      5 # It is stored in the data folder.
----> 6 species_sub = pd.read_csv('data/speciesSubset.csv', keep_default_na=False, na_values=[""])

/Users/apple/anaconda/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    653                     skip_blank_lines=skip_blank_lines)
    654 
--> 655         return _read(filepath_or_buffer, kwds)
    656 
    657     parser_f.__name__ = name

/Users/apple/anaconda/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    403 
    404     # Create the parser.
--> 405     parser = TextFileReader(filepath_or_buffer, **kwds)
    406 
    407     if chunksize or iterator:

/Users/apple/anaconda/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    762             self.options['has_index_names'] = kwds['has_index_names']
    763 
--> 764         self._make_engine(self.engine)
    765 
    766     def close(self):

/Users/apple/anaconda/lib/python3.6/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
    983     def _make_engine(self, engine='c'):
    984         if engine == 'c':
--> 985             self._engine = CParserWrapper(self.f, **self.options)
    986         else:
    987             if engine == 'python':

/Users/apple/anaconda/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1603         kwds['allow_leading_cols'] = self.index_col is not False
   1604 
-> 1605         self._reader = parsers.TextReader(src, **kwds)
   1606 
   1607         # XXX

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__ (pandas/_libs/parsers.c:4209)()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source (pandas/_libs/parsers.c:8873)()

FileNotFoundError: File b'data/speciesSubset.csv' does not exist

In [ ]: