In [11]:
import pandas as pd
surveys_df = pd.read_csv("surveys.csv",
keep_default_na=False, na_values=[""])
surveys_df
Out[11]:
record_id
month
day
year
plot_id
species_id
sex
hindfoot_length
weight
0
1
7
16
1977
2
NL
M
32.0
NaN
1
2
7
16
1977
3
NL
M
33.0
NaN
2
3
7
16
1977
2
DM
F
37.0
NaN
3
4
7
16
1977
7
DM
M
36.0
NaN
4
5
7
16
1977
3
DM
M
35.0
NaN
5
6
7
16
1977
1
PF
M
14.0
NaN
6
7
7
16
1977
2
PE
F
NaN
NaN
7
8
7
16
1977
1
DM
M
37.0
NaN
8
9
7
16
1977
1
DM
F
34.0
NaN
9
10
7
16
1977
6
PF
F
20.0
NaN
10
11
7
16
1977
5
DS
F
53.0
NaN
11
12
7
16
1977
7
DM
M
38.0
NaN
12
13
7
16
1977
3
DM
M
35.0
NaN
13
14
7
16
1977
8
DM
NaN
NaN
NaN
14
15
7
16
1977
6
DM
F
36.0
NaN
15
16
7
16
1977
4
DM
F
36.0
NaN
16
17
7
16
1977
3
DS
F
48.0
NaN
17
18
7
16
1977
2
PP
M
22.0
NaN
18
19
7
16
1977
4
PF
NaN
NaN
NaN
19
20
7
17
1977
11
DS
F
48.0
NaN
20
21
7
17
1977
14
DM
F
34.0
NaN
21
22
7
17
1977
15
NL
F
31.0
NaN
22
23
7
17
1977
13
DM
M
36.0
NaN
23
24
7
17
1977
13
SH
M
21.0
NaN
24
25
7
17
1977
9
DM
M
35.0
NaN
25
26
7
17
1977
15
DM
M
31.0
NaN
26
27
7
17
1977
15
DM
M
36.0
NaN
27
28
7
17
1977
11
DM
M
38.0
NaN
28
29
7
17
1977
11
PP
M
NaN
NaN
29
30
7
17
1977
10
DS
F
52.0
NaN
...
...
...
...
...
...
...
...
...
...
35519
35520
12
31
2002
9
SF
NaN
24.0
36.0
35520
35521
12
31
2002
9
DM
M
37.0
48.0
35521
35522
12
31
2002
9
DM
F
35.0
45.0
35522
35523
12
31
2002
9
DM
F
36.0
44.0
35523
35524
12
31
2002
9
PB
F
25.0
27.0
35524
35525
12
31
2002
9
OL
M
21.0
26.0
35525
35526
12
31
2002
8
OT
F
20.0
24.0
35526
35527
12
31
2002
13
DO
F
33.0
43.0
35527
35528
12
31
2002
13
US
NaN
NaN
NaN
35528
35529
12
31
2002
13
PB
F
25.0
25.0
35529
35530
12
31
2002
13
OT
F
20.0
NaN
35530
35531
12
31
2002
13
PB
F
27.0
NaN
35531
35532
12
31
2002
14
DM
F
34.0
43.0
35532
35533
12
31
2002
14
DM
F
36.0
48.0
35533
35534
12
31
2002
14
DM
M
37.0
56.0
35534
35535
12
31
2002
14
DM
M
37.0
53.0
35535
35536
12
31
2002
14
DM
F
35.0
42.0
35536
35537
12
31
2002
14
DM
F
36.0
46.0
35537
35538
12
31
2002
15
PB
F
26.0
31.0
35538
35539
12
31
2002
15
SF
M
26.0
68.0
35539
35540
12
31
2002
15
PB
F
26.0
23.0
35540
35541
12
31
2002
15
PB
F
24.0
31.0
35541
35542
12
31
2002
15
PB
F
26.0
29.0
35542
35543
12
31
2002
15
PB
F
27.0
34.0
35543
35544
12
31
2002
15
US
NaN
NaN
NaN
35544
35545
12
31
2002
15
AH
NaN
NaN
NaN
35545
35546
12
31
2002
15
AH
NaN
NaN
NaN
35546
35547
12
31
2002
10
RM
F
15.0
14.0
35547
35548
12
31
2002
7
DO
M
36.0
51.0
35548
35549
12
31
2002
5
NaN
NaN
NaN
NaN
35549 rows × 9 columns
In [4]:
species_df = pd.read_csv("species.csv",
keep_default_na=False, na_values=[""])
species_df
Out[4]:
species_id
genus
species
taxa
0
AB
Amphispiza
bilineata
Bird
1
AH
Ammospermophilus
harrisi
Rodent
2
AS
Ammodramus
savannarum
Bird
3
BA
Baiomys
taylori
Rodent
4
CB
Campylorhynchus
brunneicapillus
Bird
5
CM
Calamospiza
melanocorys
Bird
6
CQ
Callipepla
squamata
Bird
7
CS
Crotalus
scutalatus
Reptile
8
CT
Cnemidophorus
tigris
Reptile
9
CU
Cnemidophorus
uniparens
Reptile
10
CV
Crotalus
viridis
Reptile
11
DM
Dipodomys
merriami
Rodent
12
DO
Dipodomys
ordii
Rodent
13
DS
Dipodomys
spectabilis
Rodent
14
DX
Dipodomys
sp.
Rodent
15
EO
Eumeces
obsoletus
Reptile
16
GS
Gambelia
silus
Reptile
17
NL
Neotoma
albigula
Rodent
18
NX
Neotoma
sp.
Rodent
19
OL
Onychomys
leucogaster
Rodent
20
OT
Onychomys
torridus
Rodent
21
OX
Onychomys
sp.
Rodent
22
PB
Chaetodipus
baileyi
Rodent
23
PC
Pipilo
chlorurus
Bird
24
PE
Peromyscus
eremicus
Rodent
25
PF
Perognathus
flavus
Rodent
26
PG
Pooecetes
gramineus
Bird
27
PH
Perognathus
hispidus
Rodent
28
PI
Chaetodipus
intermedius
Rodent
29
PL
Peromyscus
leucopus
Rodent
30
PM
Peromyscus
maniculatus
Rodent
31
PP
Chaetodipus
penicillatus
Rodent
32
PU
Pipilo
fuscus
Bird
33
PX
Chaetodipus
sp.
Rodent
34
RF
Reithrodontomys
fulvescens
Rodent
35
RM
Reithrodontomys
megalotis
Rodent
36
RO
Reithrodontomys
montanus
Rodent
37
RX
Reithrodontomys
sp.
Rodent
38
SA
Sylvilagus
audubonii
Rabbit
39
SB
Spizella
breweri
Bird
40
SC
Sceloporus
clarki
Reptile
41
SF
Sigmodon
fulviventer
Rodent
42
SH
Sigmodon
hispidus
Rodent
43
SO
Sigmodon
ochrognathus
Rodent
44
SS
Spermophilus
spilosoma
Rodent
45
ST
Spermophilus
tereticaudus
Rodent
46
SU
Sceloporus
undulatus
Reptile
47
SX
Sigmodon
sp.
Rodent
48
UL
Lizard
sp.
Reptile
49
UP
Pipilo
sp.
Bird
50
UR
Rodent
sp.
Rodent
51
US
Sparrow
sp.
Bird
52
ZL
Zonotrichia
leucophrys
Bird
53
ZM
Zenaida
macroura
Bird
In [14]:
merged_inner = pd.merge(left=surveys_df,right=species_df, left_on='species_id', right_on='species_id')
# in this case `species_id` is the only column name in both dataframes, so if we skippd `left_on`
# and `right_on` arguments we would still get the same result
# what's the size of the output data?
merged_inner.shape
merged_inner
Out[14]:
record_id
month
day
year
plot_id
species_id
sex
hindfoot_length
weight
genus
species
taxa
0
1
7
16
1977
2
NL
M
32.0
NaN
Neotoma
albigula
Rodent
1
2
7
16
1977
3
NL
M
33.0
NaN
Neotoma
albigula
Rodent
2
22
7
17
1977
15
NL
F
31.0
NaN
Neotoma
albigula
Rodent
3
38
7
17
1977
17
NL
M
33.0
NaN
Neotoma
albigula
Rodent
4
72
8
19
1977
2
NL
M
31.0
NaN
Neotoma
albigula
Rodent
5
106
8
20
1977
12
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
6
107
8
20
1977
18
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
7
121
8
21
1977
15
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
8
171
9
11
1977
12
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
9
194
9
12
1977
11
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
10
214
9
12
1977
17
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
11
224
9
13
1977
2
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
12
266
10
16
1977
2
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
13
272
10
16
1977
18
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
14
281
10
16
1977
20
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
15
282
10
16
1977
22
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
16
297
10
17
1977
11
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
17
314
10
17
1977
14
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
18
344
10
18
1977
3
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
19
349
11
12
1977
2
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
20
353
11
12
1977
18
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
21
363
11
12
1977
2
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
22
390
11
13
1977
11
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
23
393
11
13
1977
14
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
24
406
11
14
1977
12
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
25
435
12
10
1977
2
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
26
465
12
11
1977
5
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
27
469
12
11
1977
12
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
28
475
12
11
1977
11
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
29
486
12
11
1977
14
NL
NaN
NaN
NaN
Neotoma
albigula
Rodent
...
...
...
...
...
...
...
...
...
...
...
...
...
34756
26557
7
29
1997
7
PL
F
20.0
22.0
Peromyscus
leucopus
Rodent
34757
26701
7
30
1997
3
PL
M
21.0
19.0
Peromyscus
leucopus
Rodent
34758
26787
9
27
1997
7
PL
F
21.0
16.0
Peromyscus
leucopus
Rodent
34759
26889
9
28
1997
16
PL
F
21.0
10.0
Peromyscus
leucopus
Rodent
34760
26921
9
28
1997
5
PL
M
22.0
20.0
Peromyscus
leucopus
Rodent
34761
26931
9
28
1997
10
PL
F
10.0
19.0
Peromyscus
leucopus
Rodent
34762
26934
9
28
1997
10
PL
M
21.0
22.0
Peromyscus
leucopus
Rodent
34763
26966
10
25
1997
7
PL
M
20.0
16.0
Peromyscus
leucopus
Rodent
34764
27185
11
22
1997
7
PL
F
21.0
22.0
Peromyscus
leucopus
Rodent
34765
27386
12
29
1997
3
PL
M
20.0
17.0
Peromyscus
leucopus
Rodent
34766
27393
12
29
1997
15
PL
M
19.0
16.0
Peromyscus
leucopus
Rodent
34767
27481
2
1
1998
15
PL
M
19.0
18.0
Peromyscus
leucopus
Rodent
34768
27538
2
1
1998
10
PL
F
20.0
20.0
Peromyscus
leucopus
Rodent
34769
27792
5
2
1998
7
PL
F
20.0
8.0
Peromyscus
leucopus
Rodent
34770
27801
5
3
1998
15
PL
M
21.0
22.0
Peromyscus
leucopus
Rodent
34771
27898
5
28
1998
23
PL
M
20.0
20.0
Peromyscus
leucopus
Rodent
34772
27957
5
29
1998
9
PL
M
21.0
16.0
Peromyscus
leucopus
Rodent
34773
27975
5
29
1998
10
PL
M
20.0
13.0
Peromyscus
leucopus
Rodent
34774
30499
3
4
2000
24
PL
M
20.0
21.0
Peromyscus
leucopus
Rodent
34775
23435
2
25
1996
13
PX
NaN
NaN
NaN
Chaetodipus
sp.
Rodent
34776
24669
10
13
1996
3
PX
NaN
NaN
NaN
Chaetodipus
sp.
Rodent
34777
24991
2
8
1997
23
PX
F
19.0
20.0
Chaetodipus
sp.
Rodent
34778
25137
2
9
1997
13
PX
M
20.0
18.0
Chaetodipus
sp.
Rodent
34779
28806
11
21
1998
7
PX
NaN
NaN
NaN
Chaetodipus
sp.
Rodent
34780
30986
7
1
2000
7
PX
NaN
NaN
NaN
Chaetodipus
sp.
Rodent
34781
28988
12
23
1998
6
CT
NaN
NaN
NaN
Cnemidophorus
tigris
Reptile
34782
35512
12
31
2002
11
US
NaN
NaN
NaN
Sparrow
sp.
Bird
34783
35513
12
31
2002
11
US
NaN
NaN
NaN
Sparrow
sp.
Bird
34784
35528
12
31
2002
13
US
NaN
NaN
NaN
Sparrow
sp.
Bird
34785
35544
12
31
2002
15
US
NaN
NaN
NaN
Sparrow
sp.
Bird
34786 rows × 12 columns
In [15]:
merged_inner.columns
Out[15]:
Index(['record_id', 'month', 'day', 'year', 'plot_id', 'species_id', 'sex',
'hindfoot_length', 'weight', 'genus', 'species', 'taxa'],
dtype='object')
In [28]:
merged_counts = merged_inner.groupby('sex')['taxa'].count()
%matplotlib inline
merged_counts.plot(kind='bar')
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x119691ba8>
In [37]:
merged_counts = merged_inner.groupby('taxa').count()
merged_counts.plot(kind='bar')
Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x115fd03c8>
In [10]:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-10-cbea059e1783> in <module>()
4 # import a small subset of the species data designed for this part of the lesson.
5 # It is stored in the data folder.
----> 6 species_sub = pd.read_csv('data/speciesSubset.csv', keep_default_na=False, na_values=[""])
/Users/apple/anaconda/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
653 skip_blank_lines=skip_blank_lines)
654
--> 655 return _read(filepath_or_buffer, kwds)
656
657 parser_f.__name__ = name
/Users/apple/anaconda/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
403
404 # Create the parser.
--> 405 parser = TextFileReader(filepath_or_buffer, **kwds)
406
407 if chunksize or iterator:
/Users/apple/anaconda/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
762 self.options['has_index_names'] = kwds['has_index_names']
763
--> 764 self._make_engine(self.engine)
765
766 def close(self):
/Users/apple/anaconda/lib/python3.6/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
983 def _make_engine(self, engine='c'):
984 if engine == 'c':
--> 985 self._engine = CParserWrapper(self.f, **self.options)
986 else:
987 if engine == 'python':
/Users/apple/anaconda/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
1603 kwds['allow_leading_cols'] = self.index_col is not False
1604
-> 1605 self._reader = parsers.TextReader(src, **kwds)
1606
1607 # XXX
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__ (pandas/_libs/parsers.c:4209)()
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source (pandas/_libs/parsers.c:8873)()
FileNotFoundError: File b'data/speciesSubset.csv' does not exist
In [ ]:
Content source: GT-IDEaS/SkillsWorkshop2017
Similar notebooks: