In [1]:
import pandas as pd
surveys_df = pd.read_csv("data/surveys.csv",
keep_default_na=False, na_values=[""])
print(surveys_df)
species_df = pd.read_csv("data/species.csv",
keep_default_na=False, na_values=[""])
print(species_df)
record_id month day year plot_id species_id sex hindfoot_length \
0 1 7 16 1977 2 NL M 32.0
1 2 7 16 1977 3 NL M 33.0
2 3 7 16 1977 2 DM F 37.0
3 4 7 16 1977 7 DM M 36.0
4 5 7 16 1977 3 DM M 35.0
5 6 7 16 1977 1 PF M 14.0
6 7 7 16 1977 2 PE F NaN
7 8 7 16 1977 1 DM M 37.0
8 9 7 16 1977 1 DM F 34.0
9 10 7 16 1977 6 PF F 20.0
10 11 7 16 1977 5 DS F 53.0
11 12 7 16 1977 7 DM M 38.0
12 13 7 16 1977 3 DM M 35.0
13 14 7 16 1977 8 DM NaN NaN
14 15 7 16 1977 6 DM F 36.0
15 16 7 16 1977 4 DM F 36.0
16 17 7 16 1977 3 DS F 48.0
17 18 7 16 1977 2 PP M 22.0
18 19 7 16 1977 4 PF NaN NaN
19 20 7 17 1977 11 DS F 48.0
20 21 7 17 1977 14 DM F 34.0
21 22 7 17 1977 15 NL F 31.0
22 23 7 17 1977 13 DM M 36.0
23 24 7 17 1977 13 SH M 21.0
24 25 7 17 1977 9 DM M 35.0
25 26 7 17 1977 15 DM M 31.0
26 27 7 17 1977 15 DM M 36.0
27 28 7 17 1977 11 DM M 38.0
28 29 7 17 1977 11 PP M NaN
29 30 7 17 1977 10 DS F 52.0
... ... ... ... ... ... ... ... ...
35519 35520 12 31 2002 9 SF NaN 24.0
35520 35521 12 31 2002 9 DM M 37.0
35521 35522 12 31 2002 9 DM F 35.0
35522 35523 12 31 2002 9 DM F 36.0
35523 35524 12 31 2002 9 PB F 25.0
35524 35525 12 31 2002 9 OL M 21.0
35525 35526 12 31 2002 8 OT F 20.0
35526 35527 12 31 2002 13 DO F 33.0
35527 35528 12 31 2002 13 US NaN NaN
35528 35529 12 31 2002 13 PB F 25.0
35529 35530 12 31 2002 13 OT F 20.0
35530 35531 12 31 2002 13 PB F 27.0
35531 35532 12 31 2002 14 DM F 34.0
35532 35533 12 31 2002 14 DM F 36.0
35533 35534 12 31 2002 14 DM M 37.0
35534 35535 12 31 2002 14 DM M 37.0
35535 35536 12 31 2002 14 DM F 35.0
35536 35537 12 31 2002 14 DM F 36.0
35537 35538 12 31 2002 15 PB F 26.0
35538 35539 12 31 2002 15 SF M 26.0
35539 35540 12 31 2002 15 PB F 26.0
35540 35541 12 31 2002 15 PB F 24.0
35541 35542 12 31 2002 15 PB F 26.0
35542 35543 12 31 2002 15 PB F 27.0
35543 35544 12 31 2002 15 US NaN NaN
35544 35545 12 31 2002 15 AH NaN NaN
35545 35546 12 31 2002 15 AH NaN NaN
35546 35547 12 31 2002 10 RM F 15.0
35547 35548 12 31 2002 7 DO M 36.0
35548 35549 12 31 2002 5 NaN NaN NaN
weight
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
5 NaN
6 NaN
7 NaN
8 NaN
9 NaN
10 NaN
11 NaN
12 NaN
13 NaN
14 NaN
15 NaN
16 NaN
17 NaN
18 NaN
19 NaN
20 NaN
21 NaN
22 NaN
23 NaN
24 NaN
25 NaN
26 NaN
27 NaN
28 NaN
29 NaN
... ...
35519 36.0
35520 48.0
35521 45.0
35522 44.0
35523 27.0
35524 26.0
35525 24.0
35526 43.0
35527 NaN
35528 25.0
35529 NaN
35530 NaN
35531 43.0
35532 48.0
35533 56.0
35534 53.0
35535 42.0
35536 46.0
35537 31.0
35538 68.0
35539 23.0
35540 31.0
35541 29.0
35542 34.0
35543 NaN
35544 NaN
35545 NaN
35546 14.0
35547 51.0
35548 NaN
[35549 rows x 9 columns]
species_id genus species taxa
0 AB Amphispiza bilineata Bird
1 AH Ammospermophilus harrisi Rodent
2 AS Ammodramus savannarum Bird
3 BA Baiomys taylori Rodent
4 CB Campylorhynchus brunneicapillus Bird
5 CM Calamospiza melanocorys Bird
6 CQ Callipepla squamata Bird
7 CS Crotalus scutalatus Reptile
8 CT Cnemidophorus tigris Reptile
9 CU Cnemidophorus uniparens Reptile
10 CV Crotalus viridis Reptile
11 DM Dipodomys merriami Rodent
12 DO Dipodomys ordii Rodent
13 DS Dipodomys spectabilis Rodent
14 DX Dipodomys sp. Rodent
15 EO Eumeces obsoletus Reptile
16 GS Gambelia silus Reptile
17 NL Neotoma albigula Rodent
18 NX Neotoma sp. Rodent
19 OL Onychomys leucogaster Rodent
20 OT Onychomys torridus Rodent
21 OX Onychomys sp. Rodent
22 PB Chaetodipus baileyi Rodent
23 PC Pipilo chlorurus Bird
24 PE Peromyscus eremicus Rodent
25 PF Perognathus flavus Rodent
26 PG Pooecetes gramineus Bird
27 PH Perognathus hispidus Rodent
28 PI Chaetodipus intermedius Rodent
29 PL Peromyscus leucopus Rodent
30 PM Peromyscus maniculatus Rodent
31 PP Chaetodipus penicillatus Rodent
32 PU Pipilo fuscus Bird
33 PX Chaetodipus sp. Rodent
34 RF Reithrodontomys fulvescens Rodent
35 RM Reithrodontomys megalotis Rodent
36 RO Reithrodontomys montanus Rodent
37 RX Reithrodontomys sp. Rodent
38 SA Sylvilagus audubonii Rabbit
39 SB Spizella breweri Bird
40 SC Sceloporus clarki Reptile
41 SF Sigmodon fulviventer Rodent
42 SH Sigmodon hispidus Rodent
43 SO Sigmodon ochrognathus Rodent
44 SS Spermophilus spilosoma Rodent
45 ST Spermophilus tereticaudus Rodent
46 SU Sceloporus undulatus Reptile
47 SX Sigmodon sp. Rodent
48 UL Lizard sp. Reptile
49 UP Pipilo sp. Bird
50 UR Rodent sp. Rodent
51 US Sparrow sp. Bird
52 ZL Zonotrichia leucophrys Bird
53 ZM Zenaida macroura Bird
In [2]:
survey_sub = surveys_df.head(10)
survey_sub_last10 = surveys_df.tail(10)
survey_sub_last10 = survey_sub_last10.reset_index(drop=True)
survey_sub_last10
Out[2]:
record_id
month
day
year
plot_id
species_id
sex
hindfoot_length
weight
0
35540
12
31
2002
15
PB
F
26.0
23.0
1
35541
12
31
2002
15
PB
F
24.0
31.0
2
35542
12
31
2002
15
PB
F
26.0
29.0
3
35543
12
31
2002
15
PB
F
27.0
34.0
4
35544
12
31
2002
15
US
NaN
NaN
NaN
5
35545
12
31
2002
15
AH
NaN
NaN
NaN
6
35546
12
31
2002
15
AH
NaN
NaN
NaN
7
35547
12
31
2002
10
RM
F
15.0
14.0
8
35548
12
31
2002
7
DO
M
36.0
51.0
9
35549
12
31
2002
5
NaN
NaN
NaN
NaN
In [3]:
vertical_stack = pd.concat([survey_sub, survey_sub_last10], axis=0)
vertical_stack
Out[3]:
record_id
month
day
year
plot_id
species_id
sex
hindfoot_length
weight
0
1
7
16
1977
2
NL
M
32.0
NaN
1
2
7
16
1977
3
NL
M
33.0
NaN
2
3
7
16
1977
2
DM
F
37.0
NaN
3
4
7
16
1977
7
DM
M
36.0
NaN
4
5
7
16
1977
3
DM
M
35.0
NaN
5
6
7
16
1977
1
PF
M
14.0
NaN
6
7
7
16
1977
2
PE
F
NaN
NaN
7
8
7
16
1977
1
DM
M
37.0
NaN
8
9
7
16
1977
1
DM
F
34.0
NaN
9
10
7
16
1977
6
PF
F
20.0
NaN
0
35540
12
31
2002
15
PB
F
26.0
23.0
1
35541
12
31
2002
15
PB
F
24.0
31.0
2
35542
12
31
2002
15
PB
F
26.0
29.0
3
35543
12
31
2002
15
PB
F
27.0
34.0
4
35544
12
31
2002
15
US
NaN
NaN
NaN
5
35545
12
31
2002
15
AH
NaN
NaN
NaN
6
35546
12
31
2002
15
AH
NaN
NaN
NaN
7
35547
12
31
2002
10
RM
F
15.0
14.0
8
35548
12
31
2002
7
DO
M
36.0
51.0
9
35549
12
31
2002
5
NaN
NaN
NaN
NaN
In [4]:
horizontal_stack = pd.concat([survey_sub, survey_sub_last10], axis=1)
horizontal_stack
Out[4]:
record_id
month
day
year
plot_id
species_id
sex
hindfoot_length
weight
record_id
month
day
year
plot_id
species_id
sex
hindfoot_length
weight
0
1
7
16
1977
2
NL
M
32.0
NaN
35540
12
31
2002
15
PB
F
26.0
23.0
1
2
7
16
1977
3
NL
M
33.0
NaN
35541
12
31
2002
15
PB
F
24.0
31.0
2
3
7
16
1977
2
DM
F
37.0
NaN
35542
12
31
2002
15
PB
F
26.0
29.0
3
4
7
16
1977
7
DM
M
36.0
NaN
35543
12
31
2002
15
PB
F
27.0
34.0
4
5
7
16
1977
3
DM
M
35.0
NaN
35544
12
31
2002
15
US
NaN
NaN
NaN
5
6
7
16
1977
1
PF
M
14.0
NaN
35545
12
31
2002
15
AH
NaN
NaN
NaN
6
7
7
16
1977
2
PE
F
NaN
NaN
35546
12
31
2002
15
AH
NaN
NaN
NaN
7
8
7
16
1977
1
DM
M
37.0
NaN
35547
12
31
2002
10
RM
F
15.0
14.0
8
9
7
16
1977
1
DM
F
34.0
NaN
35548
12
31
2002
7
DO
M
36.0
51.0
9
10
7
16
1977
6
PF
F
20.0
NaN
35549
12
31
2002
5
NaN
NaN
NaN
NaN
In [5]:
vertical_stack.to_csv('data/out.csv', index=False)
new_output = pd.read_csv('data/out.csv', keep_default_na=False, na_values=[""])
new_output
Out[5]:
record_id
month
day
year
plot_id
species_id
sex
hindfoot_length
weight
0
1
7
16
1977
2
NL
M
32.0
NaN
1
2
7
16
1977
3
NL
M
33.0
NaN
2
3
7
16
1977
2
DM
F
37.0
NaN
3
4
7
16
1977
7
DM
M
36.0
NaN
4
5
7
16
1977
3
DM
M
35.0
NaN
5
6
7
16
1977
1
PF
M
14.0
NaN
6
7
7
16
1977
2
PE
F
NaN
NaN
7
8
7
16
1977
1
DM
M
37.0
NaN
8
9
7
16
1977
1
DM
F
34.0
NaN
9
10
7
16
1977
6
PF
F
20.0
NaN
10
35540
12
31
2002
15
PB
F
26.0
23.0
11
35541
12
31
2002
15
PB
F
24.0
31.0
12
35542
12
31
2002
15
PB
F
26.0
29.0
13
35543
12
31
2002
15
PB
F
27.0
34.0
14
35544
12
31
2002
15
US
NaN
NaN
NaN
15
35545
12
31
2002
15
AH
NaN
NaN
NaN
16
35546
12
31
2002
15
AH
NaN
NaN
NaN
17
35547
12
31
2002
10
RM
F
15.0
14.0
18
35548
12
31
2002
7
DO
M
36.0
51.0
19
35549
12
31
2002
5
NaN
NaN
NaN
NaN
In [6]:
survey_sub = surveys_df.head(10)
species_sub = pd.read_csv('data/speciesSubset.csv', keep_default_na=False, na_values=[""])
species_sub
Out[6]:
species_id
genus
species
taxa
0
DM
Dipodomys
merriami
Rodent
1
NL
Neotoma
albigula
Rodent
2
PE
Peromyscus
eremicus
Rodent
In [7]:
print(species_sub.columns)
print(survey_sub.columns)
Index(['species_id', 'genus', 'species', 'taxa'], dtype='object')
Index(['record_id', 'month', 'day', 'year', 'plot_id', 'species_id', 'sex',
'hindfoot_length', 'weight'],
dtype='object')
In [8]:
merged_inner = pd.merge(left=survey_sub, right=species_sub, left_on='species_id', right_on='species_id')
print(merged_inner.shape)
merged_inner
(8, 12)
Out[8]:
record_id
month
day
year
plot_id
species_id
sex
hindfoot_length
weight
genus
species
taxa
0
1
7
16
1977
2
NL
M
32.0
NaN
Neotoma
albigula
Rodent
1
2
7
16
1977
3
NL
M
33.0
NaN
Neotoma
albigula
Rodent
2
3
7
16
1977
2
DM
F
37.0
NaN
Dipodomys
merriami
Rodent
3
4
7
16
1977
7
DM
M
36.0
NaN
Dipodomys
merriami
Rodent
4
5
7
16
1977
3
DM
M
35.0
NaN
Dipodomys
merriami
Rodent
5
8
7
16
1977
1
DM
M
37.0
NaN
Dipodomys
merriami
Rodent
6
9
7
16
1977
1
DM
F
34.0
NaN
Dipodomys
merriami
Rodent
7
7
7
16
1977
2
PE
F
NaN
NaN
Peromyscus
eremicus
Rodent
In [9]:
merged_left = pd.merge(left=survey_sub, right=species_sub, how='left', left_on='species_id', right_on='species_id')
merged_left
Out[9]:
record_id
month
day
year
plot_id
species_id
sex
hindfoot_length
weight
genus
species
taxa
0
1
7
16
1977
2
NL
M
32.0
NaN
Neotoma
albigula
Rodent
1
2
7
16
1977
3
NL
M
33.0
NaN
Neotoma
albigula
Rodent
2
3
7
16
1977
2
DM
F
37.0
NaN
Dipodomys
merriami
Rodent
3
4
7
16
1977
7
DM
M
36.0
NaN
Dipodomys
merriami
Rodent
4
5
7
16
1977
3
DM
M
35.0
NaN
Dipodomys
merriami
Rodent
5
6
7
16
1977
1
PF
M
14.0
NaN
NaN
NaN
NaN
6
7
7
16
1977
2
PE
F
NaN
NaN
Peromyscus
eremicus
Rodent
7
8
7
16
1977
1
DM
M
37.0
NaN
Dipodomys
merriami
Rodent
8
9
7
16
1977
1
DM
F
34.0
NaN
Dipodomys
merriami
Rodent
9
10
7
16
1977
6
PF
F
20.0
NaN
NaN
NaN
NaN
In [10]:
surveys_df = pd.read_csv("data/surveys.csv", keep_default_na=False, na_values=[""])
print(sum(surveys_df.groupby('species_id')['record_id'].nunique()))
surveys_df.head()
34786
Out[10]:
record_id
month
day
year
plot_id
species_id
sex
hindfoot_length
weight
0
1
7
16
1977
2
NL
M
32.0
NaN
1
2
7
16
1977
3
NL
M
33.0
NaN
2
3
7
16
1977
2
DM
F
37.0
NaN
3
4
7
16
1977
7
DM
M
36.0
NaN
4
5
7
16
1977
3
DM
M
35.0
NaN
In [11]:
species_df = pd.read_csv("data/species.csv", keep_default_na=False, na_values=[""])
species_df.head()
Out[11]:
species_id
genus
species
taxa
0
AB
Amphispiza
bilineata
Bird
1
AH
Ammospermophilus
harrisi
Rodent
2
AS
Ammodramus
savannarum
Bird
3
BA
Baiomys
taylori
Rodent
4
CB
Campylorhynchus
brunneicapillus
Bird
In [12]:
merged = pd.merge(left=surveys_df, right=species_df, left_on='species_id', right_on='species_id')
print(merged.shape)
merged.head()
(34786, 12)
Out[12]:
record_id
month
day
year
plot_id
species_id
sex
hindfoot_length
weight
genus
species
taxa
0
1
7
16
1977
2
NL
M
32.0
NaN
Neotoma
albigula
Rodent
1
2
7
16
1977
3
NL
M
33.0
NaN
Neotoma
albigula
Rodent
2
22
7
17
1977
15
NL
F
31.0
NaN
Neotoma
albigula
Rodent
3
38
7
17
1977
17
NL
M
33.0
NaN
Neotoma
albigula
Rodent
4
72
8
19
1977
2
NL
M
31.0
NaN
Neotoma
albigula
Rodent
In [13]:
# https://stackoverflow.com/questions/39132742/groupby-value-counts-on-the-dataframe-pandas
taxa_counts_by_plot = merged.groupby(['plot_id', 'taxa']).size().unstack(fill_value=0)
taxa_counts_by_plot.head()
Out[13]:
taxa
Bird
Rabbit
Reptile
Rodent
plot_id
1
11
2
0
1976
2
17
3
1
2170
3
17
5
0
1786
4
5
0
0
1955
5
5
1
0
1144
In [14]:
%matplotlib inline
taxa_counts_by_plot.plot(kind='bar', stacked=True)
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x110577550>
In [15]:
taxa_counts_by_plot_by_sex = merged.groupby(['plot_id', 'sex', 'taxa']).size().unstack(fill_value=0)
taxa_counts_by_plot_by_sex.plot(kind='bar', stacked=True)
# Observation: All taxa with a recoreded sex are rodents, so the following plot is more illustrative.
taxa_counts_by_plot_by_sex = merged.groupby(['plot_id', 'sex']).size().unstack(fill_value=0)
taxa_counts_by_plot_by_sex.plot(kind='bar', stacked=True)
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x110adb0f0>
Content source: GT-IDEaS/SkillsWorkshop2017
Similar notebooks: