In [1]:
import pandas as pd
surveys_df = pd.read_csv("data/surveys.csv",
                         keep_default_na=False, na_values=[""])
print(surveys_df)

species_df = pd.read_csv("data/species.csv",
                         keep_default_na=False, na_values=[""])
print(species_df)


       record_id  month  day  year  plot_id species_id  sex  hindfoot_length  \
0              1      7   16  1977        2         NL    M             32.0   
1              2      7   16  1977        3         NL    M             33.0   
2              3      7   16  1977        2         DM    F             37.0   
3              4      7   16  1977        7         DM    M             36.0   
4              5      7   16  1977        3         DM    M             35.0   
5              6      7   16  1977        1         PF    M             14.0   
6              7      7   16  1977        2         PE    F              NaN   
7              8      7   16  1977        1         DM    M             37.0   
8              9      7   16  1977        1         DM    F             34.0   
9             10      7   16  1977        6         PF    F             20.0   
10            11      7   16  1977        5         DS    F             53.0   
11            12      7   16  1977        7         DM    M             38.0   
12            13      7   16  1977        3         DM    M             35.0   
13            14      7   16  1977        8         DM  NaN              NaN   
14            15      7   16  1977        6         DM    F             36.0   
15            16      7   16  1977        4         DM    F             36.0   
16            17      7   16  1977        3         DS    F             48.0   
17            18      7   16  1977        2         PP    M             22.0   
18            19      7   16  1977        4         PF  NaN              NaN   
19            20      7   17  1977       11         DS    F             48.0   
20            21      7   17  1977       14         DM    F             34.0   
21            22      7   17  1977       15         NL    F             31.0   
22            23      7   17  1977       13         DM    M             36.0   
23            24      7   17  1977       13         SH    M             21.0   
24            25      7   17  1977        9         DM    M             35.0   
25            26      7   17  1977       15         DM    M             31.0   
26            27      7   17  1977       15         DM    M             36.0   
27            28      7   17  1977       11         DM    M             38.0   
28            29      7   17  1977       11         PP    M              NaN   
29            30      7   17  1977       10         DS    F             52.0   
...          ...    ...  ...   ...      ...        ...  ...              ...   
35519      35520     12   31  2002        9         SF  NaN             24.0   
35520      35521     12   31  2002        9         DM    M             37.0   
35521      35522     12   31  2002        9         DM    F             35.0   
35522      35523     12   31  2002        9         DM    F             36.0   
35523      35524     12   31  2002        9         PB    F             25.0   
35524      35525     12   31  2002        9         OL    M             21.0   
35525      35526     12   31  2002        8         OT    F             20.0   
35526      35527     12   31  2002       13         DO    F             33.0   
35527      35528     12   31  2002       13         US  NaN              NaN   
35528      35529     12   31  2002       13         PB    F             25.0   
35529      35530     12   31  2002       13         OT    F             20.0   
35530      35531     12   31  2002       13         PB    F             27.0   
35531      35532     12   31  2002       14         DM    F             34.0   
35532      35533     12   31  2002       14         DM    F             36.0   
35533      35534     12   31  2002       14         DM    M             37.0   
35534      35535     12   31  2002       14         DM    M             37.0   
35535      35536     12   31  2002       14         DM    F             35.0   
35536      35537     12   31  2002       14         DM    F             36.0   
35537      35538     12   31  2002       15         PB    F             26.0   
35538      35539     12   31  2002       15         SF    M             26.0   
35539      35540     12   31  2002       15         PB    F             26.0   
35540      35541     12   31  2002       15         PB    F             24.0   
35541      35542     12   31  2002       15         PB    F             26.0   
35542      35543     12   31  2002       15         PB    F             27.0   
35543      35544     12   31  2002       15         US  NaN              NaN   
35544      35545     12   31  2002       15         AH  NaN              NaN   
35545      35546     12   31  2002       15         AH  NaN              NaN   
35546      35547     12   31  2002       10         RM    F             15.0   
35547      35548     12   31  2002        7         DO    M             36.0   
35548      35549     12   31  2002        5        NaN  NaN              NaN   

       weight  
0         NaN  
1         NaN  
2         NaN  
3         NaN  
4         NaN  
5         NaN  
6         NaN  
7         NaN  
8         NaN  
9         NaN  
10        NaN  
11        NaN  
12        NaN  
13        NaN  
14        NaN  
15        NaN  
16        NaN  
17        NaN  
18        NaN  
19        NaN  
20        NaN  
21        NaN  
22        NaN  
23        NaN  
24        NaN  
25        NaN  
26        NaN  
27        NaN  
28        NaN  
29        NaN  
...       ...  
35519    36.0  
35520    48.0  
35521    45.0  
35522    44.0  
35523    27.0  
35524    26.0  
35525    24.0  
35526    43.0  
35527     NaN  
35528    25.0  
35529     NaN  
35530     NaN  
35531    43.0  
35532    48.0  
35533    56.0  
35534    53.0  
35535    42.0  
35536    46.0  
35537    31.0  
35538    68.0  
35539    23.0  
35540    31.0  
35541    29.0  
35542    34.0  
35543     NaN  
35544     NaN  
35545     NaN  
35546    14.0  
35547    51.0  
35548     NaN  

[35549 rows x 9 columns]
   species_id             genus          species     taxa
0          AB        Amphispiza        bilineata     Bird
1          AH  Ammospermophilus          harrisi   Rodent
2          AS        Ammodramus       savannarum     Bird
3          BA           Baiomys          taylori   Rodent
4          CB   Campylorhynchus  brunneicapillus     Bird
5          CM       Calamospiza      melanocorys     Bird
6          CQ        Callipepla         squamata     Bird
7          CS          Crotalus       scutalatus  Reptile
8          CT     Cnemidophorus           tigris  Reptile
9          CU     Cnemidophorus        uniparens  Reptile
10         CV          Crotalus          viridis  Reptile
11         DM         Dipodomys         merriami   Rodent
12         DO         Dipodomys            ordii   Rodent
13         DS         Dipodomys      spectabilis   Rodent
14         DX         Dipodomys              sp.   Rodent
15         EO           Eumeces        obsoletus  Reptile
16         GS          Gambelia            silus  Reptile
17         NL           Neotoma         albigula   Rodent
18         NX           Neotoma              sp.   Rodent
19         OL         Onychomys      leucogaster   Rodent
20         OT         Onychomys         torridus   Rodent
21         OX         Onychomys              sp.   Rodent
22         PB       Chaetodipus          baileyi   Rodent
23         PC            Pipilo        chlorurus     Bird
24         PE        Peromyscus         eremicus   Rodent
25         PF       Perognathus           flavus   Rodent
26         PG         Pooecetes        gramineus     Bird
27         PH       Perognathus         hispidus   Rodent
28         PI       Chaetodipus      intermedius   Rodent
29         PL        Peromyscus         leucopus   Rodent
30         PM        Peromyscus      maniculatus   Rodent
31         PP       Chaetodipus     penicillatus   Rodent
32         PU            Pipilo           fuscus     Bird
33         PX       Chaetodipus              sp.   Rodent
34         RF   Reithrodontomys       fulvescens   Rodent
35         RM   Reithrodontomys        megalotis   Rodent
36         RO   Reithrodontomys         montanus   Rodent
37         RX   Reithrodontomys              sp.   Rodent
38         SA        Sylvilagus        audubonii   Rabbit
39         SB          Spizella          breweri     Bird
40         SC        Sceloporus           clarki  Reptile
41         SF          Sigmodon      fulviventer   Rodent
42         SH          Sigmodon         hispidus   Rodent
43         SO          Sigmodon     ochrognathus   Rodent
44         SS      Spermophilus        spilosoma   Rodent
45         ST      Spermophilus     tereticaudus   Rodent
46         SU        Sceloporus        undulatus  Reptile
47         SX          Sigmodon              sp.   Rodent
48         UL            Lizard              sp.  Reptile
49         UP            Pipilo              sp.     Bird
50         UR            Rodent              sp.   Rodent
51         US           Sparrow              sp.     Bird
52         ZL       Zonotrichia       leucophrys     Bird
53         ZM           Zenaida         macroura     Bird

In [2]:
survey_sub = surveys_df.head(10)
survey_sub_last10 = surveys_df.tail(10)
survey_sub_last10 = survey_sub_last10.reset_index(drop=True)
survey_sub_last10


Out[2]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 35540 12 31 2002 15 PB F 26.0 23.0
1 35541 12 31 2002 15 PB F 24.0 31.0
2 35542 12 31 2002 15 PB F 26.0 29.0
3 35543 12 31 2002 15 PB F 27.0 34.0
4 35544 12 31 2002 15 US NaN NaN NaN
5 35545 12 31 2002 15 AH NaN NaN NaN
6 35546 12 31 2002 15 AH NaN NaN NaN
7 35547 12 31 2002 10 RM F 15.0 14.0
8 35548 12 31 2002 7 DO M 36.0 51.0
9 35549 12 31 2002 5 NaN NaN NaN NaN

In [3]:
vertical_stack = pd.concat([survey_sub, survey_sub_last10], axis=0)
vertical_stack


Out[3]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
5 6 7 16 1977 1 PF M 14.0 NaN
6 7 7 16 1977 2 PE F NaN NaN
7 8 7 16 1977 1 DM M 37.0 NaN
8 9 7 16 1977 1 DM F 34.0 NaN
9 10 7 16 1977 6 PF F 20.0 NaN
0 35540 12 31 2002 15 PB F 26.0 23.0
1 35541 12 31 2002 15 PB F 24.0 31.0
2 35542 12 31 2002 15 PB F 26.0 29.0
3 35543 12 31 2002 15 PB F 27.0 34.0
4 35544 12 31 2002 15 US NaN NaN NaN
5 35545 12 31 2002 15 AH NaN NaN NaN
6 35546 12 31 2002 15 AH NaN NaN NaN
7 35547 12 31 2002 10 RM F 15.0 14.0
8 35548 12 31 2002 7 DO M 36.0 51.0
9 35549 12 31 2002 5 NaN NaN NaN NaN

In [4]:
horizontal_stack = pd.concat([survey_sub, survey_sub_last10], axis=1)
horizontal_stack


Out[4]:
record_id month day year plot_id species_id sex hindfoot_length weight record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN 35540 12 31 2002 15 PB F 26.0 23.0
1 2 7 16 1977 3 NL M 33.0 NaN 35541 12 31 2002 15 PB F 24.0 31.0
2 3 7 16 1977 2 DM F 37.0 NaN 35542 12 31 2002 15 PB F 26.0 29.0
3 4 7 16 1977 7 DM M 36.0 NaN 35543 12 31 2002 15 PB F 27.0 34.0
4 5 7 16 1977 3 DM M 35.0 NaN 35544 12 31 2002 15 US NaN NaN NaN
5 6 7 16 1977 1 PF M 14.0 NaN 35545 12 31 2002 15 AH NaN NaN NaN
6 7 7 16 1977 2 PE F NaN NaN 35546 12 31 2002 15 AH NaN NaN NaN
7 8 7 16 1977 1 DM M 37.0 NaN 35547 12 31 2002 10 RM F 15.0 14.0
8 9 7 16 1977 1 DM F 34.0 NaN 35548 12 31 2002 7 DO M 36.0 51.0
9 10 7 16 1977 6 PF F 20.0 NaN 35549 12 31 2002 5 NaN NaN NaN NaN

In [5]:
vertical_stack.to_csv('data/out.csv', index=False)
new_output = pd.read_csv('data/out.csv', keep_default_na=False, na_values=[""])
new_output


Out[5]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
5 6 7 16 1977 1 PF M 14.0 NaN
6 7 7 16 1977 2 PE F NaN NaN
7 8 7 16 1977 1 DM M 37.0 NaN
8 9 7 16 1977 1 DM F 34.0 NaN
9 10 7 16 1977 6 PF F 20.0 NaN
10 35540 12 31 2002 15 PB F 26.0 23.0
11 35541 12 31 2002 15 PB F 24.0 31.0
12 35542 12 31 2002 15 PB F 26.0 29.0
13 35543 12 31 2002 15 PB F 27.0 34.0
14 35544 12 31 2002 15 US NaN NaN NaN
15 35545 12 31 2002 15 AH NaN NaN NaN
16 35546 12 31 2002 15 AH NaN NaN NaN
17 35547 12 31 2002 10 RM F 15.0 14.0
18 35548 12 31 2002 7 DO M 36.0 51.0
19 35549 12 31 2002 5 NaN NaN NaN NaN

In [6]:
survey_sub = surveys_df.head(10)
species_sub = pd.read_csv('data/speciesSubset.csv', keep_default_na=False, na_values=[""])
species_sub


Out[6]:
species_id genus species taxa
0 DM Dipodomys merriami Rodent
1 NL Neotoma albigula Rodent
2 PE Peromyscus eremicus Rodent

In [7]:
print(species_sub.columns)
print(survey_sub.columns)


Index(['species_id', 'genus', 'species', 'taxa'], dtype='object')
Index(['record_id', 'month', 'day', 'year', 'plot_id', 'species_id', 'sex',
       'hindfoot_length', 'weight'],
      dtype='object')

In [8]:
merged_inner = pd.merge(left=survey_sub, right=species_sub, left_on='species_id', right_on='species_id')
print(merged_inner.shape)
merged_inner


(8, 12)
Out[8]:
record_id month day year plot_id species_id sex hindfoot_length weight genus species taxa
0 1 7 16 1977 2 NL M 32.0 NaN Neotoma albigula Rodent
1 2 7 16 1977 3 NL M 33.0 NaN Neotoma albigula Rodent
2 3 7 16 1977 2 DM F 37.0 NaN Dipodomys merriami Rodent
3 4 7 16 1977 7 DM M 36.0 NaN Dipodomys merriami Rodent
4 5 7 16 1977 3 DM M 35.0 NaN Dipodomys merriami Rodent
5 8 7 16 1977 1 DM M 37.0 NaN Dipodomys merriami Rodent
6 9 7 16 1977 1 DM F 34.0 NaN Dipodomys merriami Rodent
7 7 7 16 1977 2 PE F NaN NaN Peromyscus eremicus Rodent

In [9]:
merged_left = pd.merge(left=survey_sub, right=species_sub, how='left', left_on='species_id', right_on='species_id')
merged_left


Out[9]:
record_id month day year plot_id species_id sex hindfoot_length weight genus species taxa
0 1 7 16 1977 2 NL M 32.0 NaN Neotoma albigula Rodent
1 2 7 16 1977 3 NL M 33.0 NaN Neotoma albigula Rodent
2 3 7 16 1977 2 DM F 37.0 NaN Dipodomys merriami Rodent
3 4 7 16 1977 7 DM M 36.0 NaN Dipodomys merriami Rodent
4 5 7 16 1977 3 DM M 35.0 NaN Dipodomys merriami Rodent
5 6 7 16 1977 1 PF M 14.0 NaN NaN NaN NaN
6 7 7 16 1977 2 PE F NaN NaN Peromyscus eremicus Rodent
7 8 7 16 1977 1 DM M 37.0 NaN Dipodomys merriami Rodent
8 9 7 16 1977 1 DM F 34.0 NaN Dipodomys merriami Rodent
9 10 7 16 1977 6 PF F 20.0 NaN NaN NaN NaN

Challenge - Distributions


In [10]:
surveys_df = pd.read_csv("data/surveys.csv", keep_default_na=False, na_values=[""])
print(sum(surveys_df.groupby('species_id')['record_id'].nunique()))
surveys_df.head()


34786
Out[10]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN

In [11]:
species_df = pd.read_csv("data/species.csv", keep_default_na=False, na_values=[""])
species_df.head()


Out[11]:
species_id genus species taxa
0 AB Amphispiza bilineata Bird
1 AH Ammospermophilus harrisi Rodent
2 AS Ammodramus savannarum Bird
3 BA Baiomys taylori Rodent
4 CB Campylorhynchus brunneicapillus Bird

In [12]:
merged = pd.merge(left=surveys_df, right=species_df, left_on='species_id', right_on='species_id')
print(merged.shape)
merged.head()


(34786, 12)
Out[12]:
record_id month day year plot_id species_id sex hindfoot_length weight genus species taxa
0 1 7 16 1977 2 NL M 32.0 NaN Neotoma albigula Rodent
1 2 7 16 1977 3 NL M 33.0 NaN Neotoma albigula Rodent
2 22 7 17 1977 15 NL F 31.0 NaN Neotoma albigula Rodent
3 38 7 17 1977 17 NL M 33.0 NaN Neotoma albigula Rodent
4 72 8 19 1977 2 NL M 31.0 NaN Neotoma albigula Rodent

In [13]:
# https://stackoverflow.com/questions/39132742/groupby-value-counts-on-the-dataframe-pandas
taxa_counts_by_plot = merged.groupby(['plot_id', 'taxa']).size().unstack(fill_value=0)
taxa_counts_by_plot.head()


Out[13]:
taxa Bird Rabbit Reptile Rodent
plot_id
1 11 2 0 1976
2 17 3 1 2170
3 17 5 0 1786
4 5 0 0 1955
5 5 1 0 1144

In [14]:
%matplotlib inline
taxa_counts_by_plot.plot(kind='bar', stacked=True)


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x110577550>

In [15]:
taxa_counts_by_plot_by_sex = merged.groupby(['plot_id', 'sex', 'taxa']).size().unstack(fill_value=0)
taxa_counts_by_plot_by_sex.plot(kind='bar', stacked=True)

# Observation: All taxa with a recoreded sex are rodents, so the following plot is more illustrative.
taxa_counts_by_plot_by_sex = merged.groupby(['plot_id', 'sex']).size().unstack(fill_value=0)
taxa_counts_by_plot_by_sex.plot(kind='bar', stacked=True)


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x110adb0f0>