In [8]:
import pandas as pd

In [20]:
surveys_df = pd.read_csv('./data/surveys.csv')

In [22]:
type(surveys_df)


Out[22]:
pandas.core.frame.DataFrame

In [27]:
surveys_df.dtypes


Out[27]:
record_id            int64
month                int64
day                  int64
year                 int64
plot_id              int64
species_id          object
sex                 object
hindfoot_length    float64
weight             float64
dtype: object

In [14]:
surveys_df.tail()


Out[14]:
record_id month day year plot_id species_id sex hindfoot_length weight
35544 35545 12 31 2002 15 AH NaN NaN NaN
35545 35546 12 31 2002 15 AH NaN NaN NaN
35546 35547 12 31 2002 10 RM F 15.0 14.0
35547 35548 12 31 2002 7 DO M 36.0 51.0
35548 35549 12 31 2002 5 NaN NaN NaN NaN

In [32]:
surveys_df.columns


Out[32]:
Index(['record_id', 'month', 'day', 'year', 'plot_id', 'species_id', 'sex',
       'hindfoot_length', 'weight'],
      dtype='object')

In [31]:
surveys_df.shape


Out[31]:
(35549, 9)

In [39]:
surveys_df['species_id'].unique()


Out[39]:
array(['NL', 'DM', 'PF', 'PE', 'DS', 'PP', 'SH', 'OT', 'DO', 'OX', 'SS',
       'OL', 'RM', nan, 'SA', 'PM', 'AH', 'DX', 'AB', 'CB', 'CM', 'CQ',
       'RF', 'PC', 'PG', 'PH', 'PU', 'CV', 'UR', 'UP', 'ZL', 'UL', 'CS',
       'SC', 'BA', 'SF', 'RO', 'AS', 'SO', 'PI', 'ST', 'CU', 'SU', 'RX',
       'PB', 'PL', 'PX', 'CT', 'US'], dtype=object)

In [41]:
surveys_df['species_id'].describe()


Out[41]:
count     34786
unique       48
top          DM
freq      10596
Name: species_id, dtype: object

In [48]:
sorted_data = surveys_df.groupby('sex')
type(sorted_data)


Out[48]:
pandas.core.groupby.DataFrameGroupBy

In [51]:
sorted_data.describe().stack()


Out[51]:
day hindfoot_length month plot_id record_id weight year
sex
F count 15690.000000 14894.000000 15690.000000 15690.000000 15690.000000 15303.000000 15690.000000
mean 16.007138 28.836780 6.583047 11.440854 18036.412046 42.170555 1990.644997
std 8.271144 9.463789 3.367350 6.870684 10423.089000 36.847958 7.598725
min 1.000000 7.000000 1.000000 1.000000 3.000000 4.000000 1977.000000
25% 9.000000 21.000000 4.000000 5.000000 8917.500000 20.000000 1984.000000
50% 16.000000 27.000000 7.000000 12.000000 18075.500000 34.000000 1990.000000
75% 23.000000 36.000000 10.000000 17.000000 27250.000000 46.000000 1997.000000
max 31.000000 64.000000 12.000000 24.000000 35547.000000 274.000000 2002.000000
M count 17348.000000 16476.000000 17348.000000 17348.000000 17348.000000 16879.000000 17348.000000
mean 16.184286 29.709578 6.392668 11.098282 17754.835601 42.995379 1990.480401
std 8.199274 9.629246 3.420806 6.728713 10132.203323 36.184981 7.403655
min 1.000000 2.000000 1.000000 1.000000 1.000000 4.000000 1977.000000
25% 9.000000 21.000000 3.000000 5.000000 8969.750000 20.000000 1984.000000
50% 16.000000 34.000000 6.000000 11.000000 17727.500000 39.000000 1990.000000
75% 23.000000 36.000000 9.000000 17.000000 26454.250000 49.000000 1997.000000
max 31.000000 58.000000 12.000000 24.000000 35548.000000 280.000000 2002.000000

In [52]:
sorted_data.mean()


Out[52]:
record_id month day year plot_id hindfoot_length weight
sex
F 18036.412046 6.583047 16.007138 1990.644997 11.440854 28.836780 42.170555
M 17754.835601 6.392668 16.184286 1990.480401 11.098282 29.709578 42.995379

In [54]:
surveys_df.groupby(['plot_id', 'sex']).describe().stack()


Out[54]:
day hindfoot_length month record_id weight year
plot_id sex
1 F count 848.000000 808.000000 848.000000 848.000000 826.000000 848.000000
mean 15.338443 31.733911 6.597877 18390.384434 46.311138 1990.933962
std 8.325993 8.894939 3.366246 10469.790852 33.240958 7.678171
min 1.000000 14.000000 1.000000 9.000000 5.000000 1977.000000
25% 9.000000 22.000000 4.000000 8783.500000 26.000000 1983.000000
50% 15.000000 34.000000 7.000000 19182.500000 40.000000 1991.000000
75% 22.000000 36.000000 10.000000 27691.750000 50.000000 1998.000000
max 31.000000 64.000000 12.000000 35393.000000 196.000000 2002.000000
M count 1095.000000 1047.000000 1095.000000 1095.000000 1072.000000 1095.000000
mean 15.905936 34.302770 6.121461 17197.740639 55.950560 1990.091324
std 8.053257 8.979955 3.418795 9911.570595 41.035686 7.265208
min 1.000000 12.000000 1.000000 6.000000 4.000000 1977.000000
25% 9.000000 32.000000 3.000000 8638.000000 37.000000 1983.000000
50% 16.000000 36.000000 6.000000 17043.000000 46.000000 1990.000000
75% 23.000000 37.000000 9.000000 25251.500000 54.000000 1997.000000
max 31.000000 57.000000 12.000000 35390.000000 231.000000 2002.000000
2 F count 970.000000 918.000000 970.000000 970.000000 954.000000 970.000000
mean 16.288660 30.161220 6.426804 17714.753608 52.561845 1990.449485
std 8.046509 8.677937 3.537694 10300.015076 45.547697 7.519910
min 1.000000 14.000000 1.000000 3.000000 5.000000 1977.000000
25% 10.000000 21.000000 3.000000 9580.250000 25.000000 1984.000000
50% 16.000000 33.000000 6.000000 18104.500000 40.000000 1990.000000
75% 23.000000 36.000000 10.000000 26586.500000 51.000000 1997.000000
max 31.000000 57.000000 12.000000 35405.000000 274.000000 2002.000000
M count 1144.000000 1077.000000 1144.000000 1144.000000 1114.000000 1144.000000
mean 15.440559 30.353760 6.340035 18085.458042 51.391382 1990.756119
std 8.414667 9.016312 3.623430 10555.331260 46.690887 7.714444
min 1.000000 13.000000 1.000000 1.000000 5.000000 1977.000000
25% 9.000000 21.000000 3.000000 8653.000000 24.000000 1983.000000
50% 15.000000 33.000000 6.000000 19024.500000 42.000000 1991.000000
... ... ... ... ... ... ... ... ...
23 F std 8.776973 6.455268 3.353006 8854.378716 18.933945 6.454297
min 1.000000 14.000000 1.000000 41.000000 8.000000 1977.000000
25% 9.000000 16.000000 4.000000 9536.000000 12.000000 1984.000000
50% 15.000000 18.000000 7.000000 14692.000000 16.000000 1988.000000
75% 24.000000 20.000000 10.000000 21169.000000 23.000000 1993.000000
max 30.000000 52.000000 12.000000 35489.000000 199.000000 2002.000000
M count 207.000000 200.000000 207.000000 207.000000 205.000000 207.000000
mean 16.077295 19.850000 6.391304 17091.338164 18.941463 1989.961353
std 7.955203 5.980496 3.543971 8852.413083 17.979740 6.509027
min 1.000000 14.000000 1.000000 55.000000 4.000000 1977.000000
25% 10.000000 16.000000 3.000000 11274.000000 10.000000 1986.000000
50% 16.000000 18.000000 6.000000 15693.000000 12.000000 1989.000000
75% 23.000000 20.250000 10.000000 24740.500000 22.000000 1996.000000
max 31.000000 50.000000 12.000000 35282.000000 131.000000 2002.000000
24 F count 486.000000 453.000000 486.000000 486.000000 479.000000 486.000000
mean 16.393004 26.993377 6.596708 13702.224280 47.914405 1987.485597
std 8.367578 8.561462 3.327782 8692.118528 49.112574 6.340412
min 1.000000 12.000000 1.000000 1963.000000 6.000000 1979.000000
25% 9.000000 20.000000 4.000000 7024.750000 21.000000 1982.000000
50% 16.000000 22.000000 7.000000 11560.500000 33.000000 1986.000000
75% 24.000000 35.000000 10.000000 19442.000000 44.000000 1991.000000
max 31.000000 52.000000 12.000000 35283.000000 251.000000 2002.000000
M count 485.000000 446.000000 485.000000 485.000000 479.000000 485.000000
mean 16.971134 25.786996 6.360825 15208.136082 39.321503 1988.641237
std 8.431738 8.350303 3.452708 9395.610252 42.003947 6.825992
min 1.000000 12.000000 1.000000 2063.000000 4.000000 1979.000000
25% 10.000000 19.000000 3.000000 6992.000000 17.000000 1982.000000
50% 17.000000 21.000000 6.000000 12918.000000 24.000000 1987.000000
75% 24.000000 35.000000 10.000000 22841.000000 45.000000 1995.000000
max 31.000000 51.000000 12.000000 35479.000000 230.000000 2002.000000

384 rows × 6 columns


In [62]:
surveys_df.groupby('sex').sum().stack()


Out[62]:
sex                 
F    record_id          282991305.0
     month                 103288.0
     day                   251152.0
     year                31233220.0
     plot_id               179507.0
     hindfoot_length       429495.0
     weight                645336.0
M    record_id          308010888.0
     month                 110900.0
     day                   280765.0
     year                34530854.0
     plot_id               192533.0
     hindfoot_length       489495.0
     weight                725719.0
dtype: float64

In [68]:
surveys_df.groupby('species_id').count()['record_id']['DO']


Out[68]:
3027

In [74]:
surveys_df['weight'] * 100


Out[74]:
0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
5           NaN
6           NaN
7           NaN
8           NaN
9           NaN
10          NaN
11          NaN
12          NaN
13          NaN
14          NaN
15          NaN
16          NaN
17          NaN
18          NaN
19          NaN
20          NaN
21          NaN
22          NaN
23          NaN
24          NaN
25          NaN
26          NaN
27          NaN
28          NaN
29          NaN
          ...  
35519    3600.0
35520    4800.0
35521    4500.0
35522    4400.0
35523    2700.0
35524    2600.0
35525    2400.0
35526    4300.0
35527       NaN
35528    2500.0
35529       NaN
35530       NaN
35531    4300.0
35532    4800.0
35533    5600.0
35534    5300.0
35535    4200.0
35536    4600.0
35537    3100.0
35538    6800.0
35539    2300.0
35540    3100.0
35541    2900.0
35542    3400.0
35543       NaN
35544       NaN
35545       NaN
35546    1400.0
35547    5100.0
35548       NaN
Name: weight, Length: 35549, dtype: float64

In [81]:
surveys_df[['sex', 'species_id']]


Out[81]:
sex species_id
0 M NL
1 M NL
2 F DM
3 M DM
4 M DM
5 M PF
6 F PE
7 M DM
8 F DM
9 F PF
10 F DS
11 M DM
12 M DM
13 NaN DM
14 F DM
15 F DM
16 F DS
17 M PP
18 NaN PF
19 F DS
20 F DM
21 F NL
22 M DM
23 M SH
24 M DM
25 M DM
26 M DM
27 M DM
28 M PP
29 F DS
... ... ...
35519 NaN SF
35520 M DM
35521 F DM
35522 F DM
35523 F PB
35524 M OL
35525 F OT
35526 F DO
35527 NaN US
35528 F PB
35529 F OT
35530 F PB
35531 F DM
35532 F DM
35533 M DM
35534 M DM
35535 F DM
35536 F DM
35537 F PB
35538 M SF
35539 F PB
35540 F PB
35541 F PB
35542 F PB
35543 NaN US
35544 NaN AH
35545 NaN AH
35546 F RM
35547 M DO
35548 NaN NaN

35549 rows × 2 columns


In [84]:
surveys_df[['plot_id', 'sex', 'species_id']]


Out[84]:
plot_id sex species_id
0 2 M NL
1 3 M NL
2 2 F DM
3 7 M DM
4 3 M DM
5 1 M PF
6 2 F PE
7 1 M DM
8 1 F DM
9 6 F PF
10 5 F DS
11 7 M DM
12 3 M DM
13 8 NaN DM
14 6 F DM
15 4 F DM
16 3 F DS
17 2 M PP
18 4 NaN PF
19 11 F DS
20 14 F DM
21 15 F NL
22 13 M DM
23 13 M SH
24 9 M DM
25 15 M DM
26 15 M DM
27 11 M DM
28 11 M PP
29 10 F DS
... ... ... ...
35519 9 NaN SF
35520 9 M DM
35521 9 F DM
35522 9 F DM
35523 9 F PB
35524 9 M OL
35525 8 F OT
35526 13 F DO
35527 13 NaN US
35528 13 F PB
35529 13 F OT
35530 13 F PB
35531 14 F DM
35532 14 F DM
35533 14 M DM
35534 14 M DM
35535 14 F DM
35536 14 F DM
35537 15 F PB
35538 15 M SF
35539 15 F PB
35540 15 F PB
35541 15 F PB
35542 15 F PB
35543 15 NaN US
35544 15 NaN AH
35545 15 NaN AH
35546 10 F RM
35547 7 M DO
35548 5 NaN NaN

35549 rows × 3 columns


In [93]:
surveys_df[0:41:5]


Out[93]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
5 6 7 16 1977 1 PF M 14.0 NaN
10 11 7 16 1977 5 DS F 53.0 NaN
15 16 7 16 1977 4 DM F 36.0 NaN
20 21 7 17 1977 14 DM F 34.0 NaN
25 26 7 17 1977 15 DM M 31.0 NaN
30 31 7 17 1977 15 DM F 37.0 NaN
35 36 7 17 1977 16 OT F 22.0 NaN
40 41 7 18 1977 23 DM F 34.0 NaN

In [99]:
surveys_df.iloc[0:3, 1:4]


Out[99]:
month day year
0 7 16 1977
1 7 16 1977
2 7 16 1977

In [100]:
surveys_df.loc[[0, 10], :]


Out[100]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
10 11 7 16 1977 5 DS F 53.0 NaN

In [102]:
surveys_df.loc[100, ['plot_id', 'species_id', 'weight']]


Out[102]:
plot_id        5
species_id    DM
weight        46
Name: 100, dtype: object

In [108]:
surveys_df.loc[100, 'day':'month']


Out[108]:
Series([], Name: 100, dtype: object)

In [119]:
surveys_df[(surveys_df.year == 2002) & (surveys_df.month == 2)]


Out[119]:
record_id month day year plot_id species_id sex hindfoot_length weight
33396 33397 2 9 2002 1 RO M 17.0 8.0
33397 33398 2 9 2002 1 DO F 35.0 35.0
33398 33399 2 9 2002 1 PB M 28.0 48.0
33399 33400 2 9 2002 1 DO M 37.0 60.0
33400 33401 2 9 2002 1 DM M 37.0 47.0
33401 33402 2 9 2002 1 DO M 35.0 40.0
33402 33403 2 9 2002 1 OT M 20.0 26.0
33403 33404 2 9 2002 2 PB M 27.0 54.0
33404 33405 2 9 2002 2 RM M 19.0 8.0
33405 33406 2 9 2002 2 DO F 36.0 49.0
33406 33407 2 9 2002 2 DO M 35.0 52.0
33407 33408 2 9 2002 2 PB M 28.0 41.0
33408 33409 2 9 2002 2 PB M 27.0 49.0
33409 33410 2 9 2002 2 DM F 35.0 46.0
33410 33411 2 9 2002 2 PE M 19.0 25.0
33411 33412 2 9 2002 2 PP M 22.0 17.0
33412 33413 2 9 2002 2 SH F 30.0 57.0
33413 33414 2 9 2002 2 PB F 27.0 31.0
33414 33415 2 9 2002 2 NL M 33.0 120.0
33415 33416 2 9 2002 3 PB M 26.0 44.0
33416 33417 2 9 2002 3 DM M 36.0 48.0
33417 33418 2 9 2002 3 PB F 26.0 26.0
33418 33419 2 9 2002 3 DM M 37.0 54.0
33419 33420 2 9 2002 3 OT F 21.0 24.0
33420 33421 2 9 2002 3 PB F 26.0 28.0
33421 33422 2 9 2002 3 PB M 30.0 55.0
33422 33423 2 9 2002 3 DM M 36.0 49.0
33423 33424 2 9 2002 3 DM F 36.0 47.0
33424 33425 2 9 2002 3 DO F 34.0 51.0
33425 33426 2 9 2002 3 DM M 37.0 52.0
... ... ... ... ... ... ... ... ... ...
33540 33541 2 10 2002 13 PB M 29.0 49.0
33541 33542 2 10 2002 13 PP F 21.0 15.0
33542 33543 2 10 2002 14 DM M 36.0 48.0
33543 33544 2 10 2002 14 AH NaN NaN NaN
33544 33545 2 10 2002 14 DM F 35.0 39.0
33545 33546 2 10 2002 14 OT F 21.0 21.0
33546 33547 2 10 2002 14 DM F 36.0 44.0
33547 33548 2 10 2002 14 DM F 36.0 46.0
33548 33549 2 10 2002 14 NL M 33.0 222.0
33549 33550 2 10 2002 15 PB F 25.0 31.0
33550 33551 2 10 2002 15 RM F 17.0 7.0
33551 33552 2 10 2002 15 AH NaN NaN NaN
33552 33553 2 10 2002 15 AH NaN NaN NaN
33553 33554 2 10 2002 15 RM F 17.0 10.0
33554 33555 2 10 2002 15 PB M 27.0 45.0
33555 33556 2 10 2002 5 RO M 15.0 9.0
33556 33557 2 10 2002 5 RM F 17.0 9.0
33557 33558 2 10 2002 16 PB F 26.0 25.0
33558 33559 2 10 2002 16 DM M 36.0 38.0
33559 33560 2 10 2002 16 DO F 36.0 51.0
33560 33561 2 10 2002 10 RM F 17.0 8.0
33561 33562 2 10 2002 10 DO F 35.0 50.0
33562 33563 2 10 2002 10 DO M 34.0 52.0
33563 33564 2 10 2002 10 DO M 38.0 51.0
33564 33565 2 10 2002 10 RO F 16.0 8.0
33565 33566 2 10 2002 7 DM M 35.0 42.0
33566 33567 2 10 2002 7 DO M 36.0 62.0
33567 33568 2 10 2002 7 DO F 37.0 55.0
33568 33569 2 10 2002 7 DO F 38.0 47.0
33569 33570 2 10 2002 7 DO F 35.0 54.0

174 rows × 9 columns


In [127]:
surveys_df[(surveys_df.year >= 2000) & (surveys_df.year <= 2004)].head()


Out[127]:
record_id month day year plot_id species_id sex hindfoot_length weight
30158 30159 1 8 2000 1 PP F 22.0 17.0
30159 30160 1 8 2000 1 DO M 35.0 53.0
30160 30161 1 8 2000 1 PP F 21.0 17.0
30161 30162 1 8 2000 1 DM M 36.0 50.0
30162 30163 1 8 2000 1 PP M 20.0 16.0

In [132]:
surveys_df[surveys_df['species_id'].isin(['NL', 'DM', 'PF', 'PE'])].head()


Out[132]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN

In [137]:
surveys_df[~pd.isnull(surveys_df).any(axis=1)].head()


Out[137]:
record_id month day year plot_id species_id sex hindfoot_length weight
62 63 8 19 1977 3 DM M 35.0 40.0
63 64 8 19 1977 7 DM M 37.0 48.0
64 65 8 19 1977 4 DM F 34.0 29.0
65 66 8 19 1977 4 DM F 35.0 46.0
66 67 8 19 1977 7 DM M 35.0 36.0

In [166]:
idx = pd.IndexSlice

def clean_date(df):
    return pd.to_datetime(df[['year', 'month', 'day']],
                          errors='cerror')

In [167]:
weights = (surveys_df.
              assign(date = clean_date).
              set_index(['date', 'species_id', 'sex']))['weight']


---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-167-d33e73fe7373> in <module>()
      1 weights = (surveys_df.
----> 2               assign(date = clean_date).
      3               set_index(['date', 'species_id', 'sex']))['weight']

/usr/local/lib/python3.6/site-packages/pandas/core/frame.py in assign(self, **kwargs)
   2495         results = {}
   2496         for k, v in kwargs.items():
-> 2497             results[k] = com._apply_if_callable(v, data)
   2498 
   2499         # ... and then assign

/usr/local/lib/python3.6/site-packages/pandas/core/common.py in _apply_if_callable(maybe_callable, obj, **kwargs)
    439     """
    440     if callable(maybe_callable):
--> 441         return maybe_callable(obj, **kwargs)
    442     return maybe_callable
    443 

<ipython-input-166-5817482470a1> in clean_date(df)
      3 def clean_date(df):
      4     return pd.to_datetime(df[['year', 'month', 'day']],
----> 5                           errors='cerror')

/usr/local/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, box, format, exact, unit, infer_datetime_format, origin)
    510         result = Series(values, index=arg.index, name=arg.name)
    511     elif isinstance(arg, (ABCDataFrame, MutableMapping)):
--> 512         result = _assemble_from_unit_mappings(arg, errors=errors)
    513     elif isinstance(arg, ABCIndexClass):
    514         result = _convert_listlike(arg, box, format, name=arg.name)

/usr/local/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in _assemble_from_unit_mappings(arg, errors)
    611               coerce(arg[unit_rev['day']]))
    612     try:
--> 613         values = to_datetime(values, format='%Y%m%d', errors=errors)
    614     except (TypeError, ValueError) as e:
    615         raise ValueError("cannot assemble the "

/usr/local/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, box, format, exact, unit, infer_datetime_format, origin)
    507     elif isinstance(arg, ABCSeries):
    508         from pandas import Series
--> 509         values = _convert_listlike(arg._values, False, format)
    510         result = Series(values, index=arg.index, name=arg.name)
    511     elif isinstance(arg, (ABCDataFrame, MutableMapping)):

/usr/local/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in _convert_listlike(arg, box, format, name, tz)
    412                     try:
    413                         result = tslib.array_strptime(arg, format, exact=exact,
--> 414                                                       errors=errors)
    415                     except tslib.OutOfBoundsDatetime:
    416                         if errors == 'raise':

pandas/_libs/tslib.pyx in pandas._libs.tslib.array_strptime (pandas/_libs/tslib.c:62494)()

AssertionError: 

In [158]:
surveys_df[['hindfoot_length', 'weight']].apply(lambda x: x.mean())


Out[158]:
hindfoot_length    29.287932
weight             42.672428
dtype: float64

In [162]:
def standardize(x):
    return (x - x.mean() / x.std())

surveys_df[['hindfoot_length', 'weight']].apply(standardize).tail()


Out[162]:
hindfoot_length weight
35544 NaN NaN
35545 NaN NaN
35546 11.937933 12.835082
35547 32.937933 49.835082
35548 NaN NaN

In [165]:
(surveys_df[['hindfoot_length', 'weight']].
     apply(standardize).
     applymap(lambda x: '{:03.2f}'.format(x)).tail())


Out[165]:
hindfoot_length weight
35544 nan nan
35545 nan nan
35546 11.94 12.84
35547 32.94 49.84
35548 nan nan

In [ ]: