Loading/Manipulating excel data with Pandas

Emergence data


In [15]:
%matplotlib inline
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')

data_loc = '../data/sampling_details.xlsx'

release_date = pd.Timestamp('2005-03-13')

# load the sentinel fields sheet
sentinel_fields_data = pd.read_excel(data_loc,sheetname='Kal-sentinels-raw')
# rename the headings with spaces in them
sentinel_fields_data.rename(columns={"Field descrip":"descrip","date emerged":"date", 
                                     "Field ID (jpgs)": "id",
                                     "Field ID (paper)":"paperid"}, inplace=True)
sentinel_fields_data.drop('descrip',1,inplace=True)
sentinel_fields_data.drop('paperid',1,inplace=True)
sentinel_fields_data.sort_values(['id','date'], inplace=True)
# get sum of all the emergences
col_list = list(sentinel_fields_data)
for name in ['id','date']:
    col_list.remove(name)
sentinel_fields_data['All_total'] = sentinel_fields_data[col_list].sum(axis=1)
# get the number of E Hayati emergences per day
sentinel_fields_data['E_total'] = sentinel_fields_data[['Efemales','Emales']].sum(axis=1)
sentinel_fields_data['datePR'] = sentinel_fields_data['date'] - release_date
#print(sentinel_fields_data['datePR'].min().days)
#day29 = pd.Timedelta('29 days')
#sentinel_fields_data[sentinel_fields_data['datePR']==day29]['E_total'].values
#PR29 = sentinel_fields_data['datePR'] == day29
#sentinel_fields_data[PR29]
sentinel_fields_data
#sentinel_fields_data[sentinel_fields_data['E_total']>0].plot.scatter(x='Wfnum',y='E_total')


Out[15]:
id date Efemales Emales morpho1 morpho2 morpho3 morpho4 morpho5 morpho6 Wfnum All_total E_total datePR
10 B 2005-04-03 0 0 2 0 0 0 0 0 4 6 0 21 days
11 B 2005-04-05 1 0 2 5 0 0 0 0 41 49 1 23 days
12 B 2005-04-07 4 2 1 0 0 0 0 0 21 28 6 25 days
13 B 2005-04-09 0 0 0 0 0 0 1 0 53 54 0 27 days
14 B 2005-04-11 1 0 0 0 0 0 0 0 11 12 1 29 days
15 B 2005-04-13 2 1 0 0 0 0 1 0 0 4 3 31 days
16 B 2005-04-16 0 0 0 0 0 0 0 0 0 0 0 34 days
17 B 2005-04-18 0 0 0 0 0 0 0 0 0 0 0 36 days
18 B 2005-04-20 0 0 0 0 0 0 0 0 0 0 0 38 days
19 B 2005-04-22 0 0 0 0 0 0 0 0 0 0 0 40 days
20 C 2005-04-03 1 0 5 0 2 1 0 0 6 15 1 21 days
21 C 2005-04-05 0 2 0 0 0 1 0 0 36 39 2 23 days
22 C 2005-04-07 1 0 2 0 0 0 0 0 11 14 1 25 days
23 C 2005-04-09 1 0 0 0 0 0 0 0 24 25 1 27 days
24 C 2005-04-11 0 0 0 0 0 0 0 0 0 0 0 29 days
25 C 2005-04-13 0 2 1 0 0 0 0 0 0 3 2 31 days
26 C 2005-04-16 0 0 0 0 0 0 0 0 0 0 0 34 days
27 C 2005-04-18 2 0 0 0 1 0 0 0 0 3 2 36 days
28 C 2005-04-20 0 0 0 0 0 0 0 0 0 0 0 38 days
29 C 2005-04-22 0 0 0 0 0 0 0 0 0 0 0 40 days
30 D 2005-04-03 1 0 0 0 0 0 0 0 60 61 1 21 days
31 D 2005-04-05 1 0 0 0 0 0 0 0 80 81 1 23 days
32 D 2005-04-07 3 0 0 0 0 0 0 0 44 47 3 25 days
33 D 2005-04-09 0 0 0 0 0 0 0 0 12 12 0 27 days
34 D 2005-04-11 5 3 0 0 0 0 1 0 8 17 8 29 days
35 D 2005-04-13 0 2 0 0 0 0 3 0 1 6 2 31 days
36 D 2005-04-16 0 0 0 0 0 0 0 0 0 0 0 34 days
37 D 2005-04-18 2 2 0 0 0 0 0 0 0 4 4 36 days
38 D 2005-04-20 3 1 0 0 0 0 0 0 8 12 4 38 days
39 D 2005-04-22 0 0 0 0 0 0 0 0 0 0 0 40 days
0 E 2005-04-03 0 0 1 0 0 0 0 0 39 40 0 21 days
1 E 2005-04-05 0 0 0 0 0 0 0 0 155 155 0 23 days
2 E 2005-04-07 1 0 0 1 0 0 0 0 69 71 1 25 days
3 E 2005-04-09 0 1 0 1 0 0 0 0 60 62 1 27 days
4 E 2005-04-11 0 0 0 0 0 0 0 0 0 0 0 29 days
5 E 2005-04-13 1 0 0 0 0 0 1 0 0 2 1 31 days
6 E 2005-04-16 2 2 0 0 0 0 0 0 0 4 4 34 days
7 E 2005-04-18 2 5 0 0 0 0 0 0 0 7 7 36 days
8 E 2005-04-20 0 0 0 0 0 0 0 0 0 0 0 38 days
9 E 2005-04-22 0 0 0 0 0 0 0 0 0 0 0 40 days
40 F 2005-04-03 0 0 0 0 0 0 0 0 0 0 0 21 days
41 F 2005-04-05 0 0 0 0 0 0 0 0 227 227 0 23 days
42 F 2005-04-07 0 0 0 0 0 0 0 0 75 75 0 25 days
43 F 2005-04-09 0 0 0 0 0 0 0 0 44 44 0 27 days
44 F 2005-04-11 0 0 0 0 0 0 0 0 2 2 0 29 days
45 F 2005-04-13 1 0 0 0 0 0 0 0 5 6 1 31 days
46 F 2005-04-16 0 0 0 0 0 0 0 0 0 0 0 34 days
47 F 2005-04-18 0 0 0 0 0 0 0 0 0 0 0 36 days
48 F 2005-04-20 0 0 0 0 0 0 2 0 0 2 0 38 days
49 F 2005-04-22 0 0 0 0 0 0 0 0 0 0 0 40 days
50 G 2005-04-03 0 0 0 0 0 0 0 0 400 400 0 21 days
51 G 2005-04-05 0 0 0 0 0 0 0 0 240 240 0 23 days
52 G 2005-04-07 0 0 0 0 0 0 0 0 61 61 0 25 days
53 G 2005-04-09 0 0 0 0 0 0 0 0 0 0 0 27 days
54 G 2005-04-11 3 0 0 3 0 1 0 0 0 7 3 29 days
55 G 2005-04-13 4 4 0 0 0 0 17 0 2 27 8 31 days
56 G 2005-04-16 0 0 0 0 0 0 0 0 0 0 0 34 days
57 G 2005-04-18 0 0 0 0 0 0 0 0 0 0 0 36 days
58 G 2005-04-20 0 0 0 0 0 0 0 0 0 0 0 38 days
59 G 2005-04-22 0 0 0 0 0 0 0 0 0 0 0 40 days

In [14]:
release_field_data = pd.read_excel(data_loc,sheetname='Kal-releasefield-raw')
# in our data, North was on the left of the grid. So switch coordinates
release_field_data['temp'] = release_field_data['xcoord']
release_field_data['xcoord'] = release_field_data['ycoord']
release_field_data['ycoord'] = -release_field_data['temp'] # need to flip orientation
release_field_data.drop('temp',1,inplace=True)
# put release point at the origin
release_field_data['ycoord'] += 300
release_field_data['xcoord'] -= 200
col_list = list(release_field_data)
for name in ['Field','xcoord','ycoord','date emerged']:
    col_list.remove(name)
release_field_data['All_total'] = release_field_data[col_list].sum(axis=1)
release_field_data['E_total'] = release_field_data[['Efemales','Emales']].sum(axis=1)
release_field_data['datePR'] = release_field_data['date emerged'] - release_date
release_field_data.drop('Field',1,inplace=True)
#release_field_data['ycoord'].values
release_field_data.sort_values(['datePR','xcoord','ycoord'],inplace=True)
print(release_field_data['datePR'].map(lambda t: t.days).unique())
#release_field_data


[21 23 25 27 29 31 34 36 38 40]

In [28]:
field_totals = []
E_totals = []
for field in sentinel_fields_data['id'].unique():
    field_totals.append(sentinel_fields_data[sentinel_fields_data['id']==field]['Wfnum'].sum(axis=0))
    E_totals.append(sentinel_fields_data[sentinel_fields_data['id']==field]['E_total'].sum(axis=0))
plt.scatter(field_totals,E_totals)
plt.show()


Observation data


In [17]:
%matplotlib inline
import pandas as pd
import numpy as np

data_loc = '../data/adult_counts_kalbar.xlsx'

release_date = pd.Timestamp('2005-03-13')

# load the grid adult counts sheet
grid_obs = pd.read_excel(data_loc,sheetname='adult counts field A')
# rename the headings with spaces in them
grid_obs.rename(columns={"x coor":"x","y coor":"y", 
                                     "num leaves viewed": "leaves",
                                     "num hayati":"obs_count"}, inplace=True)
# we don't really care about the leaf num columns
grid_obs = grid_obs[['date','collector','x','y','leaves','obs_count']]
# in our data, North was on the left of the grid. So switch coordinates
grid_obs['xcoord'] = grid_obs['y']
grid_obs['ycoord'] = -grid_obs['x'] # need to flip orientation
grid_obs.drop(['x','y'],1,inplace=True)
# put release point at the origin
grid_obs['ycoord'] += 300
grid_obs['xcoord'] -= 200
# convert date to datePR
grid_obs['datePR'] = grid_obs['date'] - release_date
grid_obs.sort_values(['datePR','xcoord','ycoord'],inplace=True)
# print(grid_obs['datePR'].map(lambda t: t.days).unique())
grid_obs.dtypes


Out[17]:
date          datetime64[ns]
collector             object
leaves                 int64
obs_count              int64
xcoord                 int64
ycoord                 int64
datePR       timedelta64[ns]
dtype: object

In [23]:
# load the first cardinal directions sheet
cardinal_obs = pd.read_excel(data_loc,sheetname='cardinal 15 mar 05')
# rename the one heading with a space
cardinal_obs.rename(columns={"num adults":"obs_count"}, inplace=True)
cardinal_obs.drop('num viewers',1,inplace=True)
cardinal_obs.dtypes


Out[23]:
date         datetime64[ns]
direction            object
distance              int64
obs_count             int64
dtype: object