Loading/Manipulating excel data with Pandas

Emergence data



In [15]:

    
%matplotlib inline
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')

data_loc = '../data/sampling_details.xlsx'

release_date = pd.Timestamp('2005-03-13')

# load the sentinel fields sheet
sentinel_fields_data = pd.read_excel(data_loc,sheetname='Kal-sentinels-raw')
# rename the headings with spaces in them
sentinel_fields_data.rename(columns={"Field descrip":"descrip","date emerged":"date", 
                                     "Field ID (jpgs)": "id",
                                     "Field ID (paper)":"paperid"}, inplace=True)
sentinel_fields_data.drop('descrip',1,inplace=True)
sentinel_fields_data.drop('paperid',1,inplace=True)
sentinel_fields_data.sort_values(['id','date'], inplace=True)
# get sum of all the emergences
col_list = list(sentinel_fields_data)
for name in ['id','date']:
    col_list.remove(name)
sentinel_fields_data['All_total'] = sentinel_fields_data[col_list].sum(axis=1)
# get the number of E Hayati emergences per day
sentinel_fields_data['E_total'] = sentinel_fields_data[['Efemales','Emales']].sum(axis=1)
sentinel_fields_data['datePR'] = sentinel_fields_data['date'] - release_date
#print(sentinel_fields_data['datePR'].min().days)
#day29 = pd.Timedelta('29 days')
#sentinel_fields_data[sentinel_fields_data['datePR']==day29]['E_total'].values
#PR29 = sentinel_fields_data['datePR'] == day29
#sentinel_fields_data[PR29]
sentinel_fields_data
#sentinel_fields_data[sentinel_fields_data['E_total']>0].plot.scatter(x='Wfnum',y='E_total')



In [14]:

    
release_field_data = pd.read_excel(data_loc,sheetname='Kal-releasefield-raw')
# in our data, North was on the left of the grid. So switch coordinates
release_field_data['temp'] = release_field_data['xcoord']
release_field_data['xcoord'] = release_field_data['ycoord']
release_field_data['ycoord'] = -release_field_data['temp'] # need to flip orientation
release_field_data.drop('temp',1,inplace=True)
# put release point at the origin
release_field_data['ycoord'] += 300
release_field_data['xcoord'] -= 200
col_list = list(release_field_data)
for name in ['Field','xcoord','ycoord','date emerged']:
    col_list.remove(name)
release_field_data['All_total'] = release_field_data[col_list].sum(axis=1)
release_field_data['E_total'] = release_field_data[['Efemales','Emales']].sum(axis=1)
release_field_data['datePR'] = release_field_data['date emerged'] - release_date
release_field_data.drop('Field',1,inplace=True)
#release_field_data['ycoord'].values
release_field_data.sort_values(['datePR','xcoord','ycoord'],inplace=True)
print(release_field_data['datePR'].map(lambda t: t.days).unique())
#release_field_data









    



[21 23 25 27 29 31 34 36 38 40]



In [28]:

    
field_totals = []
E_totals = []
for field in sentinel_fields_data['id'].unique():
    field_totals.append(sentinel_fields_data[sentinel_fields_data['id']==field]['Wfnum'].sum(axis=0))
    E_totals.append(sentinel_fields_data[sentinel_fields_data['id']==field]['E_total'].sum(axis=0))
plt.scatter(field_totals,E_totals)
plt.show()

Observation data



In [17]:

    
%matplotlib inline
import pandas as pd
import numpy as np

data_loc = '../data/adult_counts_kalbar.xlsx'

release_date = pd.Timestamp('2005-03-13')

# load the grid adult counts sheet
grid_obs = pd.read_excel(data_loc,sheetname='adult counts field A')
# rename the headings with spaces in them
grid_obs.rename(columns={"x coor":"x","y coor":"y", 
                                     "num leaves viewed": "leaves",
                                     "num hayati":"obs_count"}, inplace=True)
# we don't really care about the leaf num columns
grid_obs = grid_obs[['date','collector','x','y','leaves','obs_count']]
# in our data, North was on the left of the grid. So switch coordinates
grid_obs['xcoord'] = grid_obs['y']
grid_obs['ycoord'] = -grid_obs['x'] # need to flip orientation
grid_obs.drop(['x','y'],1,inplace=True)
# put release point at the origin
grid_obs['ycoord'] += 300
grid_obs['xcoord'] -= 200
# convert date to datePR
grid_obs['datePR'] = grid_obs['date'] - release_date
grid_obs.sort_values(['datePR','xcoord','ycoord'],inplace=True)
# print(grid_obs['datePR'].map(lambda t: t.days).unique())
grid_obs.dtypes









    Out[17]:





date          datetime64[ns]
collector             object
leaves                 int64
obs_count              int64
xcoord                 int64
ycoord                 int64
datePR       timedelta64[ns]
dtype: object



In [23]:

    
# load the first cardinal directions sheet
cardinal_obs = pd.read_excel(data_loc,sheetname='cardinal 15 mar 05')
# rename the one heading with a space
cardinal_obs.rename(columns={"num adults":"obs_count"}, inplace=True)
cardinal_obs.drop('num viewers',1,inplace=True)
cardinal_obs.dtypes









    Out[23]:





date         datetime64[ns]
direction            object
distance              int64
obs_count             int64
dtype: object

	id	date	Efemales	Emales	morpho1	morpho2	morpho3	morpho4	morpho5	Wfnum	All_total	E_total	datePR
10	B	2005-04-03	0	0	2	0	0	0	0	4	6	0	21 days
11	B	2005-04-05	1	0	2	5	0	0	0	41	49	1	23 days
12	B	2005-04-07	4	2	1	0	0	0	0	21	28	6	25 days
13	B	2005-04-09	0	0	0	0	0	0	1	53	54	0	27 days
14	B	2005-04-11	1	0	0	0	0	0	0	11	12	1	29 days
15	B	2005-04-13	2	1	0	0	0	0	1	0	4	3	31 days
16	B	2005-04-16	0	0	0	0	0	0	0	0	0	0	34 days
17	B	2005-04-18	0	0	0	0	0	0	0	0	0	0	36 days
18	B	2005-04-20	0	0	0	0	0	0	0	0	0	0	38 days
19	B	2005-04-22	0	0	0	0	0	0	0	0	0	0	40 days
20	C	2005-04-03	1	0	5	0	2	1	0	6	15	1	21 days
21	C	2005-04-05	0	2	0	0	0	1	0	36	39	2	23 days
22	C	2005-04-07	1	0	2	0	0	0	0	11	14	1	25 days
23	C	2005-04-09	1	0	0	0	0	0	0	24	25	1	27 days
24	C	2005-04-11	0	0	0	0	0	0	0	0	0	0	29 days
25	C	2005-04-13	0	2	1	0	0	0	0	0	3	2	31 days
26	C	2005-04-16	0	0	0	0	0	0	0	0	0	0	34 days
27	C	2005-04-18	2	0	0	0	1	0	0	0	3	2	36 days
28	C	2005-04-20	0	0	0	0	0	0	0	0	0	0	38 days
29	C	2005-04-22	0	0	0	0	0	0	0	0	0	0	40 days
30	D	2005-04-03	1	0	0	0	0	0	0	60	61	1	21 days
31	D	2005-04-05	1	0	0	0	0	0	0	80	81	1	23 days
32	D	2005-04-07	3	0	0	0	0	0	0	44	47	3	25 days
33	D	2005-04-09	0	0	0	0	0	0	0	12	12	0	27 days
34	D	2005-04-11	5	3	0	0	0	0	1	8	17	8	29 days
35	D	2005-04-13	0	2	0	0	0	0	3	1	6	2	31 days
36	D	2005-04-16	0	0	0	0	0	0	0	0	0	0	34 days
37	D	2005-04-18	2	2	0	0	0	0	0	0	4	4	36 days
38	D	2005-04-20	3	1	0	0	0	0	0	8	12	4	38 days
39	D	2005-04-22	0	0	0	0	0	0	0	0	0	0	40 days
0	E	2005-04-03	0	0	1	0	0	0	0	39	40	0	21 days
1	E	2005-04-05	0	0	0	0	0	0	0	155	155	0	23 days
2	E	2005-04-07	1	0	0	1	0	0	0	69	71	1	25 days
3	E	2005-04-09	0	1	0	1	0	0	0	60	62	1	27 days
4	E	2005-04-11	0	0	0	0	0	0	0	0	0	0	29 days
5	E	2005-04-13	1	0	0	0	0	0	1	0	2	1	31 days
6	E	2005-04-16	2	2	0	0	0	0	0	0	4	4	34 days
7	E	2005-04-18	2	5	0	0	0	0	0	0	7	7	36 days
8	E	2005-04-20	0	0	0	0	0	0	0	0	0	0	38 days
9	E	2005-04-22	0	0	0	0	0	0	0	0	0	0	40 days
40	F	2005-04-03	0	0	0	0	0	0	0	0	0	0	21 days
41	F	2005-04-05	0	0	0	0	0	0	0	227	227	0	23 days
42	F	2005-04-07	0	0	0	0	0	0	0	75	75	0	25 days
43	F	2005-04-09	0	0	0	0	0	0	0	44	44	0	27 days
44	F	2005-04-11	0	0	0	0	0	0	0	2	2	0	29 days
45	F	2005-04-13	1	0	0	0	0	0	0	5	6	1	31 days
46	F	2005-04-16	0	0	0	0	0	0	0	0	0	0	34 days
47	F	2005-04-18	0	0	0	0	0	0	0	0	0	0	36 days
48	F	2005-04-20	0	0	0	0	0	0	2	0	2	0	38 days
49	F	2005-04-22	0	0	0	0	0	0	0	0	0	0	40 days
50	G	2005-04-03	0	0	0	0	0	0	0	400	400	0	21 days
51	G	2005-04-05	0	0	0	0	0	0	0	240	240	0	23 days
52	G	2005-04-07	0	0	0	0	0	0	0	61	61	0	25 days
53	G	2005-04-09	0	0	0	0	0	0	0	0	0	0	27 days
54	G	2005-04-11	3	0	0	3	0	1	0	0	7	3	29 days
55	G	2005-04-13	4	4	0	0	0	0	17	2	27	8	31 days
56	G	2005-04-16	0	0	0	0	0	0	0	0	0	0	34 days
57	G	2005-04-18	0	0	0	0	0	0	0	0	0	0	36 days
58	G	2005-04-20	0	0	0	0	0	0	0	0	0	0	38 days
59	G	2005-04-22	0	0	0	0	0	0	0	0	0	0	40 days