notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
import tempfile
import csv



In [2]:

    
# Upper-case labels for the six leading (non-measurement) columns of the
# field-data sheet. NOTE(review): no cell visible here reads this list yet.
CSV_COLUMN_NAMES = [
    'OBSERVER',
    'DATE',
    'SITE',
    'SPECIES',
    'MEASUREMENT_TYPE',
    'COUNTING',
]

# Zero-based position of the header row once blank rows have been stripped;
# passed to pd.read_csv as `header`.
CSV_HEADER_ROW = 2

Remove blank rows (rows whose fields are all empty or whitespace) from the CSV before loading it with pandas



In [3]:

    
# Copy the fixture into a scratch file, keeping only rows that contain at
# least one non-whitespace field. `tmp_csv` is left open because a later
# cell rewinds it and reads it.
# NOTE(review): the 'rU' read mode and the binary-mode csv target are
# Python 2 idioms; a Python 3 port needs text mode with newline=''.
tmp_csv = tempfile.TemporaryFile(mode='w+b')
writer = csv.writer(tmp_csv)
with open('fixtures/example_field_data.csv', 'rU') as csv_file:
    non_blank_rows = (
        candidate
        for candidate in csv.reader(csv_file)
        if any(cell.strip() for cell in candidate)
    )
    for row in non_blank_rows:
        # Echo each kept row so the filtered content is visible in the output.
        print('Writing: ' + str(row))
        writer.writerow(row)









    



Writing: ['Counting flowers or stalks per plant', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Writing: ['', '', '', 'MENU TAB', '', '', 'Measurements', '', '', '', '', '', '', '', '', '']
Writing: ['Observer', 'Date', 'Site', 'Species', 'Type', 'Counting', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
Writing: ['DWI Eg #1', '5/24/03', 'Rocky Meadow #1', 'Taraxacum officina', 'Site', 'Flowers', '390', '200', '35', '6', '', '', '', '', '', '']
Writing: ['DWI Eg #2', '7/21/09', 'Stream plot #1', 'Linum lewisii', 'Individual', 'Flowers per stalk', '2', '1', '', '', '', '', '', '', '', '']
Writing: ['DWI Eg #2', '7/21/09', 'Stream plot #1', 'Linum lewisii', 'Individual', 'Flowers per stalk', '3', '2', '', '', '', '', '', '', '', '']
Writing: ['DWI Eg #2', '7/21/09', 'Stream plot #1', 'Linum lewisii', 'Individual', 'Flowers per stalk', '', '1', '', '', '1', '', '', '', '', '']
Writing: ['DWI Eg #2', '7/21/09', 'Stream plot #1', 'Linum lewisii', 'Individual', 'Flowers per stalk', '1', '', '', '', '', '', '', '', '', '']
Writing: ['DWI Eg #2', '7/21/09', 'Rocky Meadow #1', 'Linum lewisii', 'Individual', 'Flowers per stalk', '1', '', '', '', '', '', '', '', '', '']



In [4]:

    
# Rewind the scratch file so pandas reads it from the first row.
tmp_csv.seek(0)

# Row CSV_HEADER_ROW supplies the column names and column 1 ('Date') becomes
# the parsed datetime index. The former prefix='Measurement' argument was
# dropped: `prefix` only names columns when there is no header row, so it had
# no effect here, and the keyword was later deprecated and removed from pandas.
df = pd.read_csv(
    tmp_csv,
    header=CSV_HEADER_ROW,
    index_col=[1],
    parse_dates=True,
    na_values=[''],
)



In [5]:

    
# Display the index; it was built from CSV column 1 ('Date') with date parsing on.
df.index









    Out[5]:





<class 'pandas.tseries.index.DatetimeIndex'>
[2003-05-24 00:00:00, ..., 2009-07-21 00:00:00]
Length: 6, Freq: None, Timezone: None



In [6]:

    
# Show the 'Observer' column as a one-column DataFrame (list selector keeps
# the frame shape instead of collapsing to a Series).
df[['Observer']]



In [7]:

    
# Distinct observation dates, taken from the index's underlying array.
np.unique(df.index.values)









    Out[7]:





array(['2003-05-23T20:00:00.000000000-0400',
       '2009-07-20T20:00:00.000000000-0400'], dtype='datetime64[ns]')



In [8]:

    
# Print each distinct site once; np.unique returns the values sorted.
for name in np.unique(df['Site']):
    print(name)









    



Rocky Meadow #1
Stream plot #1



In [275]:

    
# Print the number of rows in each (date, site) group.
# The dates in the index repeat, so group on the index itself plus the 'Site'
# column rather than looking each index label up again inside a lambda.
for _key, group in df.groupby([df.index, 'Site']):
    # print() call form keeps this cell consistent with the other cells.
    print(len(group))



In [9]:

    
# Row count per (date, site) pair. Grouping on the index plus the 'Site'
# column avoids a per-label df.loc lookup, which is ambiguous when the same
# date appears on several rows.
df.groupby([df.index, 'Site']).size()









    Out[9]:





2003-05-24  Rocky Meadow #1    1
2009-07-21  Rocky Meadow #1    1
            Stream plot #1     4
dtype: int64



In [21]:

    
# Measurements recorded on the first data row. With 'Date' as the index,
# positions 0-4 are Observer, Site, Species, Type and Counting, so the
# numbered measurement columns start at position 5; empty slots are dropped.
df.iloc[0].iloc[5:].dropna()









    Out[21]:





1    390
2    200
3     35
4      6
dtype: object



In [23]:

    
# Scratch check: a Series accepts the same index label on every entry.
pd.Series([1, 2, 3], index=['species', 'species', 'species'])









    Out[23]:





species    1
species    2
species    3
dtype: int64



In [26]:









    Out[26]:





'ab_c'

	Observer
Date
2003-05-24	DWI Eg #1
2009-07-21	DWI Eg #2
2009-07-21	DWI Eg #2
2009-07-21	DWI Eg #2
2009-07-21	DWI Eg #2
2009-07-21	DWI Eg #2