In [1]:
import pandas as pd
import numpy as np
import tempfile
import csv

In [2]:
# Upper-case names for the six descriptive columns of the field-data sheet.
# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
CSV_COLUMN_NAMES = [
    'OBSERVER',
    'DATE',
    'SITE',
    'SPECIES',
    'MEASUREMENT_TYPE',
    'COUNTING',
]

# Zero-based row passed to read_csv(header=...): once blank rows are removed,
# rows 0 and 1 are title lines and row 2 holds the real column headers.
CSV_HEADER_ROW = 2

Copy the CSV into a temporary file, skipping rows whose fields are all blank


In [3]:
# Copy the fixture into an anonymous scratch file, keeping only rows that have
# at least one non-whitespace field. The scratch file stays open so that a
# later cell can rewind it and parse it.
# NOTE(review): binary 'w+b' and 'rU' are the Python 2 conventions for the csv
# module; under Python 3 csv expects text-mode files opened with newline=''.
with open('fixtures/example_field_data.csv', 'rU') as csv_file:
    tmp_csv = tempfile.TemporaryFile(mode='w+b')
    writer = csv.writer(tmp_csv)
    for row in csv.reader(csv_file):
        if not any(field.strip() for field in row):
            # Every field is empty or whitespace: drop the row.
            continue
        print('Writing: ' + str(row))
        writer.writerow(row)


Writing: ['Counting flowers or stalks per plant', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Writing: ['', '', '', 'MENU TAB', '', '', 'Measurements', '', '', '', '', '', '', '', '', '']
Writing: ['Observer', 'Date', 'Site', 'Species', 'Type', 'Counting', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
Writing: ['DWI Eg #1', '5/24/03', 'Rocky Meadow #1', 'Taraxacum officina', 'Site', 'Flowers', '390', '200', '35', '6', '', '', '', '', '', '']
Writing: ['DWI Eg #2', '7/21/09', 'Stream plot #1', 'Linum lewisii', 'Individual', 'Flowers per stalk', '2', '1', '', '', '', '', '', '', '', '']
Writing: ['DWI Eg #2', '7/21/09', 'Stream plot #1', 'Linum lewisii', 'Individual', 'Flowers per stalk', '3', '2', '', '', '', '', '', '', '', '']
Writing: ['DWI Eg #2', '7/21/09', 'Stream plot #1', 'Linum lewisii', 'Individual', 'Flowers per stalk', '', '1', '', '', '1', '', '', '', '', '']
Writing: ['DWI Eg #2', '7/21/09', 'Stream plot #1', 'Linum lewisii', 'Individual', 'Flowers per stalk', '1', '', '', '', '', '', '', '', '', '']
Writing: ['DWI Eg #2', '7/21/09', 'Rocky Meadow #1', 'Linum lewisii', 'Individual', 'Flowers per stalk', '1', '', '', '', '', '', '', '', '', '']

In [4]:
# Rewind the scratch file so the parser starts at the first row.
tmp_csv.seek(0)
# Use row CSV_HEADER_ROW as the header and the Date column (position 1) as a
# parsed datetime index; empty fields become NaN.
# The former `prefix='Measurement'` argument was dropped: it only names columns
# when the file has no header row, so it had no effect here (the measurement
# columns are labelled '1', '2', ...), and newer pandas releases reject it.
df = pd.read_csv(tmp_csv, header=CSV_HEADER_ROW, index_col=[1], parse_dates=True, na_values=[''])

In [5]:
# Inspect the index built from the Date column (index_col=[1], parse_dates=True).
df.index


Out[5]:
<class 'pandas.tseries.index.DatetimeIndex'>
[2003-05-24 00:00:00, ..., 2009-07-21 00:00:00]
Length: 6, Freq: None, Timezone: None

In [6]:
# Observer values as a one-column DataFrame, still keyed by the Date index.
df[['Observer']]


Out[6]:
Observer
Date
2003-05-24 DWI Eg #1
2009-07-21 DWI Eg #2
2009-07-21 DWI Eg #2
2009-07-21 DWI Eg #2
2009-07-21 DWI Eg #2
2009-07-21 DWI Eg #2

In [7]:
# Distinct observation dates as a plain NumPy datetime64 array.
# NOTE(review): the recorded output shows these midnight timestamps with a
# -0400 offset; that looks like a local-timezone rendering rather than a change
# in the underlying values - confirm.
np.unique(df.index.values)


Out[7]:
array(['2003-05-23T20:00:00.000000000-0400',
       '2009-07-20T20:00:00.000000000-0400'], dtype='datetime64[ns]')

In [8]:
# Print each distinct site name once; np.unique returns them sorted.
for site_name in np.unique(df['Site']):
    print(site_name)


Rocky Meadow #1
Stream plot #1

In [275]:
# Print the number of rows in each (date, site) group.
# The Site column is passed as a grouping key directly: the date index holds
# repeated labels, so a per-label lookup like df.loc[label]['Site'] does not
# yield a single site value for those labels.
# print(...) with one argument behaves the same on Python 2 and 3 and matches
# the other cells in this notebook.
for group_key, group_rows in df.groupby([df.index, df['Site']]):
    print(len(group_rows))


1
1
4

In [9]:
# Row count per (date, site) pair.
# Group on the date index and the Site column directly instead of through
# lambdas: the index has repeated dates, so df.loc[label]['Site'] is not a
# single value for those labels and cannot serve as a per-label group key.
df.groupby([df.index, df['Site']]).size()


Out[9]:
2003-05-24  Rocky Meadow #1    1
2009-07-21  Rocky Meadow #1    1
            Stream plot #1     4
dtype: int64

In [21]:
# Non-missing measurements of the first data row. Date is the index, so the
# columns at positions 0-4 are Observer, Site, Species, Type and Counting;
# position 5 onward holds the numbered measurement columns ('1', '2', ...).
df.iloc[0,5:].dropna()


Out[21]:
1    390
2    200
3     35
4      6
dtype: object

In [23]:
# Scratch check: a Series can carry the same index label on every element.
pd.Series([1,2,3], index=['species']*3)


Out[23]:
species    1
species    2
species    3
dtype: int64

In [26]:



Out[26]:
'ab_c'