This notebook classifies Challenge data according to type and does some cleanup of inconsistent terms. There are probably much better ways to do this classification.
In [97]:
# The preamble
import pandas as pd
#pd.set_option('mode.sim_interactive', True)
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
#from collections import OrderedDict
import json, csv
import re
In [72]:
df
Out[72]:
In [54]:
#df = pd.read_csv('/Users/Peter/Documents/scc/challenge/PSQL/coverNEW.csv', parse_dates=['date'], usecols=['id','type','date','filename'])
#df = df.replace(np.nan,' ', regex=True)#MUST REPLACE NAN AS NAN IS NOT ITERABLE
#df = pd.read_csv('/Users/Peter/Documents/scc/challenge/datamungedOct2016c.csv')
df = pd.read_csv('/Users/Peter/Documents/atlas/atlasbiowork.com/atlasbiowork/db/changeREADYD.csv')
#df.dtypes
In [48]:
print(df.to_json(orient='records'))
In [41]:
df.to_csv('/Users/Peter/Documents/atlas/atlasbiowork.com/atlasbiowork/db/changeREADYD.csv', index=False)
In [23]:
df.values[5]
Out[23]:
In [10]:
print(df.values[1])
In [24]:
#df.to_csv('/Users/Peter/Documents/scc/challenge/datamungedOct2016d.csv', index=False)
df.to_csv('/Users/Peter/Documents/scc/challenge/obs_types/changeREADYB.csv', index=False)
In [12]:
#ADD SOME COLUMNS and DROP OTHERS
df = df[df['id']!='geometry']
df['obs_type'] = ""
df['newurl'] =""
df = df.drop('lon', axis=1)
df = df.drop('marker', axis=1)
In [79]:
df = df.replace(np.nan,' ', regex=True) #MUST REPLACE NAN AS NAN IS NOT ITERABLE
#analysis_terms = ['analysis', 'density', 'average','variance','pH']
def mymunge(row): # a VARIETY OF CRITERIA
    if any(['along' in row['note'], 'transect' in row['type'], 'grade' in row['type'], 'line' in row['type'], 'map' in row['type'], 'plot' in row['type'], 'remonitor' in row['type'], 'plants' in row['type']]):
        return 'transect'
    if any(['vert' in row['type'], 'ang' in row['type'], 'step back' in row['note'], 'hoop' in row['note'], 'Hoop' in row['note'], 'bare' in row['label1']]):
        return 'cover'
    if any(['infiltracion' in row['type'], 'infiltration' in row['type']]):
        return 'infiltration'
    if any(['analysis' in row['type'], 'composited' in row['type'], 'misc' in row['type'], 'density' in row['type'], 'variance' in row['type'], 'average' in row['type'], 'pH' in row['type']]):
        return 'analysis'
    if 'photo' in row['type']:
        return 'photo'
    if 'change' in row['type']:
        return 'change'
    if any(['brix' in row['type'], 'biomass' in row['type'], 'clip' in row['type']]):
        return 'food_analysis'
df['obs_type'] = df.apply(mymunge, axis=1)
#SEE HOW WE DID
print(len(df))
print(df.obs_type.value_counts(dropna=False))
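As flagged at the top, there are probably better ways to classify. One sketch, covering only the type-based criteria above (the note/label1 checks would be added the same way), keeps the keywords in a priority-ordered table instead of branching code:
# keyword -> obs_type rules, checked in the same priority order as mymunge
RULES = [
    ('transect', ['transect', 'grade', 'line', 'map', 'plot', 'remonitor', 'plants']),
    ('cover', ['vert', 'ang']),
    ('infiltration', ['infiltracion', 'infiltration']),
    ('analysis', ['analysis', 'composited', 'misc', 'density', 'variance', 'average', 'pH']),
    ('photo', ['photo']),
    ('change', ['change']),
    ('food_analysis', ['brix', 'biomass', 'clip']),
]
def classify(row):
    # return the first category whose keyword list matches the type field
    for obs_type, keywords in RULES:
        if any(k in row['type'] for k in keywords):
            return obs_type
#df['obs_type'] = df.apply(classify, axis=1)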
In [13]:
df = pd.read_csv('/Users/Peter/Documents/scc/challenge/obs_types/change.csv')
In [10]:
df = df.replace(np.nan,'', regex=True) #MUST REPLACE NAN AS NAN IS NOT ITERABLE
def buildValues(row): # pair each row's url with its photo1
    v = (row['url'], row['photo1'])
    return v
df['list'] = df.apply(buildValues, axis=1)
print(list(df['list']))
In [14]:
df = df.replace(np.nan,'', regex=True) #MUST REPLACE NAN AS NAN IS NOT ITERABLE
def buildValues(row): # build the values json field
    v = '{"observer": "' + row['observer'] \
        + '", "label1": "' + row['label1'] \
        + '", "value1": "' + row['value1'] \
        + '", "label2": "' + row['label2'] \
        + '", "value2": "' + row['value2'] \
        + '", "label3": "' + row['label3'] \
        + '", "value3": "' + row['value3'] \
        + '", "start_date": "' + row['start_date'] \
        + '", "end_date": "' + row['end_date'] \
        + '", "description": "' + row['description'] \
        + '", "photo1": "' + row['photo1'] \
        + '", "photo2": "' + row['photo2'] \
        + '"}'
    return v
df['values'] = df.apply(buildValues, axis=1)
#SEE HOW WE DID
#print(len(df))
#print(df.obs_type.value_counts(normalize=False, sort=True, ascending=False, bins=None, dropna=False))
print(df['values'][50])
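Hand-concatenating JSON breaks as soon as a field contains a double quote. A safer sketch using json.dumps (the json module is already imported in the preamble; column names as above):
def buildValues(row): # json.dumps handles quoting and escaping for us
    keys = ['observer', 'label1', 'value1', 'label2', 'value2', 'label3',
            'value3', 'start_date', 'end_date', 'description', 'photo1', 'photo2']
    return json.dumps({k: row[k] for k in keys})
#df['values'] = df.apply(buildValues, axis=1)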
In [15]:
print(df['values'][0:10])
In [62]:
#df[df['obs_type'].isnull()]
In [47]:
df = df.replace(np.nan,' ', regex=True) #MUST REPLACE NAN AS NAN IS NOT ITERABLE
#analysis_terms = ['analysis', 'density', 'average','variance','pH']
def mymunge(row): # a VARIETY OF CRITERIA
    if any(['along' in row['note'], 'transect' in row['type'], 'grade' in row['type'], 'line' in row['type'], 'map' in row['type'], 'plot' in row['type'], 'remonitor' in row['type'], 'plants' in row['type']]):
        return 'transect'
    if any(['vert' in row['type'], 'ang' in row['type'], 'step back' in row['note'], 'bare' in row['label1']]):
        return 'cover'
    if any(['infiltracion' in row['type'], 'infiltration' in row['type']]):
        return 'infiltration'
    if any(['analysis' in row['type'], 'composited' in row['type'], 'misc' in row['type'], 'density' in row['type'], 'variance' in row['type'], 'average' in row['type'], 'pH' in row['type']]):
        return 'analysis'
    if 'photo' in row['type']:
        return 'photo'
    if 'change' in row['type']:
        return 'change'
    if any(['brix' in row['type'], 'biomass' in row['type'], 'clip' in row['type']]):
        return 'food_analysis'
df['obs_type'] = df.apply(mymunge, axis=1)
#SEE HOW WE DID
print(len(df))
print(df.obs_type.value_counts(dropna=False))
In [117]:
df = pd.read_csv('/Users/Peter/Documents/atlas/atlasbiowork.com/atlasbiowork/db/changeREADYG.csv')
#print df.to_json(orient='records')
In [108]:
df.to_csv('/Users/Peter/Documents/atlas/atlasbiowork.com/atlasbiowork/db/changeREADYH.csv', index=False)
In [131]:
def buildJSON(row): # compose an Observation(...).save() statement for each row
    v = ''
    v += 'Observation(values=' + row['values']
    v += ', observer_id=' + str(row['observer_id'])
    v += ', site_id=' + str(row['site_id'])
    v += ', type_id=' + str(row['type_id'])
    v += ').save()'
    return v
df['json'] = df.apply(buildJSON, axis=1)
df['json'].to_csv('/Users/Peter/Documents/atlas/atlasbiowork.com/atlasbiowork/db/changeK.csv', index=False)
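Instead of writing the statements out as text, the rows could be saved directly from a Django shell. A sketch, assuming an Observation model with these fields (the import path below is a guess):
# run inside: python manage.py shell (the model's import path is an assumption)
from atlasbiowork.models import Observation
for _, row in df.iterrows():
    # values is already a JSON string built in the cells above
    Observation(values=row['values'],
                observer_id=int(row['observer_id']),
                site_id=int(row['site_id']),
                type_id=int(row['type_id'])).save()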
In [94]:
#GET STRINGS OF LABELS "list_of_terms"
a1 = df.label1.unique()
a2 = df.label2.unique()
a3 = df.label3.unique()
#a4 = df.label4.unique()
#a5 = df.label5.unique()
list_of_terms = np.concatenate([a1,a2,a3])
#sorted(list_of_terms)
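One way to surface inconsistent spellings among these labels is to normalize case and whitespace and count the variants; a sketch over the array built above:
terms = pd.Series(list_of_terms).dropna().astype(str)
# case/whitespace variants collapse together, exposing inconsistent terms
print(terms.str.strip().str.lower().value_counts())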
In [4]:
#SEARCH AND REPLACE
#searchterm = 'lichen'
#replaceterm = 'squish'
#the_list = [item for item in list_of_terms if searchterm in str(item)]
#need to target specific columns
mycols=df[['label1', 'label2','label3','label4','label5']]
#mycols.replace('=','wox', regex=True)
#mycols.replace(to_replace=the_list, value=replaceterm, inplace=True)
In [12]:
#df = df.replace(np.nan,' ', regex=True)
df.label4[df.label4.str.contains('litter')]
Out[12]:
In [16]:
searchterm = 'lichen'
replaceterm = 'moss/algae/lichen'
the_list = [item for item in list_of_terms if searchterm in str(item)]
#need to target specific columns
mycols=df[['label1', 'label2','label3','label4','label5']]
mycols.describe()
#mycols.replace(to_replace=the_list, value=replaceterm, inplace=True)
Out[16]:
In [6]:
searchterm = 'lichen'
replaceterm = 'moss/algae/lichen'
the_list = [item for item in list_of_terms if searchterm in str(item)]
#need to target specific columns
df[['label1', 'label2','label3','label4','label5']].replace(to_replace=thelist, value=replaceterm, inplace=True)
Out[6]:
In [29]:
transects.to_csv('/Users/Peter/Documents/scc/challenge/PSQL/transectsOct23.csv', index=False)
In [33]:
linephotos = df[(df.type.str.contains('line'))]
angphotos = df[(df.type.str.contains('ang')) | (df.note.str.contains('step back'))]
vertphotos = df[df.type.str.contains('vert')]
len(vertphotos)
#re.findall('\d+', s) #finds digits in s
def get_num(x):
digits = ''.join(ele for ele in x if ele.isdigit())
if digits:
return int(digits)
pass
#get_num('Hoop 1, 125\'')
#df.ix[459]
for y in range(len(df)):
Out[33]:
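Following the re.findall hint in the comment above, a regex sketch that keeps each run of digits separate, so 'Hoop 1, 125'' yields [1, 125] instead of 1125 (get_nums is a hypothetical helper, not used elsewhere in this notebook):
def get_nums(s): # one int per run of digits
    return [int(d) for d in re.findall(r'\d+', str(s))]
#get_nums('Hoop 1, 125\'')  # -> [1, 125]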
In [63]:
#basic row selection from http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes/
#which has GREAT info on joins
peter_plots = df[(df.lat > 0) & (df.observer.str.contains('Peter Donovan'))]
features = df[(df.type == '*plot summary')|(df.type == 'change')|(df.type.str.contains('remonitor'))];
#df.iloc[100] and df.ix[100] get the row referred to by the default 0-based index
# df.loc['Kellogg LTER'] doesn't work because it's not an index;
# dfnew = df.set_index('id'); this works even though id is not unique
#dfnew.loc['Kellogg LTER'] and this works; use inplace=True as arg to modify existing df
# dfnew.loc['BURR1'] returns all rows for this index
#column selector
#df[['type','label3']]; need to use double [] to enclose a list
#new column
#df['new'] = df.lat + 2
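A runnable distillation of the loc/set_index notes above, using names that appear in this dataframe:
dfnew = df.set_index('id')   # the index does not have to be unique
#dfnew.loc['BURR1']          # returns every row whose id is 'BURR1'
#df.iloc[100]                # positional lookup on the default 0-based index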
In [60]:
#df['featureID'] = df.id.str[0:6] #str function slices string
#df.type #although type is a keyword this works
In [58]:
#subsetting dataframes by finding rows
change = df[(df['type'] == "change")]
plots = df[df['type']=="*plot summary"]
peter = df[df.observer == "Peter Donovan"]
north = df[df.lat > 49] # gives 19 rows; bare df.lat > 49 gives a True/False value for every row
type(change)
Out[58]:
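The distinction in that last comment, spelled out as a short sketch:
mask = df.lat > 49   # boolean Series: one True/False per row of df
north = df[mask]     # indexing with the mask keeps only the True rows
print(len(mask), len(north))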
In [89]:
food_analysis = df[(df['obs_type'] == "food_analysis")]
food_analysis.to_csv('/Users/Peter/Documents/scc/challenge/obs_types/food_analysis.csv', index=False)