In [1]:
import pandas as pd
import numpy as np
import json
In [60]:
#fields for csv
site_fields = ['id', 'name', 'geometry', 'accuracy']
observation_fields = ['entered', 'values', 'observer_id', 'site_id', 'type_id', 'parentobs_id']
In [79]:
df = pd.read_csv('C:/Users/Peter/Documents/scc/challenge/obs_types/analysis.csv')
#add foreign key fields
mystartID = 1000 #primary key to start numbering new data from
df['observer_id'] = 1 # this is my observer_id
df['site_id'] = np.nan
df['type_id'] = np.nan
df['parentobs_id'] = np.nan
df['id'] = df.index + mystartID
df.columns
Out[79]:
For soil samples, type_id = 31 and the values field is a JSON object with the following structure:
"values": { "top_cm": "28", "bottom_cm": "35", "description": "3-inch diameter density sample", "num_composited": "1", "sampleID": "Linne1C1", "date": "2017-04-11", "samplers": null }
In [4]:
#rename the generic CSV columns to the soil-sample field names
soil_samples_renaming = {"value1": "top_cm", "value2": "bottom_cm", "date": "oldDate", "id": "sampleID", "type": "description"}
df.rename(columns=soil_samples_renaming, inplace=True)
df['date'] = pd.to_datetime(df['oldDate'], infer_datetime_format=True)
df.columns
Out[4]:
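pd.to_datetime with infer_datetime_format guesses the format from the first value and parses the whole column in one pass (the flag has since been deprecated in newer pandas, where format inference is the default). A minimal illustration, with a made-up sample since the original date format isn't shown:
pd.to_datetime(pd.Series(['4/11/2017']), infer_datetime_format=True)
#0   2017-04-11
#dtype: datetime64[ns]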
In [5]:
#add a few needed fields
df['entered'] = "2017-06-01 00:00:00.000" #arbitrary for loading data
df['observer_id'] = 1 #given that all these observations are mine
df['site_id'] = 0
df['type_id'] = 31 # for soil samples
df['parentobs_id'] = 0
df['samplers'] = ''
In [6]:
#map each description to a num_composited count via regex matches (see the note below on how replace handles numeric values)
replacements = {
    r'8': 8,
    r'3': 3,
    r'4': 4,
    r'pit': 4,
    r'single': 1,
    r'density': 1
}
df['num_composited'] = df.description.replace(replacements, regex=True)
#df.loc[df.text.str.contains('\.'), 'text'] = 'other'
df.num_composited.value_counts() #gives occurrences of each unique value
Out[6]:
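A subtlety of Series.replace with regex=True: when the replacement value is not a string, pandas replaces the entire cell as soon as the pattern matches anywhere in it (rather than substituting just the matched substring), and the dict patterns are tried in order, so the first hit wins. That ordering matters here: '3-inch diameter density sample' matches r'3' before r'density' and so maps to 3, not 1. A tiny sketch with made-up descriptions:
demo = pd.Series(['single density sample', 'soil pit', '8 cores composited'])
demo.replace({r'8': 8, r'pit': 4, r'single': 1, r'density': 1}, regex=True)
#0    1
#1    4
#2    8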
In [7]:
#here we filter for the soil samples only, not the analyses or calculated stats
searchfor = ['single','density','composite sample','8','4','3']
#y = df[df.description.str.contains('|'.join(searchfor))] #df w rows that contain terms
#x = df[~df.description.str.contains('|'.join(searchfor))] #df without rows that contain terms
df = df[df.description.str.contains('|'.join(searchfor))] #df w rows that contain terms
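'|'.join(searchfor) builds a single regex that matches any of the terms. One caveat worth knowing (assuming description could ever be missing): str.contains returns NaN for missing values, which breaks boolean indexing unless na=False is passed. A small sketch:
demo = pd.Series(['density sample', 'calculated mean', None])
demo.str.contains('|'.join(['single', 'density', '8']), na=False)
#0     True
#1    False
#2    False   (na=False turns the missing value into False instead of NaN)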
In [8]:
df['description'] = df['description'] + ". " + df['note']
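A related caveat (again assuming note might be missing on some rows): string concatenation propagates NaN, so a row without a note would end up with a NaN description. A guard, if needed:
df['description'] = df['description'] + ". " + df['note'].fillna('')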
In [10]:
#the table was exported to CSV for a few hand edits (e.g. to describe some samples a bit more), then read back in
#df.to_csv('C:/Users/Peter/Documents/scc/challenge/obs_types/soil_samples.csv', index=False)
df = pd.read_csv('C:/Users/Peter/Documents/scc/challenge/obs_types/soil_samples.csv')
In [14]:
df = pd.read_csv('C:/Users/Peter/Documents/scc/challenge/obs_types/soil_samples.csv')
JSONfield = ['top_cm', 'bottom_cm', 'description', 'num_composited', 'sampleID', 'date', 'samplers']
jsonvalues = df[JSONfield]
jsonvalues.columns
Out[14]:
In [15]:
#create a dataframe with the same index to hold the JSON values field
#(named json_df rather than json, so it doesn't shadow the json module imported above)
json_df = pd.DataFrame(index=df.index, columns=['values'])
In [16]:
for i, row in jsonvalues.iterrows():
    json_df.loc[i, 'values'] = row.to_json()
    #print(json_df.loc[i, 'values'])
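If preferred, the loop collapses to a single row-wise apply that does the same thing (a sketch, using the json_df name from above):
json_df['values'] = jsonvalues.apply(lambda row: row.to_json(), axis=1)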
In [17]:
#now we create a df with all fields, including the JSON values field
merged = df.merge(json_df, left_index=True, right_index=True)
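Since json_df was built on df.index, an index join gives the same result (a minimal alternative):
merged = df.join(json_df)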
In [19]:
merged.to_csv('C:/Users/Peter/Documents/scc/challenge/obs_types/soil_samples.csv', index=False)
In [30]:
mystart = 1000 #primary key to start with
#the CSV round-trip above reset the index to 0..n-1, so these ids come out contiguous from 1000
merged['id'] = merged.index + mystart
In [42]:
observation_fields
Out[42]:
In [44]:
#observation_fields.append('group')
final = merged[observation_fields]
final
Out[44]:
In [45]:
final.to_csv('C:/Users/Peter/Documents/scc/challenge/obs_types/soil_samples_readyFK.csv', index=False)
In [48]:
final = pd.read_csv('C:/Users/Peter/Documents/scc/challenge/obs_types/soil_samples_readyFK.csv')
final
Out[48]:
In [52]:
final[final['group']=='BCLA1']
Out[52]:
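Note that this lookup by group only works if 'group' was kept among the exported columns, i.e. the observation_fields.append('group') line above was uncommented for the run that produced this CSV.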