Formatting csv data for loading into atlasbiowork Postgres database

First, get column names set up. Implement foreign keys FIRST, as csv, and then by join operation with site table.

Then use to_json to nest the values fields for the postgres JSON field.


In [1]:
import pandas as pd
import numpy as np
import json

In [60]:
#fields for csv
site_fields = ['id', 'name', 'geometry','accuracy']
observation_fields = ['entered', 'values','observer_id', 'site_id', 'type_id', 'parentobs_id']

soil samples from analysis.csv


In [79]:
df = pd.read_csv('C:/Users/Peter/Documents/scc/challenge/obs_types/analysis.csv')
#add foreign key fields

mystartID = 1000 #primary key to start numbering new data from
df['observer_id'] = 1 # this is my observer_id
df['site_id'] = np.nan
df['type_id'] = np.nan
df['parentobs_id'] = np.nan
df['id']=df.index+mystartID

df.columns


Out[79]:
Index(['group', 'id', 'type', 'label1', 'value1', 'label2', 'value2', 'label3',
       'value3', 'label4', 'value4', 'label5', 'value5', 'date', 'observer',
       'url', 'note', 'obs_type', 'newurl', 'observer_id', 'site_id',
       'type_id', 'parentobs_id'],
      dtype='object')

For soil samples, type=31 and values fields are as follows

"values": { "top_cm": "28", "bottom_cm": "35", "description": "3-inch diameter density sample", "num_composited": "1", "sampleID": "Linne1C1", "date": "2017-04-11", "samplers": null }


In [4]:
#get soil samples fields
soil_samples_renaming = {"value1": "top_cm", "value2": "bottom_cm","date": "oldDate", "id": "sampleID", "type": "description"}
df.rename(columns=soil_samples_renaming, inplace=True)
df['date'] = pd.to_datetime(df['oldDate'],infer_datetime_format=True)
df.columns


Out[4]:
Index(['group', 'sampleID', 'description', 'label1', 'top_cm', 'label2',
       'bottom_cm', 'label3', 'value3', 'label4', 'value4', 'label5', 'value5',
       'oldDate', 'observer', 'url', 'note', 'obs_type', 'newurl', 'date'],
      dtype='object')

In [5]:
#add a few needed fields
df['entered'] = "2017-06-01 00:00:00.000" #arbitrary for loading data
df['observer_id'] = 1 #given that all these observations are mine
df['site_id'] = 0
df['type_id'] = 31 # for soil samples
df['parentobs_id'] = 0

df['samplers'] = ''

In [6]:
#use regex to replace substrings with numbers for num_composited field
replacements = {
  r'8': 8,
  r'3': 3,
  r'4': 4,
  r'pit':4,
  r'single': 1,
  r'density': 1
}

df['num_composited'] = df.description.replace(replacements, regex=True)
#df.loc[df.text.str.contains('\.'), 'text'] = 'other'
df.num_composited.value_counts() #gives occurrences of each unique value


Out[6]:
8                          1106
1                          1101
C variance                   28
4                            16
average CN                   12
organic matter analysis       8
C average                     3
layer average                 2
average BD                    2
tons C in layer               2
3                             1
Name: num_composited, dtype: int64

In [7]:
#here we filter for the soil samples only, not the analyses or calculated stats
searchfor = ['single','density','composite sample','8','4','3']
#y = df[df.description.str.contains('|'.join(searchfor))] #df w rows that contain terms
#x = df[~df.description.str.contains('|'.join(searchfor))] #df without rows that contain terms
df = df[df.description.str.contains('|'.join(searchfor))] #df w rows that contain terms

In [8]:
df['description'] = df['description'] + ". " + df['note']

In [10]:
#in order to make a few text changes, e.g. describe samples a bit more
#df.to_csv('C:/Users/Peter/Documents/scc/challenge/obs_types/soil_samples.csv', index=False)
df = pd.read_csv('C:/Users/Peter/Documents/scc/challenge/obs_types/soil_samples.csv')

In [14]:
df=pd.read_csv('C:/Users/Peter/Documents/scc/challenge/obs_types/soil_samples.csv')
JSONfield = ['top_cm', 'bottom_cm', 'description','num_composited','sampleID','date','samplers']
jsonvalues= df[JSONfield]
jsonvalues.columns


Out[14]:
Index(['top_cm', 'bottom_cm', 'description', 'num_composited', 'sampleID',
       'date', 'samplers'],
      dtype='object')

In [15]:
#create dataframe with same length to hold JSON field
json = pd.DataFrame(index = df.index, columns = ['values'])

In [16]:
for i, row in jsonvalues.iterrows():
    json.values[i]= jsonvalues.loc[i].to_json()
    #print(values.values[i])

In [17]:
#now we create a df with all fields, including the JSON values field
merged = df.merge(json, left_index=True, right_index=True)

In [19]:
merged.to_csv('C:/Users/Peter/Documents/scc/challenge/obs_types/soil_samples.csv', index=False)

In [30]:
mystart = 1000 #primary key to start with
merged['id'] = merged.index + mystart

In [42]:
observation_fields


Out[42]:
['id',
 'entered',
 'values',
 'observer_id',
 'site_id',
 'type_id',
 'parentobs_id',
 'group',
 'group']

In [44]:
#observation_fields.append('group')
final = merged[observation_fields]
final


Out[44]:
id entered values observer_id site_id group type_id parentobs_id
0 1000 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 ANSP01 31 0
1 1001 2017-04-01 00:00:00.000 {"top_cm":"4","bottom_cm":"8","description":"3... 1 0 ANSP01 31 0
2 1002 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 ANSP01 31 0
3 1003 2017-04-01 00:00:00.000 {"top_cm":"14","bottom_cm":"18","description":... 1 0 ANSP01 31 0
4 1004 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 ANSP01 31 0
5 1005 2017-04-01 00:00:00.000 {"top_cm":"28","bottom_cm":"32","description":... 1 0 ANSP01 31 0
6 1006 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 ANSP02 31 0
7 1007 2017-04-01 00:00:00.000 {"top_cm":"4","bottom_cm":"8","description":"3... 1 0 ANSP02 31 0
8 1008 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 ANSP02 31 0
9 1009 2017-04-01 00:00:00.000 {"top_cm":"14","bottom_cm":"18","description":... 1 0 ANSP02 31 0
10 1010 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 ANSP02 31 0
11 1011 2017-04-01 00:00:00.000 {"top_cm":"28","bottom_cm":"32","description":... 1 0 ANSP02 31 0
12 1012 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 ANSP03 31 0
13 1013 2017-04-01 00:00:00.000 {"top_cm":"4","bottom_cm":"8","description":"3... 1 0 ANSP03 31 0
14 1014 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 ANSP03 31 0
15 1015 2017-04-01 00:00:00.000 {"top_cm":"14","bottom_cm":"18","description":... 1 0 ANSP03 31 0
16 1016 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 ANSP03 31 0
17 1017 2017-04-01 00:00:00.000 {"top_cm":"28","bottom_cm":"32","description":... 1 0 ANSP03 31 0
18 1018 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 ANSP04 31 0
19 1019 2017-04-01 00:00:00.000 {"top_cm":"4","bottom_cm":"8","description":"3... 1 0 ANSP04 31 0
20 1020 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"20","description":... 1 0 ANSP04 31 0
21 1021 2017-04-01 00:00:00.000 {"top_cm":"14","bottom_cm":"18","description":... 1 0 ANSP04 31 0
22 1022 2017-04-01 00:00:00.000 {"top_cm":"20","bottom_cm":"35","description":... 1 0 ANSP04 31 0
23 1023 2017-04-01 00:00:00.000 {"top_cm":"28","bottom_cm":"32","description":... 1 0 ANSP04 31 0
24 1024 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 ANSP05 31 0
25 1025 2017-04-01 00:00:00.000 {"top_cm":"4","bottom_cm":"8","description":"3... 1 0 ANSP05 31 0
26 1026 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"20","description":... 1 0 ANSP05 31 0
27 1027 2017-04-01 00:00:00.000 {"top_cm":"14","bottom_cm":"18","description":... 1 0 ANSP05 31 0
28 1028 2017-04-01 00:00:00.000 {"top_cm":"20","bottom_cm":"35","description":... 1 0 ANSP05 31 0
29 1029 2017-04-01 00:00:00.000 {"top_cm":"28","bottom_cm":"32","description":... 1 0 ANSP05 31 0
... ... ... ... ... ... ... ... ...
2194 3194 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK1 31 0
2195 3195 2017-04-01 00:00:00.000 {"top_cm":"28","bottom_cm":"32","description":... 1 0 WORK1 31 0
2196 3196 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK1 31 0
2197 3197 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK1 31 0
2198 3198 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK1 31 0
2199 3199 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK1 31 0
2200 3200 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK1 31 0
2201 3201 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK1 31 0
2202 3202 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 WORK2 31 0
2203 3203 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 WORK2 31 0
2204 3204 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 WORK2 31 0
2205 3205 2017-04-01 00:00:00.000 {"top_cm":"4","bottom_cm":"8","description":"3... 1 0 WORK2 31 0
2206 3206 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 WORK2 31 0
2207 3207 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 WORK2 31 0
2208 3208 2017-04-01 00:00:00.000 {"top_cm":"14","bottom_cm":"18","description":... 1 0 WORK2 31 0
2209 3209 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK2 31 0
2210 3210 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK2 31 0
2211 3211 2017-04-01 00:00:00.000 {"top_cm":"30","bottom_cm":"34","description":... 1 0 WORK2 31 0
2212 3212 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"40","description":"... 1 0 WORK2 31 0
2213 3213 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"40","description":"... 1 0 WORK2 31 0
2214 3214 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 WORK3 31 0
2215 3215 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 WORK3 31 0
2216 3216 2017-04-01 00:00:00.000 {"top_cm":"4","bottom_cm":"8","description":"3... 1 0 WORK3 31 0
2217 3217 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 WORK3 31 0
2218 3218 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 WORK3 31 0
2219 3219 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 WORK3 31 0
2220 3220 2017-04-01 00:00:00.000 {"top_cm":"12","bottom_cm":"16","description":... 1 0 WORK3 31 0
2221 3221 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK3 31 0
2222 3222 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK3 31 0
2223 3223 2017-04-01 00:00:00.000 {"top_cm":"26","bottom_cm":"30","description":... 1 0 WORK3 31 0

2224 rows × 8 columns


In [45]:
final.to_csv('C:/Users/Peter/Documents/scc/challenge/obs_types/soil_samples_readyFK.csv', index=False)

In [48]:
final = pd.read_csv('C:/Users/Peter/Documents/scc/challenge/obs_types/soil_samples_readyFK.csv')
final


Out[48]:
id entered values observer_id site_id group type_id parentobs_id
0 1000 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 ANSP01 31 0
1 1001 2017-04-01 00:00:00.000 {"top_cm":"4","bottom_cm":"8","description":"3... 1 0 ANSP01 31 0
2 1002 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 ANSP01 31 0
3 1003 2017-04-01 00:00:00.000 {"top_cm":"14","bottom_cm":"18","description":... 1 0 ANSP01 31 0
4 1004 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 ANSP01 31 0
5 1005 2017-04-01 00:00:00.000 {"top_cm":"28","bottom_cm":"32","description":... 1 0 ANSP01 31 0
6 1006 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 ANSP02 31 0
7 1007 2017-04-01 00:00:00.000 {"top_cm":"4","bottom_cm":"8","description":"3... 1 0 ANSP02 31 0
8 1008 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 ANSP02 31 0
9 1009 2017-04-01 00:00:00.000 {"top_cm":"14","bottom_cm":"18","description":... 1 0 ANSP02 31 0
10 1010 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 ANSP02 31 0
11 1011 2017-04-01 00:00:00.000 {"top_cm":"28","bottom_cm":"32","description":... 1 0 ANSP02 31 0
12 1012 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 ANSP03 31 0
13 1013 2017-04-01 00:00:00.000 {"top_cm":"4","bottom_cm":"8","description":"3... 1 0 ANSP03 31 0
14 1014 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 ANSP03 31 0
15 1015 2017-04-01 00:00:00.000 {"top_cm":"14","bottom_cm":"18","description":... 1 0 ANSP03 31 0
16 1016 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 ANSP03 31 0
17 1017 2017-04-01 00:00:00.000 {"top_cm":"28","bottom_cm":"32","description":... 1 0 ANSP03 31 0
18 1018 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 ANSP04 31 0
19 1019 2017-04-01 00:00:00.000 {"top_cm":"4","bottom_cm":"8","description":"3... 1 0 ANSP04 31 0
20 1020 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"20","description":... 1 0 ANSP04 31 0
21 1021 2017-04-01 00:00:00.000 {"top_cm":"14","bottom_cm":"18","description":... 1 0 ANSP04 31 0
22 1022 2017-04-01 00:00:00.000 {"top_cm":"20","bottom_cm":"35","description":... 1 0 ANSP04 31 0
23 1023 2017-04-01 00:00:00.000 {"top_cm":"28","bottom_cm":"32","description":... 1 0 ANSP04 31 0
24 1024 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 ANSP05 31 0
25 1025 2017-04-01 00:00:00.000 {"top_cm":"4","bottom_cm":"8","description":"3... 1 0 ANSP05 31 0
26 1026 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"20","description":... 1 0 ANSP05 31 0
27 1027 2017-04-01 00:00:00.000 {"top_cm":"14","bottom_cm":"18","description":... 1 0 ANSP05 31 0
28 1028 2017-04-01 00:00:00.000 {"top_cm":"20","bottom_cm":"35","description":... 1 0 ANSP05 31 0
29 1029 2017-04-01 00:00:00.000 {"top_cm":"28","bottom_cm":"32","description":... 1 0 ANSP05 31 0
... ... ... ... ... ... ... ... ...
2194 3194 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK1 31 0
2195 3195 2017-04-01 00:00:00.000 {"top_cm":"28","bottom_cm":"32","description":... 1 0 WORK1 31 0
2196 3196 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK1 31 0
2197 3197 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK1 31 0
2198 3198 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK1 31 0
2199 3199 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK1 31 0
2200 3200 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK1 31 0
2201 3201 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK1 31 0
2202 3202 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 WORK2 31 0
2203 3203 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 WORK2 31 0
2204 3204 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 WORK2 31 0
2205 3205 2017-04-01 00:00:00.000 {"top_cm":"4","bottom_cm":"8","description":"3... 1 0 WORK2 31 0
2206 3206 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 WORK2 31 0
2207 3207 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 WORK2 31 0
2208 3208 2017-04-01 00:00:00.000 {"top_cm":"14","bottom_cm":"18","description":... 1 0 WORK2 31 0
2209 3209 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK2 31 0
2210 3210 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK2 31 0
2211 3211 2017-04-01 00:00:00.000 {"top_cm":"30","bottom_cm":"34","description":... 1 0 WORK2 31 0
2212 3212 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"40","description":"... 1 0 WORK2 31 0
2213 3213 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"40","description":"... 1 0 WORK2 31 0
2214 3214 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 WORK3 31 0
2215 3215 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 WORK3 31 0
2216 3216 2017-04-01 00:00:00.000 {"top_cm":"4","bottom_cm":"8","description":"3... 1 0 WORK3 31 0
2217 3217 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 WORK3 31 0
2218 3218 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 WORK3 31 0
2219 3219 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 WORK3 31 0
2220 3220 2017-04-01 00:00:00.000 {"top_cm":"12","bottom_cm":"16","description":... 1 0 WORK3 31 0
2221 3221 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK3 31 0
2222 3222 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 WORK3 31 0
2223 3223 2017-04-01 00:00:00.000 {"top_cm":"26","bottom_cm":"30","description":... 1 0 WORK3 31 0

2224 rows × 8 columns


In [52]:
final[final['group']=='BCLA1']


Out[52]:
id entered values observer_id site_id group type_id parentobs_id
133 1133 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":"... 1 0 BCLA1 31 0
134 1134 2017-04-01 00:00:00.000 {"top_cm":"0","bottom_cm":"10","description":n... 1 0 BCLA1 31 0
135 1135 2017-04-01 00:00:00.000 {"top_cm":"4","bottom_cm":"8","description":"3... 1 0 BCLA1 31 0
136 1136 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 BCLA1 31 0
137 1137 2017-04-01 00:00:00.000 {"top_cm":"10","bottom_cm":"25","description":... 1 0 BCLA1 31 0
138 1138 2017-04-01 00:00:00.000 {"top_cm":"18","bottom_cm":"22","description":... 1 0 BCLA1 31 0
139 1139 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 BCLA1 31 0
140 1140 2017-04-01 00:00:00.000 {"top_cm":"25","bottom_cm":"40","description":... 1 0 BCLA1 31 0
141 1141 2017-04-01 00:00:00.000 {"top_cm":"28","bottom_cm":"32","description":... 1 0 BCLA1 31 0