Groupby: split, apply, combine


In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
print sys.version
print "Pandas:", pd.__version__


2.7.11 |Continuum Analytics, Inc.| (default, Feb 16 2016, 09:58:36) [MSC v.1500 64 bit (AMD64)]
Pandas: 0.17.0

In [2]:
cover = pd.read_csv('/Users/Peter/Documents/scc/challenge/PSQL/probably bad/cover.csv')
transect = pd.read_csv('/Users/Peter/Documents/scc/challenge/PSQL/probably bad/cover.csv')
photo = pd.read_csv('/Users/Peter/Documents/scc/challenge/PSQL/probably bad/photo.csv')
food = pd.read_csv('/Users/Peter/Documents/scc/challenge/PSQL/probably bad/food_analysis.csv')
infiltration = pd.read_csv('/Users/Peter/Documents/scc/challenge/PSQL/probably bad/infiltration.csv')

In [16]:
analysis = pd.read_csv('/Users/Peter/Documents/scc/challenge/PSQL/probably bad/analysis.csv')

In [6]:
comb = [cover, transect, photo, infiltration, food]
combined = pd.concat(comb)

In [5]:
allobs1  = pd.read_csv('/Users/Peter/Documents/scc/challenge/PSQL/allobsOct23.csv')

In [8]:
allcols = allobs1.columns.tolist()
combined['filename'] = ' '
combcols = combined.columns.tolist()


['date', 'filename', 'group', 'id', 'label1', 'label2', 'label3', 'label4', 'label5', 'note', 'obs_type', 'observer', 'type', 'url', 'value1', 'value2', 'value3', 'value4', 'value5']
['date', 'filename', 'group', 'id', 'label1', 'label2', 'label3', 'label4', 'label5', 'note', 'obs_type', 'observer', 'type', 'url', 'value1', 'value2', 'value3', 'value4', 'value5']

In [10]:
print allcols
print combcols


['group', 'id', 'type', 'obs_type', 'note', 'label1', 'value1', 'label2', 'value2', 'label3', 'value3', 'label4', 'value4', 'label5', 'value5', 'date', 'observer', 'url', 'filename']
['date', 'group', 'id', 'label1', 'label2', 'label3', 'label4', 'label5', 'note', 'obs_type', 'observer', 'type', 'url', 'value1', 'value2', 'value3', 'value4', 'value5', 'filename']

In [11]:
combined = combined[allcols]

In [12]:
combined.to_csv('/Users/Peter/Documents/scc/challenge/PSQL/combined.csv')

In [30]:
allobs.obs_type.unique()


Out[30]:
array(['transect', 'cover', 'infiltration', 'photo', 'food_analysis'], dtype=object)

In [37]:
combined.count()


Out[37]:
group       3760
id          3760
type        3760
obs_type    3760
note        3759
label1      3757
value1      3759
label2      3644
value2      3744
label3      3702
value3      3760
label4      3760
value4      3760
label5      3760
value5      3760
date        3760
observer    3760
url         3745
filename    3760
dtype: int64

In [51]:
combined.to_csv = ('/Users/Peter/Documents/scc/challenge/PSQL/combined.csv')

In [50]:
combined.describe()


Out[50]:
group id type obs_type note label1 value1 label2 value2 label3 value3 label4 value4 label5 value5 date observer url filename
count 3760 3760 3760 3760 3759 3757 3759 3644 3744 3702 3760 3760 3760 3760 3760 3760 3760 3745 3760
unique 298 298 207 5 1183 22 282 55 174 48 175 24 45 13 31 224 35 1360 1
top MORR8 MORR8 photo cover 28-Jul-11 Peter Donovan
freq 54 54 1089 2616 636 1558 1594 2016 2152 2497 2576 3316 3322 3552 3559 50 3427 1060 3760

In [ ]:


In [2]:
df = pd.read_csv('/Users/Peter/Documents/scc/challenge/PSQL/allobsOct23.csv', parse_dates=['date'])

In [3]:
df.describe()


Out[3]:
group id type obs_type note label1 value1 label2 value2 label3 value3 label4 value4 label5 value5 date observer url
count 6302 6302 6302 6302 6301 6302 6302 6302 6302 6302 6302 6302 6302 6302 6302 6304 6302 6302
unique 344 2254 325 7 1933 41 530 76 418 93 2179 34 1610 18 2016 231 141 2441
top MORR1 MORR8 CN analysis, 8 composited probe samples analysis 20-Jan-11 Peter Donovan
freq 79 41 1000 2248 2921 2115 2145 2444 2543 2968 3030 3723 3732 4102 4106 83 4294 3617

In [134]:
#decompose URLs by / and get last element (filename) with spaces replaced by hyphens

df.filename = df.url.str.split('/').str[-1:].apply(str)
df.filename = df.filename.str.replace(' ','-').str.replace('[','').str.replace(']','').str.replace("'","")
df.filename.replace('-',np.nan, inplace=True)
df.filename[7]


Out[134]:
nan

In [135]:
df.dirs = df.url.str.split('/').str[-2:-1].apply(str)

In [137]:
df.dirs.to_csv('/Users/Peter/Documents/scc/challenge/PSQL/dirs.csv')

In [122]:
empties = df[df.filename.isnull()]
empties.describe()


Out[122]:
group id type note label1 value1 label2 value2 label3 value3 label4 value4 label5 value5 date observer url
count 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278
unique 200 218 10 228 8 43 8 29 10 28 4 6 1 1 160 66 1
top RVAL1 JAFF1 plants ##############################################... 2012-10-29 00:00:00 Peter Donovan
freq 7 3 186 15 234 234 244 250 245 251 271 273 278 278 12 173 278
first NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2010-08-16 00:00:00 NaN NaN
last NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2016-08-06 00:00:00 NaN NaN

In [14]:
df.size()


Out[14]:
obs_type
analysis        2248
brix               5
clip              10
cover           1356
infiltration    1065
photo             63
transect        1555
dtype: int64

In [ ]:


In [15]:
df = pd.read_csv('/Users/Peter/Documents/scc/challenge/PSQL/analysis.csv', parse_dates=['date'])
df = df.replace(np.nan,' ', regex=True)#MUST REPLACE NAN AS NAN IS NOT ITERABLE
df.label4.unique()


Out[15]:
array([' ', '%N=', 'average=', 'Solvita ppm CO2:', 'cm3=', 'pH=',
       'tons C/ha=', 'perc. organic carbon='], dtype=object)

In [16]:
df.label3.unique()


Out[16]:
array([' ', '%C=', 'stdev=', 'CV=', 'Solvita color #:', 'dry wt, g=',
       'elec. conductivity in mS/cm=', 'perc. inorganic carbon=',
       'density=', 'perc. organic carbon=', '%OM=', '% carbon, 0-10 cm:'], dtype=object)

In [22]:
groups = df.groupby('obs_type')
groups.size()


Out[22]:
obs_type
analysis    2248
brix           5
clip          10
dtype: int64

In [45]:
groups


Out[45]:
<pandas.core.groupby.DataFrameGroupBy object at 0x00000000039CEB00>