This notebook demonstrates how to create and use a Python Contribution object. A Contribution is built by reading the MagIC tables in a single directory; each table is stored as a pandas DataFrame wrapped in a MagicDataFrame.
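As a quick orientation (a minimal sketch, assuming the McMurdo example data used below has been unpacked into ../3_0/McMurdo): con.tables maps table names to MagicDataFrame objects, and each MagicDataFrame exposes its underlying pandas DataFrame as .df.
from pmagpy import new_builder as nb
con = nb.Contribution("../3_0/McMurdo")  # read all recognized MagIC files in the directory
print con.tables.keys()                  # table names, e.g. ['sites', 'specimens', ...]
print con.tables['sites'].df.head()      # the underlying pandas DataFrame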
In [1]:
# do basic imports; uncomment the lines below to unpack the McMurdo example data
from pmagpy import ipmag
reload(ipmag)
from pmagpy import pmag
from pmagpy import new_builder as nb
from pmagpy import data_model3
import os
import pandas as pd
import numpy as np
from pandas import DataFrame
from pmagpy.new_builder import Contribution
wdir = os.path.join("..", "3_0", "McMurdo")
#infile = os.path.join(wdir, "lawrence09.v30.txt")
#infile = os.path.join(wdir, "mcmurdo3-with-upgrade.txt")
#ipmag.download_magic(infile, overwrite=True, dir_path=wdir)
In [2]:
reload(nb)
# test out various ways of creating a contribution
#class Contribution(object):
# """
# A Contribution is a collection of MagicDataFrames,
# each of which corresponds to one MagIC table.
# The Contribution object also has methods for
# manipulating one or more tables in the contribution --
# for example, renaming a site.
# """
# def __init__(self, directory, read_tables='all',
# custom_filenames=None, single_file=None):
# make a contribution, reading in all default filenames from the working directory
wdir = os.path.join("..", "3_0", "McMurdo")
con = nb.Contribution(wdir)
print 'tables created:', con.tables.keys()
print '-'
# make contribution with some custom filenames
con = nb.Contribution(wdir, custom_filenames={'specimens': 'custom_specimens.txt'})
print 'tables created:', con.tables.keys()
print '-'
# make contribution with custom filenames, and only read in the specimen table to start
con = nb.Contribution(wdir, read_tables=['specimens'], custom_filenames={'sites': 'custom_sites.txt',
                                                                         'specimens': 'custom_specimens.txt'})
print 'tables created:', con.tables.keys()
print '-'
# make contribution with a single, mystery file (it can be any MagIC table type)
con = nb.Contribution(wdir, single_file='sites.txt')
print 'tables created:', con.tables.keys()
print '-'
In [3]:
# make McMurdo contribution, starting with specimens table
reload(nb)
con = nb.Contribution(wdir, read_tables=['specimens'], custom_filenames={'specimens': 'custom_specimens.txt', 'samples': 'custom_samples.txt',
'sites': 'custom_sites.txt'})
print con.filenames
print con.tables.keys()
In [4]:
# then, add another table to the contribution
# here, we are providing data type but no filename
# this works because we already gave the custom sample filename when we created the contribution
# so the contribution already knows where to look (con.filenames)
con.add_magic_table('samples')
print con.tables.keys()
In [5]:
# add another table to the same contribution
# this time, provide a filename but no data type
con.add_magic_table(dtype="unknown", fname="criteria.txt")
# criteria table now included
print con.tables.keys()
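add_magic_table can work out the table type because MagIC-format text files name their table in the first header line (e.g. "tab <tab> criteria"). A minimal sketch of that kind of header sniffing (a hypothetical helper, not the actual new_builder code):
def sniff_magic_dtype(fname):
    # the first line of a MagIC text file looks like: "tab \t criteria"
    with open(fname) as f:
        header = f.readline()
    return header.strip().split('\t')[1].strip()

print sniff_magic_dtype(os.path.join(wdir, 'criteria.txt'))  # 'criteria'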
In [6]:
# create full McMurdo contribution
reload(nb)
con = nb.Contribution(wdir, custom_filenames={'specimens': 'specimens.txt', 'samples': 'samples.txt',
'sites': 'sites.txt'})
In [7]:
con.tables['locations'].df[['sites']]
Out[7]:
In [8]:
reload(nb)
# rename one of the Contribution's sites
con.rename_item('sites', 'mc03', 'extra_special_site')
# all rows previously named 'mc03' are now named 'extra_special_site'
con.tables['sites'].df.loc[['extra_special_site']]
Out[8]:
In [9]:
# additionally, 'mc03' has been replaced in the locations table's 'sites' column
con.tables['locations'].df[['sites']]
Out[9]:
In [10]:
# normally, each table holds only one relationship up the MagIC hierarchy
# (measurements -> specimens -> samples -> sites -> locations);
# e.g., a measurement table has a specimen name, but not a sample name
# sometimes you need location_name at the site or specimen level;
# propagate_name_down propagates names down through any available tables
# the code snippet below won't work if the Contribution can't access the sample and site files!
reload(nb)
con = nb.Contribution(wdir, custom_filenames={'specimens': 'custom_specimens.txt', 'samples': 'custom_samples.txt',
'sites': 'custom_sites.txt'})
con.propagate_name_down('location', 'specimens')
# the specimens table now has sample, site, and location columns
con.tables['specimens'].df[['specimen', 'sample', 'site', 'location']].head()
Out[10]:
In [11]:
# this function propagates values from arbitrary columns down,
# e.g., to get sample-level azimuth into the measurements table
# note: this will NOT work with name columns (specimen, sample, etc.);
# for those relationships, use propagate_name_down, shown above
reload(nb)
con = nb.Contribution(wdir, custom_filenames={'specimens': 'custom_specimens.txt', 'samples': 'custom_samples.txt',
'sites': 'custom_sites.txt'})
meas_container = con.tables['measurements']
meas_df = meas_container.df
# ('fake_col' does not exist in the samples table)
meas_df = con.propagate_cols_down(['azimuth', 'dip', 'fake_col'], 'measurements', 'samples')
meas_df.head()[['azimuth', 'dip']]
Out[11]:
In [12]:
con = nb.Contribution(wdir, custom_filenames={'specimens': 'custom_specimens.txt', 'samples': 'custom_samples.txt',
'sites': 'custom_sites.txt'})
samp_container = con.tables['samples']
samp_container.write_magic_file(custom_name='_samples.txt', dir_path='../3_0/McMurdo')
samp_container.df
Out[12]:
In [13]:
# columns is a list of whichever columns you want
dtype = "specimens"
cols = ["col_name1", "col_name2"]
data_container = nb.MagicDataFrame(dtype=dtype, columns=cols)
# or:
dtype = "specimens"
groups = ["Age", "Metadata"]
data_container = nb.MagicDataFrame(dtype=dtype, groups=groups)
# and then:
con.tables[dtype] = data_container
con.tables[dtype].df
con.add_empty_magic_table('fake', col_names=['col1', 'col2'])  # 'fake' is not a valid MagIC table name
con.add_empty_magic_table('images', col_names=['col1', 'col2'])
con.tables['images'].df
Out[13]:
In [14]:
reload(nb)
wdir = os.path.join("..", "3_0", "McMurdo")
con = nb.Contribution(wdir)
con.tables.keys()
Out[14]:
In [15]:
# add a new sample
site_name = con.tables['sites'].df.index[0]
samp_name = 'new_sample'
data = {'sample': samp_name, 'site': site_name}
con.add_item('samples', data, samp_name)
con.tables['samples'].df.tail()
Out[15]:
In [16]:
con = nb.Contribution(wdir, custom_filenames={'specimens': 'custom_specimens.txt', 'samples': 'custom_samples.txt',
'sites': 'custom_sites.txt'})
meas_container = con.tables['measurements']
meas_data = meas_container.df
# build a 'treatment' column: use treat_ac_field unless it is "0",
# in which case fall back to treat_temp
meas_data['treatment'] = meas_data['treat_ac_field'].where(cond=meas_data['treat_ac_field'] != "0", other=meas_data['treat_temp'])
meas_data[['treatment', 'treat_ac_field', 'treat_temp']]
# blank out one value, then cast the column to float
meas_data['treat_ac_field'].iloc[0] = None
meas_data['treat_ac_field'] = meas_data['treat_ac_field'].astype(float)
meas_data[['treatment', 'treat_ac_field', 'treat_temp']].head()
Out[16]:
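Series.where keeps values where the condition is True and substitutes `other` where it is False; that direction is easy to get backwards. A small illustration with toy values (not the McMurdo data):
ac_field = pd.Series(["0", "0.01", "0.02"])
temp = pd.Series(["373", "0", "0"])
# keep each AC field value unless it is "0", in which case take the temperature
print ac_field.where(ac_field != "0", other=temp)
# 0     373
# 1    0.01
# 2    0.02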
In [17]:
# grab copies of several tables to play with
criteria = con.tables['criteria'].df.copy()
sites = con.tables['sites'].df.copy()
locations = con.tables['locations'].df.copy()
specimens = con.tables['specimens'].df.copy()
samples = con.tables['samples'].df.copy()
In [18]:
criteria.index
Out[18]:
In [20]:
# get all criteria for samples
# (only criteria whose table_column contains 'sample')
cond = criteria.index.str.contains('sample')
samp_crit = criteria[cond].copy()
# remove the table name from the index
if len(samp_crit):
    samp_crit.index = samp_crit.index.str.replace('samples.', '')
    samp_crit.index.name = 'column_name'
cols = list(samp_crit.index)
samp_crit
Out[20]:
In [21]:
#sites.head().ix[cols]
In [22]:
# create string --> operator conversion
import operator
ops = {"<": operator.lt, ">": operator.gt, "==": operator.eq, "<=": operator.le, ">=": operator.ge}
# (the function that applies these criteria is defined in the next cell)
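Each entry in ops can be called just like the operator it names; a quick sanity check with toy numbers:
print ops[">="](5.0, 3.0)   # True
print ops["<"](5.0, 3.0)    # False
print ops["=="](2.0, 2.0)   # True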
In [23]:
# create full McMurdo contribution
reload(nb)
con = nb.Contribution(wdir, custom_filenames={'specimens': 'specimens.txt', 'samples': 'samples.txt',
'sites': 'sites.txt'})
criteria = con.tables['criteria'].df.copy()
def apply_crit(series, crit_series):
    """
    Apply one criterion (i.e., one row of the criteria table)
    to one row of a data table.
    Return True if the row passes, False if it fails.
    """
    col_name = crit_series.name
    # if there's no value to test, the row passes
    if col_name not in series:
        return True
    if pd.isnull(series[col_name]) or not series[col_name]:
        return True
    # if there is a value, test that it is within the correct limits
    crit_value = float(crit_series['criterion_value'])
    op = ops[crit_series['criterion_operation']]
    value = float(series[col_name])
    return op(value, crit_value)

def add_criteria_named(category_name, dtype):
    """
    Apply every criterion in the named category to the table of type dtype,
    adding one boolean '..._pass' column per criterion.
    Return the list of new column names.
    """
    df = con.tables[dtype].df
    criteria_subset = criteria[criteria['criterion'] == category_name].copy()
    criteria_subset.index = criteria_subset.index.str.replace(dtype + '.', '')
    pass_col_names = []
    for crit_name, crit_row in criteria_subset.iterrows():
        col_name = category_name + "_" + crit_name + "_pass"
        pass_col_names.append(col_name)
        df[col_name] = df.apply(apply_crit, args=(crit_row,), axis=1)
    return pass_col_names

dtype = 'specimens'
criteria_name = 'IE-SPEC'
pass_col_names = add_criteria_named(criteria_name, dtype)
print pass_col_names
df = con.tables[dtype].df
col_names = df.columns[df.columns.str.contains(criteria_name)]
# all specimens that pass all IE-SPEC criteria
df[df[col_names].all(1)]
df.head()[col_names]
Out[23]:
In [24]:
reload(nb)
import pmagpy.pmag
reload(pmagpy.pmag)
con = nb.Contribution(wdir)
con.tables
Out[24]:
In [25]:
site_container = con.tables['sites']
site_df = con.tables['sites'].df
to_drop = [0, 4]
# label-based drop doesn't work here: it also drops extra rows that share
# the same index value (e.g. mc01)
#site_df.drop(site_df.index[to_drop], inplace=True)
# dropping by position works
df = site_df.iloc[sorted(set(range(len(site_df))) - set(to_drop))]
# this works too
df = site_df.iloc[[i for i in range(len(site_df)) if i not in to_drop]]
# as does resetting to a positional index, dropping, and restoring the site index
site_df = site_df.reset_index(drop=True).drop(to_drop).set_index('site')
# find age-related columns
site_df.columns[site_df.columns.str.contains('age')]
site_df.columns[site_df.columns.str.contains("age($|_).*")]
# Solution in new_builder:
site_container.delete_row(1).head()
Out[25]:
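The pitfall above is general pandas behavior: dropping by label removes every row that carries that label, so a duplicated index (like repeated site names) loses more rows than intended. A self-contained toy example:
df = pd.DataFrame({'val': [1, 2, 3]}, index=['mc01', 'mc01', 'mc02'])
print df.drop('mc01')    # drops BOTH rows labeled mc01
print df.iloc[[1, 2]]    # positional indexing removes exactly one row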
In [26]:
# See data_model_conversion.ipynb
In [27]:
# get minimum/maximum latitude/longitude grouped by location
# set up
wdir = os.path.join("..", '3_0', 'McMurdo')
con = nb.Contribution(wdir, single_file='sites.txt')
site_container = con.tables['sites']
site_df = site_container.df
# Fill in some values
site_container.df['lon'] = ''
site_container.df.iloc[1] = pd.Series({'lon': '2', 'location': 'McMurdo'})
site_container.df.iloc[2] = pd.Series({'location': 'McMurdo2', 'lat': '14.2'})
# replace empty strings with None so that dropna() can exclude them
site_container.df['lon'] = np.where(site_container.df['lon'].str.len(), site_container.df['lon'], None)
site_container.df['lat'] = np.where(site_container.df['lat'].str.len(), site_container.df['lat'], None)
# group lat/lon by location name
# cast to float so min/max compare numerically, then group by location name
site_container.df['lon'] = site_container.df['lon'].astype(float)
site_container.df['lat'] = site_container.df['lat'].astype(float)
grouped_lon = site_container.df[['lon', 'location']].dropna().groupby('location')
grouped_lat = site_container.df[['lat', 'location']].dropna().groupby('location')
#grouped_lon = site_container.df['lon'].astype(float).dropna().groupby(site_container.df['location'])
#grouped_lat = site_container.df['lat'].astype(float).dropna().groupby(site_container.df['location'])
# get output
print "max longitude:"
print grouped_lon.max()
print ''
print 'min latitude:'
print grouped_lat.min()
In [28]:
site_container.df[['lon', 'location']].dropna(subset=['lon']).groupby('location')
Out[28]:
In [29]:
reload(nb)
wdir = os.path.join("..", '3_0', 'McMurdo')
con = nb.Contribution(wdir)
In [30]:
print 'lithologies in sample table?', 'lithologies' in con.tables['samples'].df.columns
con.tables['sites'].df['lithologies'] = 'litho1'
con.tables['sites'].df.iloc[3] = pd.Series({'site': 'mc04', 'lithologies': 'litho2'})
con.tables['sites'].df.head()[['lithologies']]
Out[30]:
In [31]:
con.tables['samples'].df['lithologies'] = None
con.tables['samples'].df.loc['mc01f', 'lithologies'] = 'litho3'
con.tables['samples'].df.head()[['lithologies']]
Out[31]:
In [32]:
# you should be able to run this cell repeatedly without generating extra lithologies columns
# (this behavior is a fix in new_builder!)
for_propagation = ['lithologies']
con.propagate_cols_down(for_propagation, 'samples', 'sites')
con.tables['samples'].df.head()
Out[32]:
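One way to achieve that idempotence (a generic sketch under assumed names, not the actual new_builder implementation) is to drop any previously propagated copies of the columns before merging them back in:
def propagate_cols(target_df, source_df, cols, key):
    # drop pre-existing copies so repeated calls don't duplicate columns
    existing = [c for c in cols if c in target_df.columns]
    target_df = target_df.drop(existing, axis=1)
    source_subset = source_df[[key] + cols].drop_duplicates(key)
    # note: merge resets the index; real code would need to restore it
    return target_df.merge(source_subset, on=key, how='left')

samples_df = pd.DataFrame({'sample': ['s1', 's2'], 'site': ['mc01', 'mc02']})
sites_df = pd.DataFrame({'site': ['mc01', 'mc02'], 'lithologies': ['litho1', 'litho2']})
samples_df = propagate_cols(samples_df, sites_df, ['lithologies'], 'site')
samples_df = propagate_cols(samples_df, sites_df, ['lithologies'], 'site')  # still just one column
print samples_df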
In [33]:
# .str accessor methods pass missing values (None/NaN) through as NaN
site_df.iloc[0, 1] = np.nan
site_df.iloc[1, 1] = None
site_df.location.str.split()[:5]
Out[33]:
In [34]:
# timing experiments: how expensive are various null checks and object creations?
import timeit
t = timeit.Timer('char in text', setup='text = "sample string"; char = "g"')
t.timeit()
t = timeit.Timer('if None: pass')
print t.timeit()
# approximately 0.04
t = timeit.Timer('if isinstance(x, type(None)): pass', setup='x=None')
print t.timeit()
# approximately 0.4
t = timeit.Timer('x = None')
print t.timeit(10000)
t = timeit.Timer('x = pd.DataFrame()', setup='import pandas as pd')
print t.timeit(10000)
In [35]:
# clean up files written by this notebook
!rm sites.txt samples.txt specimens.txt measurements.txt ages.txt contribution.txt images.txt criteria.txt locations.txt
!rm *.png
!rm *.jpg
In [ ]: