In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

import latools as la

from IPython.display import HTML

from comparison_tools import helpers, stats_1sample, plots_1sample

%matplotlib inline

Data Collection Parameters


In [2]:
HTML(filename="./Parameter_Tables/iolite_data.html")


Out[2]:
Parameter Value
Laser Instrument Photon M G2 System
Energy 1.48 J/cm3
Wavelength 193 nm
Pulse Length nanosecond
Repetition Rate 4 Hz
Spot Size 35x35 μm square
Interface Sample Cell Type 2 Volume Helex
Gas Flows 1 L min-1 He
8 ml min-1 N2
Transfer Devices Squid, particle filter
Mass Spec Type Thermo iCAP-Q Quadrupole ICPM-MS
11B 50 ms
24Mg 40 ms
25Mg 70 ms
27Al 30 ms
43Ca 80 ms
55Mn 30 ms
57Fe 20 ms
63Cu 20 ms
66Zn 20 ms
88Sr 50 ms
Cycle Time 410 ms

Import Data


In [3]:
# define data format description so it can be imported by latools
dataformat = {'genfromtext_args': {'delimiter': ',',
                                   'skip_header': 15},
              'column_id': {'name_row': 13,
                            'delimiter': ',',
                            'timecolumn': 0,
                            'pattern': '([0-9]{1,3}[A-z]{1,2})'},
              'meta_regex': {0: (['name', 'date'],
                                 '([A-z0-9-]+):([0-9/ :AMP]+);')}
             }

In [4]:
dat = la.analyse('raw_data/iolite_data', internal_standard='43Ca', srm_identifier='NIST610',
                 dataformat=dataformat, names='metadata_names')


Loading Data:   5%|▍         | 11/225 [00:00<00:01, 107.76it/s]
Starting analysis using "UCD-AGILENT" configuration:
Loading Data: 100%|██████████| 225/225 [00:00<00:00, 314.09it/s]
  225 Data Files Loaded: 15 standards, 210 samples
  Analytes: 11B 24Mg 25Mg 27Al 43Ca 55Mn 57Fe 63Cu 66Zn 88Sr
  Internal Standard: 43Ca


In [5]:
sample = '1308H1-1e'

In [6]:
dat.data[sample].tplot()  # view raw data


Out[6]:
(<Figure size 720x288 with 1 Axes>,
 <matplotlib.axes._axes.Axes at 0x7f7c41b67438>)

Data Processing


In [7]:
# despiking
dat.despike(noise_despiker=True, win=3)


Despiking: 100%|██████████| 225/225 [00:00<00:00, 1375.82it/s]

In [8]:
dat.data[sample].tplot()  # view despiked data


Out[8]:
(<Figure size 720x288 with 1 Axes>,
 <matplotlib.axes._axes.Axes at 0x7f7c416760b8>)

In [9]:
dat.autorange(on_mult=[3,.6], off_mult=[.5,3])


AutoRange: 100%|██████████| 225/225 [00:04<00:00, 58.09it/s]

*****************************************
                 WARNING
*****************************************
Autorange failed for some samples:
  1308H1-13c : 58.7
  1308H1-7a  : 59.5

*** THIS IS NOT NECESSARILY A PROBLEM ***
But please check the plots below to make
sure they look OK. Failures are marked by
dashed vertical red lines.

To examine an autorange failure in more
detail, use the `autorange_plot` method
of the failing data object, e.g.:
dat.data['Sample'].autorange_plot(params)
*****************************************


In [10]:
dat.data[sample].tplot(ranges=True)  # view autorange info


Out[10]:
(<Figure size 720x288 with 1 Axes>,
 <matplotlib.axes._axes.Axes at 0x7f7c403101d0>)

In [11]:
dat.bkg_calc_weightedmean(weight_fwhm=1000, bkg_filter=True)

In [12]:
fig, ax = dat.bkg_plot()


Plotting backgrounds: 100%|██████████| 10/10 [00:15<00:00,  2.29s/it]

In [13]:
dat.bkg_subtract()


Background Subtraction: 100%|██████████| 225/225 [00:06<00:00, 33.31it/s]

In [14]:
dat.data[sample].tplot(ranges=True)  # view background subtracted data


Out[14]:
(<Figure size 720x288 with 1 Axes>,
 <matplotlib.axes._axes.Axes at 0x7f7c2ced2c50>)

In [15]:
dat.ratio('43Ca')


Ratio Calculation: 100%|██████████| 225/225 [00:00<00:00, 265.28it/s]

In [16]:
dat.data[sample].tplot(ranges=True)  # view ratio data


Out[16]:
(<Figure size 720x288 with 1 Axes>,
 <matplotlib.axes._axes.Axes at 0x7f7c2a508be0>)

In [17]:
dat.calibrate(srms_used=['NIST610'])


Applying Calibrations: 100%|██████████| 225/225 [00:06<00:00, 32.88it/s]

In [18]:
_ = dat.calibration_plot()


Filtering


In [19]:
dat.filter_clear()

In [20]:
_ = dat.crossplot()


Drawing Plots: 100%|██████████| 36/36 [00:00<00:00, 291.39it/s]

Crossplot shows two clear contaminant clusters:

  1. Elevated Fe and Mn
  2. Elevated Al and Zn

Both are likely clays, and can be removed with clustering filters.


In [21]:
dat.filter_clustering(['55Mn', '57Fe'], method='kmeans', n_clusters=2, level='population')

dat.filter_clustering(['27Al', '66Zn'], method='kmeans', n_clusters=2, level='population')


Applying 55Mn_57Fe_kmeans classifier: 100%|██████████| 210/210 [00:00<00:00, 1446.27it/s]
Applying 27Al_66Zn_kmeans classifier: 100%|██████████| 210/210 [00:00<00:00, 1148.71it/s]

In [22]:
dat.filter_on('_0', show_status=True)


Subset: All Samples

n  Filter Name           11B    24Mg   25Mg   27Al   43Ca   55Mn   57Fe   63Cu   66Zn   88Sr   
0  55Mn_57Fe_kmeans_0    True   True   True   True   True   True   True   True   True   True   
1  55Mn_57Fe_kmeans_1    False  False  False  False  False  False  False  False  False  False  
2  27Al_66Zn_kmeans_0    True   True   True   True   True   True   True   True   True   True   
3  27Al_66Zn_kmeans_1    False  False  False  False  False  False  False  False  False  False  


In [23]:
_ = dat.crossplot(filt=True)


Drawing Plots: 100%|██████████| 36/36 [00:00<00:00, 223.46it/s]

There are still a lot of points elevated in Al, Mn, Fe and Zn, which tend to be associated with clay contaminants.

If associated with contaminants, they also likely have non-static signals. Let's have a look at the gradient crossplot for these elements.


In [24]:
_ = dat.gradient_crossplot(['27Al', '55Mn', '57Fe', '66Zn'], win=5, filt=True)


Calculating Gradients: 100%|██████████| 210/210 [00:04<00:00, 43.23it/s]
Drawing Plots: 100%|██████████| 6/6 [00:00<00:00, 310.94it/s]

Al and Fe seem to have some highly variable bits - remove the 5% most variable bits with a gradient threshold filter.


In [25]:
dat.filter_gradient_threshold_percentile('27Al', [2.5, 92.5], win=5, filt=True)


Calculating Gradients: 100%|██████████| 210/210 [00:01<00:00, 117.13it/s]
Percentile Threshold Filter: 100%|██████████| 210/210 [00:00<00:00, 233.22it/s]

In [26]:
dat.filter_gradient_threshold_percentile('57Fe', [2.5, 97.5], win=5, filt=True)


Calculating Gradients: 100%|██████████| 210/210 [00:02<00:00, 79.90it/s]
Percentile Threshold Filter: 100%|██████████| 210/210 [00:01<00:00, 197.63it/s]

In [27]:
dat.filter_gradient_threshold_percentile('66Zn', [2.5, 97.5], win=5, filt=True)


Calculating Gradients: 100%|██████████| 210/210 [00:03<00:00, 68.48it/s]
Percentile Threshold Filter: 100%|██████████| 210/210 [00:00<00:00, 233.11it/s]

In [28]:
# dat.filter_gradient_threshold_percentile('11B', [2.5, 97.5], win=5, filt=True)

In [29]:
dat.filter_on('27Al_2.5-95.0-grd-pcnt_inside')
dat.filter_on('57Fe_2.5-95.0-grd-pcnt_inside')
dat.filter_on('11B_2.5-97.5-grd-pcnt_inside', show_status=True)


Subset: All Samples

n  Filter Name                       11B    24Mg   25Mg   27Al   43Ca   55Mn   57Fe   63Cu   66Zn   88Sr   
0  55Mn_57Fe_kmeans_0                True   True   True   True   True   True   True   True   True   True   
1  55Mn_57Fe_kmeans_1                False  False  False  False  False  False  False  False  False  False  
2  27Al_66Zn_kmeans_0                True   True   True   True   True   True   True   True   True   True   
3  27Al_66Zn_kmeans_1                False  False  False  False  False  False  False  False  False  False  
4  27Al_2.5-92.5-grd-pcnt_inside     True   True   True   True   True   True   True   True   True   True   
5  27Al_2.5-92.5-grd-pcnt_outside    False  False  False  False  False  False  False  False  False  False  
6  57Fe_2.5-97.5-grd-pcnt_inside     True   True   True   True   True   True   True   True   True   True   
7  57Fe_2.5-97.5-grd-pcnt_outside    False  False  False  False  False  False  False  False  False  False  
8  66Zn_2.5-97.5-grd-pcnt_inside     True   True   True   True   True   True   True   True   True   True   
9  66Zn_2.5-97.5-grd-pcnt_outside    False  False  False  False  False  False  False  False  False  False  


In [30]:
fig, axs = dat.gradient_crossplot(filt=True, mode='scatter', win=5)


Calculating Gradients: 100%|██████████| 210/210 [00:15<00:00, 13.65it/s]
Drawing Plots: 100%|██████████| 36/36 [00:00<00:00, 47.97it/s]

Everything looks a lot cleaner now... what about the concentrations?


In [31]:
fig, axs = dat.crossplot(filt=True, mode='scatter')


Drawing Plots: 100%|██████████| 36/36 [00:00<00:00, 48.80it/s]

There are a few wild outliers - remove them using concentration thresholds.


In [32]:
dat.filter_threshold('27Al', 0.5e-3)  # this will remove quite a few data points, but is a very high Al/Ca for forams
dat.filter_threshold('66Zn', 150e-6)  # | 
dat.filter_threshold('63Cu', 100e-6)  # | - to remove v high values
dat.filter_threshold('55Mn', 90e-6)   # |
dat.filter_threshold('11B', 500e-6)  # To remove a few flyers


Threshold Filter: 100%|██████████| 210/210 [00:00<00:00, 11288.59it/s]
Threshold Filter: 100%|██████████| 210/210 [00:00<00:00, 11284.69it/s]
Threshold Filter: 100%|██████████| 210/210 [00:00<00:00, 10437.80it/s]
Threshold Filter: 100%|██████████| 210/210 [00:00<00:00, 7938.53it/s]
Threshold Filter: 100%|██████████| 210/210 [00:00<00:00, 9495.92it/s]

In [33]:
dat.filter_on([10, 12, 14, 16, 18], show_status=True)


Subset: All Samples

n  Filter Name                       11B    24Mg   25Mg   27Al   43Ca   55Mn   57Fe   63Cu   66Zn   88Sr   
0  55Mn_57Fe_kmeans_0                True   True   True   True   True   True   True   True   True   True   
1  55Mn_57Fe_kmeans_1                False  False  False  False  False  False  False  False  False  False  
2  27Al_66Zn_kmeans_0                True   True   True   True   True   True   True   True   True   True   
3  27Al_66Zn_kmeans_1                False  False  False  False  False  False  False  False  False  False  
4  27Al_2.5-92.5-grd-pcnt_inside     True   True   True   True   True   True   True   True   True   True   
5  27Al_2.5-92.5-grd-pcnt_outside    False  False  False  False  False  False  False  False  False  False  
6  57Fe_2.5-97.5-grd-pcnt_inside     True   True   True   True   True   True   True   True   True   True   
7  57Fe_2.5-97.5-grd-pcnt_outside    False  False  False  False  False  False  False  False  False  False  
8  66Zn_2.5-97.5-grd-pcnt_inside     True   True   True   True   True   True   True   True   True   True   
9  66Zn_2.5-97.5-grd-pcnt_outside    False  False  False  False  False  False  False  False  False  False  
10 27Al_thresh_below                 True   True   True   True   True   True   True   True   True   True   
11 27Al_thresh_above                 False  False  False  False  False  False  False  False  False  False  
12 66Zn_thresh_below                 True   True   True   True   True   True   True   True   True   True   
13 66Zn_thresh_above                 False  False  False  False  False  False  False  False  False  False  
14 63Cu_thresh_below                 True   True   True   True   True   True   True   True   True   True   
15 63Cu_thresh_above                 False  False  False  False  False  False  False  False  False  False  
16 55Mn_thresh_below                 True   True   True   True   True   True   True   True   True   True   
17 55Mn_thresh_above                 False  False  False  False  False  False  False  False  False  False  
18 11B_thresh_below                  True   True   True   True   True   True   True   True   True   True   
19 11B_thresh_above                  False  False  False  False  False  False  False  False  False  False  


In [34]:
# a last look at the data
_ = dat.crossplot(filt=True)


Drawing Plots: 100%|██████████| 36/36 [00:00<00:00, 288.89it/s]

In [35]:
# and should look at all the traces individualls
# dat.trace_plots(filt=True)

In [36]:
dat.data['1308H1-10c'].tplot(filt=True)  # view ratio data


Out[36]:
(<Figure size 720x288 with 1 Axes>,
 <matplotlib.axes._axes.Axes at 0x7f7c087c8a90>)

In [37]:
dat.sample_stats(stats=['mean'])


Calculating Stats: 100%|██████████| 225/225 [00:27<00:00,  8.32it/s]

In [38]:
ld = dat.getstats() * 1e3

Save minimal export

This produces a .zip file containing all the raw data, the SRM values you used, and a record of everything you did to the data.

the zip file can be opened with latools.reproduce, to recreate your entire analysis.


In [39]:
dat.minimal_export(path='raw_data/iolite_data_export/minimal_export.zip')

Load Data


In [40]:
import re
import string

In [41]:
# ld = pd.read_csv('Cleaning test/RUN1_export/stat_export.csv', index_col=[0,1,2])
# ld *= 1e3  # convert to mmol/mol
ld = ld.loc['mean', :].dropna().reset_index()

# extract sample and replicate names
mtch = re.compile('([A-Z0-9]+)[_-]([0-9]+)([a-z]?)')\

for i, s in ld['sample'].iteritems():
    gs = mtch.match(s).groups()
    
    if gs[-1] == '':
        r = gs[1]
        s = gs[0]
    else:
        r = string.ascii_lowercase.index(gs[-1])
        s = gs[0] + '_' + gs[1]
    
    ld.loc[i, 'rep'] = r
    ld.loc[i, 'sample'] = s

ld.set_index(['sample', 'rep'], inplace=True)

In [42]:
rd = helpers.load_reference_data('iolite_reference')

In [43]:
df = rd.join(ld)

In [44]:
# no unique samples
np.unique(rd.index.levels[0]).size


Out[44]:
89

Characterise reproducibility in Reference data


In [45]:
_ = stats_1sample.pairwise_reproducibility(ld, plot=True)



In [46]:
_, rep_dists, rep_stats, _ = stats_1sample.pairwise_reproducibility(rd, plot=True)



In [47]:
rep = pd.DataFrame(rep_stats).T
rep.columns = ['50%', '95%']
rep = rep.reindex(['Mg/Ca', 'Sr/Ca', 'Ba/Ca', 'Al/Ca', 'Mn/Ca', 'Cu/Ca', 'Fe/Ca', 'Zn/Ca', 'B/Ca'])

In [48]:
rep.to_csv('reproducibility_quants.csv')

Compare


In [49]:
# _ = plots_1sample.comparison_plots(df)

In [50]:
# _ = plots_1sample.residual_plots(df)

In [51]:
from comparison_tools import plots_1sample

In [52]:
fig, axs = plots_1sample.bland_altman_plots(df, rep_stats)



In [53]:
fig.savefig('Figures/iolite_comparison.pdf')
fig.savefig('Figures/iolite_comparison.png', dpi=200)

Summary Statistics


In [54]:
stat = stats_1sample.comparison_stats(df)

In [55]:
stat.to_csv('Stats/iolite_stats.csv')