This notebook demonstrates how code from the mr-fitty package can be used to develop additional tools for working with XAFS data.
Note: when using MKL set OPM_NUM_THREADS=1.
In [1]:
import glob
import itertools
import logging
from operator import attrgetter
import os
import pprint
import sys
import time
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.misc
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as hc
from scipy.spatial.distance import pdist
import sklearn.utils
from mrfitty.base import ReferenceSpectrum
from mrfitty.base import InterpolatedSpectrumSet
logging.basicConfig(level=logging.WARN)
In [2]:
sample_data_dir_path = os.path.join(sys.prefix, 'sample_data')
print('sample data is installed at "{}"'.format(sample_data_dir_path))
os.path.exists(sample_data_dir_path)
Out[2]:
Read all sample reference and unknown spectra.
In [3]:
sample_data_reference_glob = os.path.join(sample_data_dir_path, 'reference/*als_cal*.e')
print('sample data reference glob: {}'.format(sample_data_reference_glob))
sample_data_unknown_glob = os.path.join(sample_data_dir_path, 'unknown/*.e')
print('sample data unknown glob: {}'.format(sample_data_unknown_glob))
In [4]:
sample_data_reference_list, _ = list(ReferenceSpectrum.read_all([sample_data_reference_glob]))
print('sample data reference file count: {}'.format(len(sample_data_reference_list)))
sample_data_unknown_list, _ = list(ReferenceSpectrum.read_all([sample_data_unknown_glob]))
print('sample data unknown file count: {}'.format(len(sample_data_unknown_list)))
What are the maximum and minimum reference energies?
In [5]:
reference_min_energy = np.max([r.data_df.energy.values[0] for r in sample_data_reference_list])
reference_max_energy = np.min([r.data_df.energy.values[-1] for r in sample_data_reference_list])
print('reference minimum energy: {:5.2f}'.format(reference_min_energy))
print('reference maximum energy: {:5.2f}'.format(reference_max_energy))
What are the maximum and minimum unknown spectrum energies?
In [6]:
min_energy = np.max([r.data_df.energy.values[0] for r in sample_data_unknown_list])
max_energy = np.min([r.data_df.energy.values[-1] for r in sample_data_unknown_list])
print('minimum energy: {:5.2f}'.format(min_energy))
print('maximum energy: {:5.2f}'.format(max_energy))
In [7]:
interpolate_energy_range = np.linspace(start=reference_min_energy, stop=reference_max_energy, num=200)
print('interpolate_energy_range.shape: {}'.format(interpolate_energy_range.shape))
print('interpolate_energy_range:\n{}'.format(pprint.pformat(interpolate_energy_range.tolist()[:10])))
In [8]:
# construct a single DataFrame with all interpolated reference and unknown spectra
interpolated_sample_data_reference_and_unknown_df = InterpolatedSpectrumSet.get_interpolated_spectrum_set_df(
energy_range=interpolate_energy_range,
spectrum_set=set(itertools.chain(sample_data_reference_list, sample_data_unknown_list)))
interpolated_sample_data_reference_and_unknown_df.plot().legend(loc='center left', bbox_to_anchor=(1, 0.5))
interpolated_sample_data_reference_and_unknown_df.head()
Out[8]:
In [9]:
# this is a helper function for cluster_with_sig_cut
def permute_row_elements(df):
for i in range(df.shape[0]):
df.values[i, :] = sklearn.utils.shuffle(df.values[i, :])
return df
# demonstrate permuting row elements
x_df = pd.DataFrame(data=np.array(range(9)).reshape((3,3)))
print('before permuting row elements:')
print(x_df.head())
permute_row_elements(x_df)
print('after permuting row elements:')
print(x_df)
In [10]:
def cluster_with_sig_cut(variable_by_sample_df, title, pdist_metric, linkage_method):
#pdist_metric = 'correlation'
distance_for_sample_pairs = pdist(X=np.transpose(variable_by_sample_df.values), metric=pdist_metric)
print('{}: {} sample pairs'.format(title, len(distance_for_sample_pairs)))
plt.figure()
plt.title(title)
plt.hist(distance_for_sample_pairs)
plt.xlabel('{} distance'.format(pdist_metric))
plt.ylabel('{} pairs'.format(variable_by_sample_df.shape))
plt.show()
resample_count = 1000
expected_distance_list = []
for i in range(resample_count):
# permute the elements of each row of variable_by_sample_df
p_variable_by_sample_df = permute_row_elements(variable_by_sample_df.copy())
p_distance_for_sample_pairs = pdist(X=np.transpose(p_variable_by_sample_df.values), metric=pdist_metric)
p_linkage_distance_variable_by_sample = hc.linkage(y=p_distance_for_sample_pairs, method=linkage_method)
p_dendrogram = hc.dendrogram(Z=p_linkage_distance_variable_by_sample, no_plot=True)
expected_distance_list.extend([d for (_, _, d, _) in p_dendrogram['dcoord']])
p = 95.0
alpha = 1.0 - p/100.0
cutoff_distance = np.percentile(expected_distance_list, q=p)
print('cutoff distance is {}'.format(cutoff_distance))
plt.figure()
plt.hist(expected_distance_list)
plt.title('dendrogram distance null distribution')
plt.show()
linkage_distance_variable_by_sample = hc.linkage(y=distance_for_sample_pairs, method=linkage_method)
plt.figure(figsize=(3.7, 7))
dendrogram = hc.dendrogram(
Z=linkage_distance_variable_by_sample,
orientation='left',
labels=variable_by_sample_df.columns)
icoords = [i for i in itertools.chain(dendrogram['icoord'])]
plt.vlines(cutoff_distance, ymin=np.min(icoords), ymax=np.max(icoords))
plt.title('{}\n{} linkage'.format(title, linkage_method))
plt.xlabel('{} distance'.format(pdist_metric))
plt.savefig(title + '.pdf', format='pdf')
plt.show()
In [13]:
for core in ['ott3',]:
ref_column_list = tuple([
c for
c in interpolated_sample_data_reference_and_unknown_df.columns
if any(['als_cal' in c, 'sln' in c, 'OA' in c])])
print('reference column list has {} elements:\n{}'.format(len(ref_column_list), pprint.pformat(ref_column_list)))
unknown_column_list = tuple([
c for
c in interpolated_sample_data_reference_and_unknown_df.columns
if core in c.lower()])
print('core {} column list has {} elements:\n{}'.format(core, len(unknown_column_list), pprint.pformat(unknown_column_list)))
unknown_interpolated_df = interpolated_sample_data_reference_and_unknown_df.loc[:, unknown_column_list]
unknown_interpolated_df.plot().legend(loc='center left', bbox_to_anchor=(1, 0.5))
unknown_interpolated_df.head()
unknown_and_reference_column_list = tuple(itertools.chain(unknown_column_list, ref_column_list))
unknown_and_reference_interpolated_df = interpolated_sample_data_reference_and_unknown_df.loc[:, unknown_and_reference_column_list]
unknown_and_reference_interpolated_df.plot().legend(loc='center left', bbox_to_anchor=(1, 0.5))
unknown_and_reference_interpolated_df.head()
cluster_with_sig_cut(
unknown_interpolated_df,
title='As unknown {} ({} spectra)'.format(core, unknown_interpolated_df.shape[1]),
pdist_metric='correlation',
linkage_method='complete')
cluster_with_sig_cut(
unknown_and_reference_interpolated_df,
title='As unknown {} and references ({} spectra)'.format(core, unknown_and_reference_interpolated_df.shape[1]),
pdist_metric='correlation',
linkage_method='complete')
In [ ]: