Try setting OPM_NUM_THREADS=1.
In [1]:
import glob
import itertools
import logging
from operator import attrgetter
import os
import pprint
import time
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.misc
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as hc
from scipy.spatial.distance import pdist
import sklearn.utils
from mrfitty.base import ReferenceSpectrum, Spectrum, InterpolatedReferenceSpectraSet
from mrfitty.best_subset_selection import BestSubsetSelectionFitTask
from mrfitty.combination_fit import AdaptiveEnergyRangeBuilder, AllCombinationFitTask
from mrfitty.loss import NormalizedSumOfSquares, PredictionError
logging.basicConfig(level=logging.WARN)
In [2]:
class InterpolatedSpectrumSet:
def __init__(self, spectrum_set):
self.interpolated_set_df = InterpolatedSpectrumSet.get_interpolated_spectrum_set_df(
spectrum_set=spectrum_set)
@staticmethod
def get_interpolated_spectrum_set_df(energy_range, spectrum_set):
""" Return a pandas.DataFrame of spectrum values interpolated at the specified energies.
"""
# the interpolated spectra will be len(energy_range) x len(reference_set)
interpolated_spectra = np.zeros((len(energy_range), len(spectrum_set)))
column_names = []
for i, spectrum in enumerate(sorted(list(spectrum_set), key=lambda s: s.file_name)):
column_names.append(spectrum.file_name)
interpolated_spectra[:, i] = spectrum.interpolant(energy_range)
# set values that would be extrapolated to NaN
ndx = InterpolatedSpectrumSet.get_extrapolated_value_index(
interpolated_energy=energy_range,
measured_energy=spectrum.data_df.energy.values)
# print(ndx)
interpolated_spectra[ndx, i] = np.nan
interpolated_spectra_df = pd.DataFrame(
data=interpolated_spectra,
index=energy_range,
columns=column_names)
return interpolated_spectra_df
@staticmethod
def get_extrapolated_value_index(interpolated_energy, measured_energy):
"""Return a boolean array with True indicating interpolated energies outside the measured energy range.
:param interpolated_energy (np.array)
:param measured_energy (np.array)
:returns (numpy boolean array)
"""
extrapolated_value_boolean_index = np.logical_or(
interpolated_energy < measured_energy[0],
interpolated_energy > measured_energy[-1] )
return np.where(extrapolated_value_boolean_index)
In [3]:
arsenic_archived_cores_data_dir_path = '/home/jlynch/host/project/th_sln/archived_tills_for_trees_Jan_30_2017/'
os.path.exists(arsenic_archived_cores_data_dir_path)
Out[3]:
Read all arsenic spectra in the core directories.
In [4]:
arsenic_archived_reference_glob = os.path.join(arsenic_archived_cores_data_dir_path, 'As_references/*.e')
print('references glob: {}'.format(arsenic_archived_reference_glob))
arsenic_archived_cores_spectrum_glob = os.path.join(arsenic_archived_cores_data_dir_path, '*/*_As_XANES/*.e')
print('cores glob: {}'.format(arsenic_archived_cores_spectrum_glob))
In [5]:
arsenic_archived_reference_list, _ = list(ReferenceSpectrum.read_all([arsenic_archived_reference_glob]))
print('refrence count: {}'.format(len(arsenic_archived_reference_list)))
arsenic_archived_cores_spectrum_list, _ = list(ReferenceSpectrum.read_all([arsenic_archived_cores_spectrum_glob]))
print('core spectrum count: {}'.format(len(arsenic_archived_cores_spectrum_list)))
What are the maximum and minimum reference energies?
In [6]:
reference_min_energy = np.max([r.data_df.energy.values[0] for r in arsenic_archived_reference_list])
reference_max_energy = np.min([r.data_df.energy.values[-1] for r in arsenic_archived_reference_list])
print('reference minimum energy: {:5.2f}'.format(reference_min_energy))
print('reference maximum energy: {:5.2f}'.format(reference_max_energy))
What are the maximum and minimum core spectrum energies?
In [7]:
min_energy = np.max([r.data_df.energy.values[0] for r in arsenic_archived_cores_spectrum_list])
max_energy = np.min([r.data_df.energy.values[-1] for r in arsenic_archived_cores_spectrum_list])
print('minimum energy: {:5.2f}'.format(min_energy))
print('maximum energy: {:5.2f}'.format(max_energy))
In [8]:
interpolate_energy_range = np.linspace(start=11860.0, stop=11920.0, num=200)
print('interpolate_energy_range.shape: {}'.format(interpolate_energy_range.shape))
print('interpolate_energy_range:\n{}'.format(pprint.pformat(interpolate_energy_range.tolist()[:10])))
In [9]:
# interpolate references and spectra in one data frame because concatentating data frames with a
# floating point index is not working for me
interpolated_arsenic_archived_ref_and_cores_df = InterpolatedSpectrumSet.get_interpolated_spectrum_set_df(
energy_range=interpolate_energy_range,
spectrum_set=set(itertools.chain(arsenic_archived_reference_list, arsenic_archived_cores_spectrum_list)))
interpolated_arsenic_archived_ref_and_cores_df.plot().legend(loc='center left', bbox_to_anchor=(1, 0.5))
interpolated_arsenic_archived_ref_and_cores_df.head()
Out[9]:
In [10]:
def permute_row_elements(df):
for i in range(df.shape[0]):
df.values[i, :] = sklearn.utils.shuffle(df.values[i, :])
return df
# demonstrate permuting row elements
x_df = pd.DataFrame(data=np.array(range(9)).reshape((3,3)))
print('before permuting row elements:')
print(x_df.head())
permute_row_elements(x_df)
print('after permuting row elements:')
print(x_df)
In [11]:
def cluster_with_sig_cut(variable_by_sample_df, title, pdist_metric, linkage_method):
#pdist_metric = 'correlation'
distance_for_sample_pairs = pdist(X=np.transpose(variable_by_sample_df.values), metric=pdist_metric)
print('{}: {} sample pairs'.format(title, len(distance_for_sample_pairs)))
plt.figure()
plt.title(title)
plt.hist(distance_for_sample_pairs)
plt.xlabel('{} distance'.format(pdist_metric))
plt.ylabel('{} pairs'.format(variable_by_sample_df.shape))
plt.show()
resample_count = 1000
expected_distance_list = []
for i in range(resample_count):
# permute the elements of each row of variable_by_sample_df
p_variable_by_sample_df = permute_row_elements(variable_by_sample_df.copy())
p_distance_for_sample_pairs = pdist(X=np.transpose(p_variable_by_sample_df.values), metric=pdist_metric)
p_linkage_distance_variable_by_sample = hc.linkage(y=p_distance_for_sample_pairs, method=linkage_method)
p_dendrogram = hc.dendrogram(Z=p_linkage_distance_variable_by_sample, no_plot=True)
expected_distance_list.extend([d for (_, _, d, _) in p_dendrogram['dcoord']])
p = 95.0
alpha = 1.0 - p/100.0
cutoff_distance = np.percentile(expected_distance_list, q=p)
print('cutoff distance is {}'.format(cutoff_distance))
plt.figure()
plt.hist(expected_distance_list)
plt.title('dendrogram distance null distribution')
plt.show()
linkage_distance_variable_by_sample = hc.linkage(y=distance_for_sample_pairs, method=linkage_method)
plt.figure(figsize=(3.75, 7.0))
dendrogram = hc.dendrogram(
Z=linkage_distance_variable_by_sample,
orientation='left',
labels=variable_by_sample_df.columns)
icoords = [i for i in itertools.chain(dendrogram['icoord'])]
plt.vlines(cutoff_distance, ymin=np.min(icoords), ymax=np.max(icoords))
dendrogram_title = '{}\n{} linkage'.format(title, linkage_method)
plt.title(dendrogram_title)
plt.xlabel('{} distance'.format(pdist_metric))
plt.savefig(title + '.pdf', format='pdf')
plt.show()
In [12]:
for core in ['OTT3', 'TG3', 'UMRB2']:
# combine core and references
ref_column_list = tuple([c for c in interpolated_arsenic_archived_ref_and_cores_df.columns if 'standard' in c])
print('reference column list has {} elements:\n{}'.format(len(ref_column_list), pprint.pformat(ref_column_list)))
core_column_list = tuple([c for c in interpolated_arsenic_archived_ref_and_cores_df.columns if core in c])
print('core {} column list has {} elements:\n{}'.format(core, len(core_column_list), pprint.pformat(core_column_list)))
core_interpolated_arsenic_archived_df = interpolated_arsenic_archived_ref_and_cores_df.loc[:, core_column_list]
core_interpolated_arsenic_archived_df.plot().legend(loc='center left', bbox_to_anchor=(1, 0.5))
core_interpolated_arsenic_archived_df.head()
core_and_ref_column_list = tuple(itertools.chain(core_column_list, ref_column_list))
core_and_ref_interpolated_arsenic_archived_df = interpolated_arsenic_archived_ref_and_cores_df.loc[:, core_and_ref_column_list]
core_and_ref_interpolated_arsenic_archived_df.plot().legend(loc='center left', bbox_to_anchor=(1, 0.5))
core_and_ref_interpolated_arsenic_archived_df.head()
cluster_with_sig_cut(
core_interpolated_arsenic_archived_df,
title='core {} ({} spectra)'.format(core, core_interpolated_arsenic_archived_df.shape[1]),
pdist_metric='correlation',
linkage_method='complete')
cluster_with_sig_cut(
core_and_ref_interpolated_arsenic_archived_df,
title='core {} and references ({} spectra)'.format(core, core_and_ref_interpolated_arsenic_archived_df.shape[1]),
pdist_metric='correlation',
linkage_method='complete')
In [13]:
# all cores
ref_column_list = tuple([c for c in interpolated_arsenic_archived_ref_and_cores_df.columns if 'standard' in c])
print('reference column list has {} elements:\n{}'.format(len(ref_column_list), pprint.pformat(ref_column_list)))
core_column_list = tuple([c for c in interpolated_arsenic_archived_ref_and_cores_df.columns if 'standard' not in c])
print('all cores column list has {} elements:\n{}'.format(core, len(core_column_list), pprint.pformat(core_column_list)))
core_interpolated_arsenic_archived_df = interpolated_arsenic_archived_ref_and_cores_df.loc[:, core_column_list]
core_interpolated_arsenic_archived_df.plot().legend(loc='center left', bbox_to_anchor=(1, 0.5))
core_interpolated_arsenic_archived_df.head()
core_and_ref_column_list = tuple(itertools.chain(core_column_list, ref_column_list))
core_and_ref_interpolated_arsenic_archived_df = interpolated_arsenic_archived_ref_and_cores_df.loc[:, core_and_ref_column_list]
core_and_ref_interpolated_arsenic_archived_df.plot().legend(loc='center left', bbox_to_anchor=(1, 0.5))
core_and_ref_interpolated_arsenic_archived_df.head()
cluster_with_sig_cut(
core_interpolated_arsenic_archived_df,
title='As all cores({} spectra)'.format(core_interpolated_arsenic_archived_df.shape[1]),
pdist_metric='correlation',
linkage_method='complete')
cluster_with_sig_cut(
core_and_ref_interpolated_arsenic_archived_df,
title='As all cores and references ({} spectra)'.format(core_and_ref_interpolated_arsenic_archived_df.shape[1]),
pdist_metric='correlation',
linkage_method='complete')
In [ ]: