Try setting OPM_NUM_THREADS=1.
In [1]:
import glob
import itertools
import logging
from operator import attrgetter
import os
import pprint
import time
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.misc
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as hc
from scipy.spatial.distance import pdist
import sklearn.utils
from mrfitty.base import ReferenceSpectrum
from mrfitty.base import InterpolatedSpectrumSet
logging.basicConfig(level=logging.WARN)
In [2]:
iron_archived_cores_data_dir_path = '/home/jlynch/host/project/th_sln/archived_tills_for_trees_Jan_30_2017/'
os.path.exists(iron_archived_cores_data_dir_path)
Out[2]:
Read all iron spectra in the core directories.
In [3]:
iron_archived_reference_glob = os.path.join(iron_archived_cores_data_dir_path, 'Fe_references/*.e')
print('references glob: {}'.format(iron_archived_reference_glob))
iron_archived_cores_spectrum_glob = os.path.join(iron_archived_cores_data_dir_path, '*/*_Fe_XANES/*.e')
print('cores glob: {}'.format(iron_archived_cores_spectrum_glob))
In [4]:
iron_archived_reference_list, _ = list(ReferenceSpectrum.read_all([iron_archived_reference_glob]))
print('refrence count: {}'.format(len(iron_archived_reference_list)))
iron_archived_cores_spectrum_list, _ = list(ReferenceSpectrum.read_all([iron_archived_cores_spectrum_glob]))
print('core spectrum count: {}'.format(len(iron_archived_cores_spectrum_list)))
What are the maximum and minimum reference energies?
In [5]:
reference_min_energy = np.max([r.data_df.energy.values[0] for r in iron_archived_reference_list])
reference_max_energy = np.min([r.data_df.energy.values[-1] for r in iron_archived_reference_list])
print('reference minimum energy: {:5.2f}'.format(reference_min_energy))
print('reference maximum energy: {:5.2f}'.format(reference_max_energy))
What are the maximum and minimum core spectrum energies?
In [6]:
min_energy = np.max([r.data_df.energy.values[0] for r in iron_archived_cores_spectrum_list])
max_energy = np.min([r.data_df.energy.values[-1] for r in iron_archived_cores_spectrum_list])
print('minimum energy: {:5.2f}'.format(min_energy))
print('maximum energy: {:5.2f}'.format(max_energy))
In [7]:
interpolate_energy_range = np.linspace(start=7100.0, stop=7250.0, num=200)
print('interpolate_energy_range.shape: {}'.format(interpolate_energy_range.shape))
print('interpolate_energy_range:\n{}'.format(pprint.pformat(interpolate_energy_range.tolist()[:10])))
In [8]:
# interpolate references and spectra in one data frame because concatentating data frames with a
# floating point index is not working for me
interpolated_iron_archived_ref_and_cores_df = InterpolatedSpectrumSet.get_interpolated_spectrum_set_df(
energy_range=interpolate_energy_range,
spectrum_set=set(itertools.chain(iron_archived_reference_list, iron_archived_cores_spectrum_list)))
interpolated_iron_archived_ref_and_cores_df.plot().legend(loc='center left', bbox_to_anchor=(1, 0.5))
interpolated_iron_archived_ref_and_cores_df.head()
Out[8]:
In [9]:
def permute_row_elements(df):
for i in range(df.shape[0]):
df.values[i, :] = sklearn.utils.shuffle(df.values[i, :])
return df
# demonstrate permuting row elements
x_df = pd.DataFrame(data=np.array(range(9)).reshape((3,3)))
print('before permuting row elements:')
print(x_df.head())
permute_row_elements(x_df)
print('after permuting row elements:')
print(x_df)
In [10]:
def cluster_with_sig_cut(variable_by_sample_df, title, pdist_metric, linkage_method):
#pdist_metric = 'correlation'
distance_for_sample_pairs = pdist(X=np.transpose(variable_by_sample_df.values), metric=pdist_metric)
print('{}: {} sample pairs'.format(title, len(distance_for_sample_pairs)))
plt.figure()
plt.title(title)
plt.hist(distance_for_sample_pairs)
plt.xlabel('{} distance'.format(pdist_metric))
plt.ylabel('{} pairs'.format(variable_by_sample_df.shape))
plt.show()
resample_count = 1000
expected_distance_list = []
for i in range(resample_count):
# permute the elements of each row of variable_by_sample_df
p_variable_by_sample_df = permute_row_elements(variable_by_sample_df.copy())
p_distance_for_sample_pairs = pdist(X=np.transpose(p_variable_by_sample_df.values), metric=pdist_metric)
p_linkage_distance_variable_by_sample = hc.linkage(y=p_distance_for_sample_pairs, method=linkage_method)
p_dendrogram = hc.dendrogram(Z=p_linkage_distance_variable_by_sample, no_plot=True)
expected_distance_list.extend([d for (_, _, d, _) in p_dendrogram['dcoord']])
p = 95.0
alpha = 1.0 - p/100.0
cutoff_distance = np.percentile(expected_distance_list, q=p)
print('cutoff distance is {}'.format(cutoff_distance))
plt.figure()
plt.hist(expected_distance_list)
plt.title('dendrogram distance null distribution')
plt.show()
linkage_distance_variable_by_sample = hc.linkage(y=distance_for_sample_pairs, method=linkage_method)
plt.figure(figsize=(3.7, 7))
dendrogram = hc.dendrogram(
Z=linkage_distance_variable_by_sample,
orientation='left',
labels=variable_by_sample_df.columns)
icoords = [i for i in itertools.chain(dendrogram['icoord'])]
plt.vlines(cutoff_distance, ymin=np.min(icoords), ymax=np.max(icoords))
plt.title('{}\n{} linkage'.format(title, linkage_method))
plt.xlabel('{} distance'.format(pdist_metric))
plt.savefig(title + '.pdf', format='pdf')
plt.show()
In [11]:
for core in ['OTT3', 'TG3', 'UMRB2']:
# combine core and references
ref_column_list = tuple([c for c in interpolated_iron_archived_ref_and_cores_df.columns if 'standard' in c])
print('reference column list has {} elements:\n{}'.format(len(ref_column_list), pprint.pformat(ref_column_list)))
core_column_list = tuple([c for c in interpolated_iron_archived_ref_and_cores_df.columns if core in c])
print('core {} column list has {} elements:\n{}'.format(core, len(core_column_list), pprint.pformat(core_column_list)))
core_interpolated_iron_archived_df = interpolated_iron_archived_ref_and_cores_df.loc[:, core_column_list]
core_interpolated_iron_archived_df.plot().legend(loc='center left', bbox_to_anchor=(1, 0.5))
core_interpolated_iron_archived_df.head()
core_and_ref_column_list = tuple(itertools.chain(core_column_list, ref_column_list))
core_and_ref_interpolated_iron_archived_df = interpolated_iron_archived_ref_and_cores_df.loc[:, core_and_ref_column_list]
core_and_ref_interpolated_iron_archived_df.plot().legend(loc='center left', bbox_to_anchor=(1, 0.5))
core_and_ref_interpolated_iron_archived_df.head()
cluster_with_sig_cut(
core_interpolated_iron_archived_df,
title='Fe core {} ({} spectra)'.format(core, core_interpolated_iron_archived_df.shape[1]),
pdist_metric='correlation',
linkage_method='complete')
cluster_with_sig_cut(
core_and_ref_interpolated_iron_archived_df,
title='Fe core {} and references ({} spectra)'.format(core, core_and_ref_interpolated_iron_archived_df.shape[1]),
pdist_metric='correlation',
linkage_method='complete')
In [12]:
# all cores
ref_column_list = tuple([c for c in interpolated_iron_archived_ref_and_cores_df.columns if 'standard' in c])
print('reference column list has {} elements:\n{}'.format(len(ref_column_list), pprint.pformat(ref_column_list)))
core_column_list = tuple([c for c in interpolated_iron_archived_ref_and_cores_df.columns if 'standard' not in c])
print('all cores column list has {} elements:\n{}'.format(core, len(core_column_list), pprint.pformat(core_column_list)))
core_interpolated_iron_archived_df = interpolated_iron_archived_ref_and_cores_df.loc[:, core_column_list]
core_interpolated_iron_archived_df.plot().legend(loc='center left', bbox_to_anchor=(1, 0.5))
core_interpolated_iron_archived_df.head()
core_and_ref_column_list = tuple(itertools.chain(core_column_list, ref_column_list))
core_and_ref_interpolated_iron_archived_df = interpolated_iron_archived_ref_and_cores_df.loc[:, core_and_ref_column_list]
core_and_ref_interpolated_iron_archived_df.plot().legend(loc='center left', bbox_to_anchor=(1, 0.5))
core_and_ref_interpolated_iron_archived_df.head()
cluster_with_sig_cut(
core_interpolated_iron_archived_df,
title='Fe all cores({} spectra)'.format(core_interpolated_iron_archived_df.shape[1]),
pdist_metric='correlation',
linkage_method='complete')
cluster_with_sig_cut(
core_and_ref_interpolated_iron_archived_df,
title='Fe all cores and references ({} spectra)'.format(core_and_ref_interpolated_iron_archived_df.shape[1]),
pdist_metric='correlation',
linkage_method='complete')
In [ ]: