In [24]:
%matplotlib inline
# Basic python libraries.
import sys,csv,urllib,json
from collections import OrderedDict as odict
from pprint import pprint
# Numerical and graphing libraries I used.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from scipy.stats import linregress,norm,binom
from scipy.optimize import fmin,fmin_tnc,fmin_l_bfgs_b
from sklearn.preprocessing import scale
from sklearn.decomposition import NMF,PCA
# Path to my code related to Bushdid et al.
sys.path.append('/Users/rgerkin/Dropbox/science/trillion')
import trillion
In [3]:
CIDs_snitz = {} # To store Snitz et al. descriptors.
CIDs_all = {} # To store all descriptors.
# The 21 molecular descriptors "discovered" in Snitz et al.
snitz_descriptors = ['nCIR','ZM1','GNar','S1K','piPC08','MATS1v','MATS7v',]
snitz_descriptors += ['GATS1v','EEig05x','ESpm02x','ESpm03d','ESpm10d','ESpm13d','BELv3']
snitz_descriptors += ['RDF035v','G1m','G1v','G1e','G3s','R8u+','nRCOSR']
# Create dictionaries of descriptor values for molecules in Snitz using their supplemental table.
# Their supplemental table has >1000 descriptors for each molecule, which is a good fraction of what is in Dragon.
with open('data/snitz_tableS1.csv') as f:
reader = csv.reader(f)
descriptors = reader.__next__()[1:]
for line in reader:
CID = int(float(line[0]))
values = [float(_) for i,_ in enumerate(line[1:]) if descriptors[i] in snitz_descriptors]
CIDs_snitz[CID] = odict(zip(snitz_descriptors,values))
values_all = [float(_) for i,_ in enumerate(line[1:])]
CIDs_all[CID] = odict(zip(descriptors,values_all))
# The descriptors in the Snitz supplemental table are already normalized to 0.0 (min value across molecules) to 1.0 (max).
# Make a matrix of CIDs by descriptor values.
x = np.array([[CIDs_all[CID][descriptor] for descriptor in descriptors] for CID in CIDs_all.keys()])
#x = scale(x) # Scale to mean=0, variance=1.
CIDs_all_normed,CIDs_normed = {},{}
for i,CID in enumerate(CIDs_all):
CIDs_all_normed[CID] = {}
CIDs_normed[CID] = {}
for j,descriptor in enumerate(descriptors):
CIDs_all_normed[CID][descriptor] = x[i,j]
if descriptor in snitz_descriptors:
CIDs_normed[CID][descriptor] = x[i,j]
In [4]:
print("There are %d CIDs and %d molecular descriptors" % (x.shape[0],x.shape[1]))
In [5]:
# Verify that x matrix and dictionary are in register, as intended by use of OrderedDict.
print(list(list(CIDs_all.values())[0].values())[:10]) # First 10 molecular descriptors for first molecule.
print(x[0,:10]) # First 10 molecular descriptors for first molecule.
In [6]:
CIDs_snitz[8193] # Ordered list of Snitz descriptors and corresponding values for the molecule with (for example) CID 8193.
Out[6]:
In [7]:
# Load the mixture components from Bushdid et al. using their supplemental information (Bushdid-tableS1.csv).
components = trillion.load_components()
# Load the Bushdid et al. odorants, tests, and test results using their supplemental information (Bushdid-tableS2.csv).
odorants,tests,results = trillion.load_odorants_tests_results(components)
In [7]:
# Get an idea for the largest number of dimensions in the molecular descriptor data set.
pca = PCA()
pca.fit(x)
plt.plot(pca.explained_variance_ratio_.cumsum())
plt.xscale('log')
plt.xlim(0,1000)
plt.ylim(0,1.01)
plt.xlabel('Number of PCA components')
plt.ylabel('Fraction of variance explained')
Out[7]:
In [8]:
# Non-negative matrix factorization of the x matrix.
nmf_200 = NMF(n_components=200, nls_max_iter=500) # 200 determined to be reasonable in a separate analysis.
nmf_200.fit(x) # Fit the x matrix to this NMF basis.
Out[8]:
In [9]:
print("Total reconstruction error using %d components is %.2f" % (nmf_200.components_.shape[1],nmf_200.reconstruction_err_))
x_nmf_200 = nmf_200.transform(x) # Transformation of x using this NMF basis.
print("Transformed matrix has %d molecules and %d descriptor dimensions" % (x_nmf_200.shape[0],x_nmf_200.shape[1]))
In [10]:
# Create a new ordered dictionary from the NMF transformation.
CIDs_nmf_200 = {}
for i,CID in enumerate(CIDs_all):
CIDs_nmf_200[CID] = odict()
for j in range(x_nmf_200.shape[1]):
CIDs_nmf_200[CID][j] = x_nmf_200[i,j]
In [61]:
fraction_correct = {}
for test in tests.values():
if test.N > 1 and test.r != 8:
key = '%d_%d' % (test.N,test.r)
if key not in fraction_correct:
fraction_correct[key] = []
fraction_correct[key].append(test.fraction_correct(results))
variances = []
for key,values in fraction_correct.items():
values = np.array(values)
expected = 26*values.mean()*(1-values.mean())
observed = (values*26).var()
variances.append((key,expected/observed,values.mean()))
variances = sorted(variances,key=lambda x:x[1])
print('N\tr\tVar exp\Var obs\tf_correct')
for variance in variances:
N,r = variance[0].split('_')
print("%d\t%d\t%.3f\t\t%.3f" % (int(N),int(r),variance[1],variance[2]))
plt.scatter(list(zip(*variances))[2],list(zip(*variances))[1])
plt.xlabel('Fraction correct')
_ = plt.ylabel('Variance expected / Variance observed')
In [49]:
list(zip(*variances))[0]
Out[49]:
In [22]:
def compute_angles(tests,results,descriptors,weights=None,method='sum',method_param=1.0):
"""Computes angles between mixtures given descriptors to use and optional weights for each descriptor."""
angles = {10:[],20:[],30:[]}
fraction_correct = {10:[],20:[],30:[]}
for test in tests.values():
if test.N > 1:
angles[test.N].append(test.angle(descriptors,weights=weights,method=method,method_param=method_param))
fraction_correct[test.N].append((test.fraction_correct(results),test.r))
for N in 10,20,30:
angles[N] = np.array(angles[N])
fraction_correct[N] = np.array(fraction_correct[N])
return angles, fraction_correct
def compute_norms(tests,results,descriptors,weights=None,order=2,method='sum'):
"""Computes angles between mixtures given descriptors to use and optional weights for each descriptor."""
norms = {10:[],20:[],30:[]}
fraction_correct = {10:[],20:[],30:[]}
for test in tests.values():
if test.N > 1:
norms[test.N].append(test.norm(descriptors,weights=weights,order=order,method=method)/test.N)
fraction_correct[test.N].append((test.fraction_correct(results),test.r))
for N in 10,20,30:
norms[N] = np.array(norms[N])
fraction_correct[N] = np.array(fraction_correct[N])
return norms, fraction_correct
In [9]:
def get_r2(prediction,observed):
"""Compute the coefficient of determination R^2 from the data"""
ss_error = np.sum((observed-prediction)**2)
ss_total = np.sum((observed-observed.mean())**2)
return 1 - ss_error/ss_total
def f_hill(params,x,y):
a,n = params
p_d = 1.0 / (1.0 + (x/a)**(-n))
p_c = 1.0/3 + p_d*(2.0/3)
ll = binom.logpmf((y*26).astype(int),26,p_c).sum() # 26 subjects.
return -ll
def plot_correct_vs_angles(angles, fraction_correct, color_d=False):
"""Plot angles from Snitz-like (angle-based) calculations, as well as the fraction correct for the same mixtures."""
fig,ax = plt.subplots(3,2,figsize=(12,12))
N_colors = {10:'r',20:'g',30:'b'}
for i,N in enumerate([10,20,30]):
d_colors = [cm.get_cmap('rainbow',N)(int(r-1)) for r in fraction_correct[N][:,1]]
ax[i,0].scatter(angles[N],fraction_correct[N][:,0],color=d_colors if color_d else N_colors[N],label='N=%d' % N)
ax[i,0].set_xlabel('Angle distance between mixtures')
ax[i,0].set_ylabel('Fraction correctly discriminated')
min_angle = np.min(np.concatenate((angles[10],angles[20],angles[30])))
max_angle = np.max(np.concatenate((angles[10],angles[20],angles[30])))
ax[i,0].set_xlim(min_angle/1.1,max_angle*1.1)
ax[i,0].set_ylim(-0.1,1)
x = angles[N]
y = fraction_correct[N][:,0]
x = np.concatenate((x,x.min()*np.ones(100)))
y = np.concatenate((y,0.33333*np.ones(100)))
slope, intercept, _ , _, _ = linregress(x,y)
prediction = np.array([intercept + slope*angle for angle in angles[N]])
prediction[np.where(prediction>1.0)] = 1.0
#a,n = fmin_tnc(f_hill,[0.1,2],approx_grad=True,args=(x,y),bounds=((0,99),(-99,99)))[0]
#p_d = 1.0 / (1.0 + (x/a)**(-n))
#prediction = 1.0/3 + p_d*(2.0/3)
r2 = get_r2(prediction,fraction_correct[N][:,0])
r2_random = []
for _ in range(100):
random = np.random.binomial(26,prediction)/26
r2_random.append(get_r2(prediction,random))
r2_random = np.mean(r2_random)
x = np.linspace(x.min(),x.max(),100)
y = intercept + slope*x
#p_d = 1.0 / (1.0 + (x/a)**(-n))
#y = 1.0/3 + p_d*(2.0/3)
d_colors = cm.get_cmap('rainbow',N)
ax[i,0].plot(x,y,color='k' if color_d else N_colors[N],
label='slope=%.2f, r=%.2f, R^2=%.2f' % (slope,np.sqrt(r2),r2/r2_random))
#ax[i,0].set_xscale('log')
ax[i,0].legend(loc=4)
z = abs(fraction_correct[N][:,0]-prediction) / (np.sqrt(prediction*(1-prediction))/np.sqrt(26))
ax[i,1].plot(sorted(z),np.linspace(0,1,len(z)),color=N_colors[N],label='data')
ax[i,1].plot(sorted(z),1-2*(1-norm.cdf(sorted(z))),color='k',label='Null hypothesis')
ax[i,1].set_xlim(0,5)
ax[i,1].set_xlabel('Squared normalized deviation')
ax[i,1].set_ylabel('Cumulative fraction')
ax[i,1].legend()
In [20]:
# Compute inter-mixture angles (Snitz style, using the 21 Snitz descriptors) and fraction correct for the data in Bushdid et al.
angles, fraction_correct = compute_angles(tests,results,CIDs_snitz)
# Plot the results.
# r = correlation coefficient. R^2 = coefficient of determination normalized to best possible given finite data.
plot_correct_vs_angles(angles, fraction_correct, color_d=True)
In [23]:
# Compute inter-mixture angles (Snitz style, using the 21 Snitz descriptors) and fraction correct for the data in Bushdid et al.
norms, fraction_correct = compute_norms(tests,results,CIDs_snitz)
# Plot the results.
# r = correlation coefficient. R^2 = coefficient of determination normalized to best possible given finite data.
plot_correct_vs_angles(norms, fraction_correct, color_d=True)
In [171]:
angles_nmf_200, fraction_correct = compute_angles(tests,results,CIDs_nmf_200)
plot_correct_vs_angles(angles_nmf_200, fraction_correct, color_d=True)
In [186]:
norms_nmf_200, fraction_correct = compute_norms(tests,results,CIDs_nmf_200,order=1)
plot_correct_vs_angles(norms_nmf_200, fraction_correct, color_d=True)
In [188]:
def f(weights,descriptors,alpha,tests,results):
angles = {10:[],20:[],30:[]}
fraction_correct = {10:[],20:[],30:[]}
for test in tests.values():
if test.N > 1:
angles[test.N].append(test.angle(descriptors,weights=weights,method='sum'))
fraction_correct[test.N].append(test.fraction_correct(results))
r2s = []
sses = []
for N in 10,30:
angles[N] = np.array(angles[N])
fraction_correct[N] = np.array(fraction_correct[N])
x = angles[N]
y = fraction_correct[N]
x = np.concatenate((x,np.zeros(100)))
y = np.concatenate((y,0.33333*np.ones(100)))
slope, intercept, _, _, _ = linregress(x,y)
prediction = np.array([intercept + slope*angle for angle in angles[N]])
#a,n = fmin_tnc(f_hill,[0.1,2],approx_grad=True,args=(x,y),bounds=((0,99),(-99,99)))[0]
#print(N,a,n,weights)
#p_d = 1.0 / (1.0 + (x/a)**(-n))
#prediction = 1.0/3 + p_d*(2.0/3)
r2 = get_r2(prediction,fraction_correct[N])
sse = np.mean((prediction-fraction_correct[N])**2)
r2s.append(r2)
sses.append(sse)
obj = np.mean(sses) + alpha*np.abs(weights).sum()
if np.random.randint(25) == 0:
print('%.3g' % obj)#,len(np.where(np.abs(weights)>0.1)[0]),r2s)
return obj
In [152]:
fitted_weights = np.ones(len(snitz_descriptors))
fitted_weights = fmin_l_bfgs_b(f,fitted_weights,fprime=None,args=(CIDs_snitz,1e-4,tests,results),approx_grad=True,maxiter=1e6)[0]
In [189]:
def f_norm(weights,descriptors,alpha,tests,results):
angles = {10:[],20:[],30:[]}
fraction_correct = {10:[],20:[],30:[]}
for test in tests.values():
if test.N > 1:
angles[test.N].append(test.norm(descriptors,order=1,weights=weights,method='sum'))
fraction_correct[test.N].append(test.fraction_correct(results))
r2s = []
sses = []
for N in 10,30:
angles[N] = np.array(angles[N])
fraction_correct[N] = np.array(fraction_correct[N])
x = angles[N]
y = fraction_correct[N]
x = np.concatenate((x,np.zeros(100)))
y = np.concatenate((y,0.33333*np.ones(100)))
slope, intercept, _, _, _ = linregress(x,y)
prediction = np.array([intercept + slope*angle for angle in angles[N]])
#a,n = fmin_tnc(f_hill,[0.1,2],approx_grad=True,args=(x,y),bounds=((0,99),(-99,99)))[0]
#print(N,a,n,weights)
#p_d = 1.0 / (1.0 + (x/a)**(-n))
#prediction = 1.0/3 + p_d*(2.0/3)
r2 = get_r2(prediction,fraction_correct[N])
sse = np.mean((prediction-fraction_correct[N])**2)
r2s.append(r2)
sses.append(sse)
obj = np.mean(sses) + alpha*np.abs(weights).sum()
if np.random.randint(25) == 0:
print('%.3g' % obj)#,len(np.where(np.abs(weights)>0.1)[0]),r2s)
return obj
In [160]:
fitted_weights
Out[160]:
In [170]:
fitted_angles, fraction_correct = compute_angles(tests,results,CIDs_snitz,weights=fitted_weights)
plot_correct_vs_angles(fitted_angles, fraction_correct, color_d=True)
In [190]:
fitted_weights_nmf_200 = fmin_l_bfgs_b(f_norm,np.ones(200),fprime=None,args=(CIDs_nmf_200,1e-4,tests,results),approx_grad=True,maxiter=1e6)[0]
In [192]:
fitted_norms_nmf_alpha, fraction_correct = compute_norms(tests,results,CIDs_nmf_200,order=1,weights=fitted_weights_nmf_200)
plot_correct_vs_angles(fitted_norms_nmf_alpha, fraction_correct, color_d=True)
In [19]:
fitted_angles_nmf_alpha, fraction_correct = compute_angles(tests,results,CIDs_nmf_200,weights=fitted_weights_nmf_200)
plot_correct_vs_angles(fitted_angles_nmf_alpha, fraction_correct, color_d=True)
In [20]:
plt.hist(np.abs(fitted_weights_nmf_200),normed=True,cumulative=-1,bins=99999,histtype='step')
plt.xlim(3.5e-5,1)
plt.ylim(0,1)
plt.xscale('log')
plt.xlabel('Coefficient absolute magnitude')
_ = plt.ylabel('Cumulative fraction')
In [21]:
print("Of the 200 coefficients, only %d have absolute value > 0.002" % (np.abs(fitted_weights_nmf_200)>2e-3).sum())
In [22]:
descriptor_indices_snitz = [descriptors.index(_) for _ in snitz_descriptors]
print("Snitz descriptor indices: %s" % descriptor_indices_snitz)
descriptor_indices_nmf = sorted(np.where(fitted_weights_nmf_200[:,np.newaxis] * nmf_200.components_ > 1.4)[1])
print("Notable NMF descriptor indices: %s" % descriptor_indices_nmf)
intersection = set(descriptor_indices_snitz).intersection(descriptor_indices_nmf)
print("They have %d descriptors in common: %s" % (len(intersection),intersection))
In [23]:
def p_match(intersection):
slots = len(snitz_descriptors)
matches = len(intersection)
p = 1
for i in range(matches):
p *= (slots-i)/(len(descriptors)-i)
for i in range(slots-matches):
p *= (len(descriptors)-slots-i)/(len(descriptors)-matches-i)
p *= trillion.get_n_combos(slots,matches)
print("Under the null hypothesis, probability of %d matches out of %d with %d total descriptors is: %g" % (matches,slots,len(descriptors),p))
p_match(intersection)
In [18]:
fake_results = []
max_angle = 0
for result in results:
if result.test.N > 1:
angle = result.test.angle(CIDs_snitz)
max_angle = max(angle,max_angle)
for result in results:
if result.test.N > 1:
angle = result.test.angle(CIDs_snitz)
p = 0.333+0.667*angle/(1.1*max_angle)
fake_correct = np.random.binomial(1,p)
fake_result = trillion.Result(result.test,result.subject_id,fake_correct)
fake_results.append(fake_result)
In [19]:
angles_fake, fraction_correct = compute_angles(tests,fake_results,CIDs_snitz)
plot_correct_vs_angles(angles_fake, fraction_correct, color_d=True)
In [25]:
fitted_weights_nmf_200_fake = fmin_l_bfgs_b(f,np.ones(200),fprime=None,args=(CIDs_nmf_200,1e-4,tests,fake_results),approx_grad=True,maxiter=1e6)[0]
In [31]:
fitted_angles_nmf_200_fake, fraction_correct = compute_angles(tests,fake_results,CIDs_nmf_200,weights=fitted_weights_nmf_200_fake)
plot_correct_vs_angles(fitted_angles_nmf_200_fake, fraction_correct, color_d=True)
In [26]:
print("Of the 200 coefficients, only %d have absolute value > 0.002" % (np.abs(fitted_weights_nmf_200_fake)>2e-3).sum())
In [27]:
descriptor_indices_snitz = [descriptors.index(_) for _ in snitz_descriptors]
print("Snitz descriptor indices: %s" % descriptor_indices_snitz)
descriptor_indices_nmf = sorted(np.where(fitted_weights_nmf_200_fake[:,np.newaxis] * nmf_200.components_ > 0.45)[1])
print("Notable NMF descriptor indices: %s" % descriptor_indices_nmf)
intersection = set(descriptor_indices_snitz).intersection(descriptor_indices_nmf)
print("They have %d descriptors in common: %s" % (len(intersection),intersection))
In [28]:
p_match(intersection)
In [141]:
# Optimize not on total correlation, but on average of correlation for each value of R (replacements).
def f2(weights,descriptors,alpha,tests,results):
angles = {}
fraction_correct = {}
for test in tests.values():
if test.N > 1:
key = '%d_%d' % (test.N,test.r)
if key not in fraction_correct:
angles[key] = []
fraction_correct[key] = []
angles[key].append(test.angle(descriptors,weights=weights,method='sum'))
fraction_correct[key].append(test.fraction_correct(results))
r2s = []
sses = []
for key in angles:
if key.split('_')[1] == '20':
continue
angles[key] = np.array(angles[key])
fraction_correct[key] = np.array(fraction_correct[key])
x = np.concatenate((angles[key],np.zeros(100)))
y = np.concatenate((fraction_correct[key],0.33333*np.ones(100)))
slope, intercept, _, _, _ = linregress(x,y)
#x = angles[key]
#y = fraction_correct[key]
#a,n = fmin_tnc(f_hill,[0.1,2],approx_grad=True,args=(x,y),bounds=((0,99),(-99,99)))[0]
slope, intercept, _ , _, _ = linregress(x,y)
#p_d = 1.0 / (1.0 + (x/a)**(-n))
#prediction = 1.0/3 + p_d*(2.0/3)
prediction = np.array([intercept + slope*angle for angle in angles[key]])
r2 = get_r2(prediction,fraction_correct[key])
sse = np.mean((prediction-fraction_correct[key])**2)
r2s.append(r2)
sses.append(sse)
obj = np.mean(sses) + alpha*np.abs(weights).sum()
if np.random.randint(25) == 0:
print('%.3g' % obj)#,len(np.where(np.abs(weights)>0.1)[0]),r2s)
return obj
In [140]:
fitted_weights_R = np.ones(len(snitz_descriptors))
fitted_weights_R = fmin_l_bfgs_b(f2,fitted_weights_R,fprime=None,args=(CIDs_snitz,1e-4,tests,results),approx_grad=True,maxiter=1e6)[0]
In [138]:
fitted_weights_R
Out[138]:
In [137]:
fitted_angles_R, fraction_correct = compute_angles(tests,results,CIDs_snitz,weights=fitted_weights_R)
plot_correct_vs_angles(fitted_angles_R, fraction_correct, color_d=True)
In [40]:
fitted_angles_R, fraction_correct = compute_angles(tests,results,CIDs_snitz,weights=fitted_weights_R)
plot_correct_vs_angles(fitted_angles_R, fraction_correct, color_d=True)
In [17]:
fitted_weights_nmf_200_R = np.ones(200)
fitted_weights_nmf_200_R = fmin_l_bfgs_b(f2,fitted_weights_nmf_200_R,fprime=None,args=(CIDs_nmf_200,1e-4,tests,results),approx_grad=True,maxiter=1e6)[0]
In [18]:
fitted_angles_nmf_200_R, fraction_correct = compute_angles(tests,results,CIDs_nmf_200,weights=fitted_weights_nmf_200_R)
plot_correct_vs_angles(fitted_angles_nmf_200_R, fraction_correct, color_d=True)
In [134]:
fitted_angles_nmf_200_R, fraction_correct = compute_angles(tests,results,CIDs_nmf_200,
weights=fitted_weights_nmf_200_R,
method='sum',
method_param=1.0)
plot_correct_vs_angles(fitted_angles_nmf_200_R, fraction_correct, color_d=True)
In [119]:
p_c[7],(y*26)[7]
Out[119]:
In [ ]: