Using the Sobel angle similarity model with various descriptors to fit the data in Bushdid et al.



In [24]:

    
%matplotlib inline

# Basic python libraries.  
import sys,csv,urllib,json
from collections import OrderedDict as odict
from pprint import pprint

# Numerical and graphing libraries I used.  
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from scipy.stats import linregress,norm,binom
from scipy.optimize import fmin,fmin_tnc,fmin_l_bfgs_b
from sklearn.preprocessing import scale
from sklearn.decomposition import NMF,PCA

# Path to my code related to Bushdid et al. 
sys.path.append('/Users/rgerkin/Dropbox/science/trillion')  
import trillion

Load and format the Snitz et al. data



In [3]:

    
CIDs_snitz = {} # To store Snitz et al. descriptors.  
CIDs_all = {} # To store all descriptors.  

# The 21 molecular descriptors "discovered" in Snitz et al.  
snitz_descriptors = ['nCIR','ZM1','GNar','S1K','piPC08','MATS1v','MATS7v',]
snitz_descriptors += ['GATS1v','EEig05x','ESpm02x','ESpm03d','ESpm10d','ESpm13d','BELv3']
snitz_descriptors += ['RDF035v','G1m','G1v','G1e','G3s','R8u+','nRCOSR']

# Create dictionaries of descriptor values for molecules in Snitz using their supplemental table.  
# Their supplemental table has >1000 descriptors for each molecule, which is a good fraction of what is in Dragon.  
with open('data/snitz_tableS1.csv') as f:
    reader = csv.reader(f)
    descriptors = reader.__next__()[1:]
    for line in reader:
        CID = int(float(line[0]))
        values = [float(_) for i,_ in enumerate(line[1:]) if descriptors[i] in snitz_descriptors]
        CIDs_snitz[CID] = odict(zip(snitz_descriptors,values))
        values_all = [float(_) for i,_ in enumerate(line[1:])]
        CIDs_all[CID] = odict(zip(descriptors,values_all))
        
# The descriptors in the Snitz supplemental table are already normalized to 0.0 (min value across molecules) to 1.0 (max).  
# Make a matrix of CIDs by descriptor values.  
x = np.array([[CIDs_all[CID][descriptor] for descriptor in descriptors] for CID in CIDs_all.keys()])
#x = scale(x) # Scale to mean=0, variance=1.  
CIDs_all_normed,CIDs_normed = {},{}
for i,CID in enumerate(CIDs_all):
    CIDs_all_normed[CID] = {}
    CIDs_normed[CID] = {}
    for j,descriptor in enumerate(descriptors):
        CIDs_all_normed[CID][descriptor] = x[i,j]
        if descriptor in snitz_descriptors:
            CIDs_normed[CID][descriptor] = x[i,j]



In [4]:

    
print("There are %d CIDs and %d molecular descriptors" % (x.shape[0],x.shape[1]))









    



There are 1318 CIDs and 1433 molecular descriptors



In [5]:

    
# Verify that x matrix and dictionary are in register, as intended by use of OrderedDict.  
print(list(list(CIDs_all.values())[0].values())[:10]) # First 10 molecular descriptors for first molecule.  
print(x[0,:10]) # First 10 molecular descriptors for first molecule.









    



[0.25, 0.0, 0.4, 0.43, 0.43, 0.16, 0.04, 0.07, 0.03, 0.12]
[ 0.25  0.    0.4   0.43  0.43  0.16  0.04  0.07  0.03  0.12]



In [6]:

    
CIDs_snitz[8193] # Ordered list of Snitz descriptors and corresponding values for the molecule with (for example) CID 8193.









    Out[6]:





OrderedDict([('nCIR', 0.0), ('ZM1', 0.18), ('GNar', 0.58), ('S1K', 0.33), ('piPC08', 0.24), ('MATS1v', 0.5), ('MATS7v', 0.26), ('GATS1v', 0.27), ('EEig05x', 0.55), ('ESpm02x', 0.53), ('ESpm03d', 0.24), ('ESpm10d', 0.19), ('ESpm13d', 0.34), ('BELv3', 0.85), ('RDF035v', 0.03), ('G1m', 0.03), ('G1v', 0.02), ('G1e', 0.02), ('G3s', 0.17), ('R8u+', 0.23), ('nRCOSR', 0.0)])

Load and format the Bushdid et al. data



In [7]:

    
# Load the mixture components from Bushdid et al. using their supplemental information (Bushdid-tableS1.csv).  
components = trillion.load_components()
# Load the Bushdid et al. odorants, tests, and test results using their supplemental information (Bushdid-tableS2.csv).
odorants,tests,results = trillion.load_odorants_tests_results(components)

Quick check with PCA



In [7]:

    
# Get an idea for the largest number of dimensions in the molecular descriptor data set.  
pca = PCA()
pca.fit(x)
plt.plot(pca.explained_variance_ratio_.cumsum())
plt.xscale('log')
plt.xlim(0,1000)
plt.ylim(0,1.01)
plt.xlabel('Number of PCA components')
plt.ylabel('Fraction of variance explained')









    Out[7]:





<matplotlib.text.Text at 0x13f06d748>

Non-negative matrix factorization (NMF) of the molecular descriptor matrix



In [8]:

    
# Non-negative matrix factorization of the x matrix.  
nmf_200 = NMF(n_components=200, nls_max_iter=500) # 200 determined to be reasonable in a separate analysis.  
nmf_200.fit(x) # Fit the x matrix to this NMF basis.









    



/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/sklearn/decomposition/nmf.py:252: UserWarning: Iteration limit reached in nls subproblem.
  warnings.warn("Iteration limit reached in nls subproblem.")
/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/sklearn/decomposition/nmf.py:532: UserWarning: Iteration limit reached during fit. Solving for W exactly.
  warnings.warn("Iteration limit reached during fit. Solving for W exactly.")






    Out[8]:





NMF(beta=1, eta=0.1, init=None, max_iter=200, n_components=200,
  nls_max_iter=500, random_state=None, sparseness=None, tol=0.0001)



In [9]:

    
print("Total reconstruction error using %d components is %.2f" % (nmf_200.components_.shape[1],nmf_200.reconstruction_err_))
x_nmf_200 = nmf_200.transform(x) # Transformation of x using this NMF basis.  
print("Transformed matrix has %d molecules and %d descriptor dimensions" % (x_nmf_200.shape[0],x_nmf_200.shape[1]))









    



Total reconstruction error using 1433 components is 28.60
Transformed matrix has 1318 molecules and 200 descriptor dimensions



In [10]:

    
# Create a new ordered dictionary from the NMF transformation.  
CIDs_nmf_200 = {}
for i,CID in enumerate(CIDs_all):
    CIDs_nmf_200[CID] = odict()
    for j in range(x_nmf_200.shape[1]):
        CIDs_nmf_200[CID][j] = x_nmf_200[i,j]



In [61]:

    
fraction_correct = {}
for test in tests.values():
    if test.N > 1 and test.r != 8:
        key = '%d_%d' % (test.N,test.r)
        if key not in fraction_correct:
            fraction_correct[key] = []
        fraction_correct[key].append(test.fraction_correct(results))
        
variances = []
for key,values in fraction_correct.items():
    values = np.array(values)
    expected = 26*values.mean()*(1-values.mean())
    observed = (values*26).var()
    variances.append((key,expected/observed,values.mean()))
variances = sorted(variances,key=lambda x:x[1])
print('N\tr\tVar exp\Var obs\tf_correct')
for variance in variances:
    N,r = variance[0].split('_')
    print("%d\t%d\t%.3f\t\t%.3f" % (int(N),int(r),variance[1],variance[2]))
plt.scatter(list(zip(*variances))[2],list(zip(*variances))[1])
plt.xlabel('Fraction correct')
_ = plt.ylabel('Variance expected / Variance observed')









    



N	r	Var exp\Var obs	f_correct
10	10	0.271		0.652
10	7	0.300		0.638
30	10	0.356		0.529
30	20	0.375		0.602
10	1	0.405		0.452
20	1	0.493		0.358
20	20	0.513		0.673
20	15	0.547		0.633
20	10	0.589		0.527
20	5	0.643		0.456
10	4	0.679		0.548
30	1	0.920		0.365
30	30	1.361		0.698



In [49]:

    
list(zip(*variances))[0]









    Out[49]:





('10_10',
 '10_7',
 '30_10',
 '30_20',
 '10_1',
 '20_1',
 '20_20',
 '20_15',
 '20_10',
 '20_5',
 '10_4',
 '30_1',
 '30_30')

Functions to compute angle distance between mixtures, make plots and fits, and compute variance explained



In [22]:

    
def compute_angles(tests,results,descriptors,weights=None,method='sum',method_param=1.0):
    """Computes angles between mixtures given descriptors to use and optional weights for each descriptor."""
    angles = {10:[],20:[],30:[]}
    fraction_correct = {10:[],20:[],30:[]}
    for test in tests.values():
        if test.N > 1:
            angles[test.N].append(test.angle(descriptors,weights=weights,method=method,method_param=method_param))
            fraction_correct[test.N].append((test.fraction_correct(results),test.r))
    for N in 10,20,30:
        angles[N] = np.array(angles[N])
        fraction_correct[N] = np.array(fraction_correct[N])
    return angles, fraction_correct

def compute_norms(tests,results,descriptors,weights=None,order=2,method='sum'):
    """Computes angles between mixtures given descriptors to use and optional weights for each descriptor."""
    norms = {10:[],20:[],30:[]}
    fraction_correct = {10:[],20:[],30:[]}
    for test in tests.values():
        if test.N > 1:
            norms[test.N].append(test.norm(descriptors,weights=weights,order=order,method=method)/test.N)
            fraction_correct[test.N].append((test.fraction_correct(results),test.r))
    for N in 10,20,30:
        norms[N] = np.array(norms[N])
        fraction_correct[N] = np.array(fraction_correct[N])
    return norms, fraction_correct



In [9]:

    
def get_r2(prediction,observed):
    """Compute the coefficient of determination R^2 from the data"""
    ss_error = np.sum((observed-prediction)**2)
    ss_total = np.sum((observed-observed.mean())**2)
    return 1 - ss_error/ss_total

def f_hill(params,x,y):
    a,n = params
    p_d = 1.0 / (1.0 + (x/a)**(-n))
    p_c = 1.0/3 + p_d*(2.0/3)
    ll = binom.logpmf((y*26).astype(int),26,p_c).sum() # 26 subjects.  
    return -ll
    
def plot_correct_vs_angles(angles, fraction_correct, color_d=False):
    """Plot angles from Snitz-like (angle-based) calculations, as well as the fraction correct for the same mixtures."""
    fig,ax = plt.subplots(3,2,figsize=(12,12))
    N_colors = {10:'r',20:'g',30:'b'}
    
    for i,N in enumerate([10,20,30]):
        d_colors = [cm.get_cmap('rainbow',N)(int(r-1)) for r in fraction_correct[N][:,1]]
        ax[i,0].scatter(angles[N],fraction_correct[N][:,0],color=d_colors if color_d else N_colors[N],label='N=%d' % N)
        ax[i,0].set_xlabel('Angle distance between mixtures')
        ax[i,0].set_ylabel('Fraction correctly discriminated')
        min_angle = np.min(np.concatenate((angles[10],angles[20],angles[30])))
        max_angle = np.max(np.concatenate((angles[10],angles[20],angles[30])))
        ax[i,0].set_xlim(min_angle/1.1,max_angle*1.1)
        ax[i,0].set_ylim(-0.1,1)
        x = angles[N]
        y = fraction_correct[N][:,0]
        x = np.concatenate((x,x.min()*np.ones(100)))
        y = np.concatenate((y,0.33333*np.ones(100)))
        slope, intercept, _ , _, _ = linregress(x,y)
        prediction = np.array([intercept + slope*angle for angle in angles[N]])
        prediction[np.where(prediction>1.0)] = 1.0
        #a,n = fmin_tnc(f_hill,[0.1,2],approx_grad=True,args=(x,y),bounds=((0,99),(-99,99)))[0]
        #p_d = 1.0 / (1.0 + (x/a)**(-n))
        #prediction = 1.0/3 + p_d*(2.0/3)
        r2 = get_r2(prediction,fraction_correct[N][:,0])
        r2_random = []
        for _ in range(100):
            random = np.random.binomial(26,prediction)/26
            r2_random.append(get_r2(prediction,random))
        r2_random = np.mean(r2_random)
        x = np.linspace(x.min(),x.max(),100)
        y = intercept + slope*x
        #p_d = 1.0 / (1.0 + (x/a)**(-n))
        #y = 1.0/3 + p_d*(2.0/3)
        d_colors = cm.get_cmap('rainbow',N)
        ax[i,0].plot(x,y,color='k' if color_d else N_colors[N],
                     label='slope=%.2f, r=%.2f, R^2=%.2f' % (slope,np.sqrt(r2),r2/r2_random))
        #ax[i,0].set_xscale('log')
        ax[i,0].legend(loc=4)

        z = abs(fraction_correct[N][:,0]-prediction) / (np.sqrt(prediction*(1-prediction))/np.sqrt(26))
        ax[i,1].plot(sorted(z),np.linspace(0,1,len(z)),color=N_colors[N],label='data')
        ax[i,1].plot(sorted(z),1-2*(1-norm.cdf(sorted(z))),color='k',label='Null hypothesis')
        ax[i,1].set_xlim(0,5)
        ax[i,1].set_xlabel('Squared normalized deviation')
        ax[i,1].set_ylabel('Cumulative fraction')
        ax[i,1].legend()

Check the raw Snitz model (21 descriptors, equal weight) against the Bushdid et al. data
Left panels: x-axis is Snitz angles; y-axis is Bushdid data.
Right panels: A measure of how close the fit is to perfection (i.e. chi-squared distributed squared errors)
N=10,20,30 is number of components in a mixture.
See distinction between r and R^2 below.



In [20]:

    
# Compute inter-mixture angles (Snitz style, using the 21 Snitz descriptors) and fraction correct for the data in Bushdid et al. 
angles, fraction_correct = compute_angles(tests,results,CIDs_snitz)

# Plot the results.  
# r = correlation coefficient.  R^2 = coefficient of determination normalized to best possible given finite data.  
plot_correct_vs_angles(angles, fraction_correct, color_d=True)









    



/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/IPython/kernel/__main__.py:54: RuntimeWarning: divide by zero encountered in true_divide



In [23]:

    
# Compute inter-mixture angles (Snitz style, using the 21 Snitz descriptors) and fraction correct for the data in Bushdid et al. 
norms, fraction_correct = compute_norms(tests,results,CIDs_snitz)

# Plot the results.  
# r = correlation coefficient.  R^2 = coefficient of determination normalized to best possible given finite data.  
plot_correct_vs_angles(norms, fraction_correct, color_d=True)









    



/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/IPython/kernel/__main__.py:50: RuntimeWarning: invalid value encountered in sqrt
/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/IPython/kernel/__main__.py:54: RuntimeWarning: divide by zero encountered in true_divide

Same thing but for the descriptors extracted through NMF



In [171]:

    
angles_nmf_200, fraction_correct = compute_angles(tests,results,CIDs_nmf_200)
plot_correct_vs_angles(angles_nmf_200, fraction_correct, color_d=True)



In [186]:

    
norms_nmf_200, fraction_correct = compute_norms(tests,results,CIDs_nmf_200,order=1)
plot_correct_vs_angles(norms_nmf_200, fraction_correct, color_d=True)

An optimization function that uses L1 penalized regression (i.e. Lasso) to get the best fit for the N=10 and N=30 cases, and then tests out of sample on the N=20 cases



In [188]:

    
def f(weights,descriptors,alpha,tests,results):
    angles = {10:[],20:[],30:[]}
    fraction_correct = {10:[],20:[],30:[]}
    for test in tests.values():
        if test.N > 1:
            angles[test.N].append(test.angle(descriptors,weights=weights,method='sum'))
            fraction_correct[test.N].append(test.fraction_correct(results))
    r2s = []
    sses = []
    for N in 10,30:
        angles[N] = np.array(angles[N])
        fraction_correct[N] = np.array(fraction_correct[N])
        x = angles[N]
        y = fraction_correct[N]
        x = np.concatenate((x,np.zeros(100)))
        y = np.concatenate((y,0.33333*np.ones(100)))
        slope, intercept, _, _, _ = linregress(x,y)
        prediction = np.array([intercept + slope*angle for angle in angles[N]])
        #a,n = fmin_tnc(f_hill,[0.1,2],approx_grad=True,args=(x,y),bounds=((0,99),(-99,99)))[0]
        #print(N,a,n,weights)
        #p_d = 1.0 / (1.0 + (x/a)**(-n))
        #prediction = 1.0/3 + p_d*(2.0/3)
        r2 = get_r2(prediction,fraction_correct[N])
        sse = np.mean((prediction-fraction_correct[N])**2)
        r2s.append(r2)
        sses.append(sse)
    obj = np.mean(sses) + alpha*np.abs(weights).sum()
    if np.random.randint(25) == 0:
        print('%.3g' % obj)#,len(np.where(np.abs(weights)>0.1)[0]),r2s)
    return obj

Fit and plot the weighted Snitz descriptors (weighted according to regression above)



In [152]:

    
fitted_weights = np.ones(len(snitz_descriptors))
fitted_weights = fmin_l_bfgs_b(f,fitted_weights,fprime=None,args=(CIDs_snitz,1e-4,tests,results),approx_grad=True,maxiter=1e6)[0]



In [189]:

    
def f_norm(weights,descriptors,alpha,tests,results):
    angles = {10:[],20:[],30:[]}
    fraction_correct = {10:[],20:[],30:[]}
    for test in tests.values():
        if test.N > 1:
            angles[test.N].append(test.norm(descriptors,order=1,weights=weights,method='sum'))
            fraction_correct[test.N].append(test.fraction_correct(results))
    r2s = []
    sses = []
    for N in 10,30:
        angles[N] = np.array(angles[N])
        fraction_correct[N] = np.array(fraction_correct[N])
        x = angles[N]
        y = fraction_correct[N]
        x = np.concatenate((x,np.zeros(100)))
        y = np.concatenate((y,0.33333*np.ones(100)))
        slope, intercept, _, _, _ = linregress(x,y)
        prediction = np.array([intercept + slope*angle for angle in angles[N]])
        #a,n = fmin_tnc(f_hill,[0.1,2],approx_grad=True,args=(x,y),bounds=((0,99),(-99,99)))[0]
        #print(N,a,n,weights)
        #p_d = 1.0 / (1.0 + (x/a)**(-n))
        #prediction = 1.0/3 + p_d*(2.0/3)
        r2 = get_r2(prediction,fraction_correct[N])
        sse = np.mean((prediction-fraction_correct[N])**2)
        r2s.append(r2)
        sses.append(sse)
    obj = np.mean(sses) + alpha*np.abs(weights).sum()
    if np.random.randint(25) == 0:
        print('%.3g' % obj)#,len(np.where(np.abs(weights)>0.1)[0]),r2s)
    return obj



In [160]:

    
fitted_weights









    Out[160]:





array([  5.65719540e-07,   8.22746403e-06,   1.45508864e-01,
         1.71274337e-03,   9.21326000e-07,   4.45804471e-04,
        -2.45606797e-06,   3.12946842e-07,   7.35723239e-04,
        -4.15982583e-07,  -3.43702348e-06,  -3.45440851e-06,
        -7.04562479e-07,   4.98857687e-04,   1.01607341e-03,
         2.79587789e-04,   5.15203126e-04,   1.96971851e-06,
         3.18212652e-04,   5.22315928e-04,  -4.29162430e-04])



In [170]:

    
fitted_angles, fraction_correct = compute_angles(tests,results,CIDs_snitz,weights=fitted_weights)
plot_correct_vs_angles(fitted_angles, fraction_correct, color_d=True)

Fit and plot the weighted NMF descriptors



In [190]:

    
fitted_weights_nmf_200 = fmin_l_bfgs_b(f_norm,np.ones(200),fprime=None,args=(CIDs_nmf_200,1e-4,tests,results),approx_grad=True,maxiter=1e6)[0]



In [192]:

    
fitted_norms_nmf_alpha, fraction_correct = compute_norms(tests,results,CIDs_nmf_200,order=1,weights=fitted_weights_nmf_200)
plot_correct_vs_angles(fitted_norms_nmf_alpha, fraction_correct, color_d=True)



In [19]:

    
fitted_angles_nmf_alpha, fraction_correct = compute_angles(tests,results,CIDs_nmf_200,weights=fitted_weights_nmf_200)
plot_correct_vs_angles(fitted_angles_nmf_alpha, fraction_correct, color_d=True)

The number of significant NMF descriptors after weighting is about the same as the number of descriptors selected in Snitz et al, i.e about 20.



In [20]:

    
plt.hist(np.abs(fitted_weights_nmf_200),normed=True,cumulative=-1,bins=99999,histtype='step')
plt.xlim(3.5e-5,1)
plt.ylim(0,1)
plt.xscale('log')
plt.xlabel('Coefficient absolute magnitude')
_ = plt.ylabel('Cumulative fraction')



In [21]:

    
print("Of the 200 coefficients, only %d have absolute value > 0.002" % (np.abs(fitted_weights_nmf_200)>2e-3).sum())









    



Of the 200 coefficients, only 24 have absolute value > 0.002

How many molecular descriptors did the two methods select in commmon?



In [22]:

    
descriptor_indices_snitz = [descriptors.index(_) for _ in snitz_descriptors]
print("Snitz descriptor indices: %s" % descriptor_indices_snitz)
descriptor_indices_nmf = sorted(np.where(fitted_weights_nmf_200[:,np.newaxis] * nmf_200.components_ > 1.4)[1])
print("Notable NMF descriptor indices: %s" % descriptor_indices_nmf)
intersection = set(descriptor_indices_snitz).intersection(descriptor_indices_nmf)
print("They have %d descriptors in common: %s" % (len(intersection),intersection))









    



Snitz descriptor indices: [18, 43, 50, 95, 174, 288, 294, 320, 350, 406, 422, 429, 432, 476, 732, 993, 1004, 1015, 1039, 1199, 1294]
Notable NMF descriptor indices: [32, 42, 116, 149, 173, 174, 175, 176, 194, 369, 983, 994, 1005, 1016, 1027, 1038, 1288, 1323, 1424]
They have 1 descriptors in common: {174}



In [23]:

    
def p_match(intersection):
    slots = len(snitz_descriptors)
    matches = len(intersection)
    p = 1
    for i in range(matches):
        p *= (slots-i)/(len(descriptors)-i)
    for i in range(slots-matches):
        p *= (len(descriptors)-slots-i)/(len(descriptors)-matches-i)
    p *= trillion.get_n_combos(slots,matches)
    print("Under the null hypothesis, probability of %d matches out of %d with %d total descriptors is: %g" % (matches,slots,len(descriptors),p))
    
p_match(intersection)









    



Under the null hypothesis, probability of 1 matches out of 21 with 1433 total descriptors is: 0.231847

Not very many in common. This suggests that one of both of these methods is not selecting particularly special molecular descriptors, probably in part because the descriptors are highly correlated. This is a good reason to prefer the NMF version (or some other matrix decomposition), since the correlated needs to be removed.

Let's see how many we get in common if we construct a fake data set in which fraction discriminated is determined entirely by the angle using the Snitz descriptors and nothing else.



In [18]:

    
fake_results = []
max_angle = 0
for result in results:
    if result.test.N > 1:
        angle = result.test.angle(CIDs_snitz)
        max_angle = max(angle,max_angle)
for result in results:
    if result.test.N > 1:
        angle = result.test.angle(CIDs_snitz)
        p = 0.333+0.667*angle/(1.1*max_angle)
        fake_correct = np.random.binomial(1,p)
        fake_result = trillion.Result(result.test,result.subject_id,fake_correct)
        fake_results.append(fake_result)



In [19]:

    
angles_fake, fraction_correct = compute_angles(tests,fake_results,CIDs_snitz)
plot_correct_vs_angles(angles_fake, fraction_correct, color_d=True)



In [25]:

    
fitted_weights_nmf_200_fake = fmin_l_bfgs_b(f,np.ones(200),fprime=None,args=(CIDs_nmf_200,1e-4,tests,fake_results),approx_grad=True,maxiter=1e6)[0]



In [31]:

    
fitted_angles_nmf_200_fake, fraction_correct = compute_angles(tests,fake_results,CIDs_nmf_200,weights=fitted_weights_nmf_200_fake)
plot_correct_vs_angles(fitted_angles_nmf_200_fake, fraction_correct, color_d=True)



In [26]:

    
print("Of the 200 coefficients, only %d have absolute value > 0.002" % (np.abs(fitted_weights_nmf_200_fake)>2e-3).sum())









    



Of the 200 coefficients, only 29 have absolute value > 0.002



In [27]:

    
descriptor_indices_snitz = [descriptors.index(_) for _ in snitz_descriptors]
print("Snitz descriptor indices: %s" % descriptor_indices_snitz)
descriptor_indices_nmf = sorted(np.where(fitted_weights_nmf_200_fake[:,np.newaxis] * nmf_200.components_ > 0.45)[1])
print("Notable NMF descriptor indices: %s" % descriptor_indices_nmf)
intersection = set(descriptor_indices_snitz).intersection(descriptor_indices_nmf)
print("They have %d descriptors in common: %s" % (len(intersection),intersection))









    



Snitz descriptor indices: [18, 43, 50, 95, 174, 288, 294, 320, 350, 406, 422, 429, 432, 476, 732, 993, 1004, 1015, 1039, 1199, 1294]
Notable NMF descriptor indices: [16, 171, 179, 637, 638, 639, 841, 982, 984, 993, 995, 1004, 1006, 1015, 1017, 1026, 1028, 1037, 1039, 1281, 1358, 1405]
They have 4 descriptors in common: {1039, 993, 1004, 1015}



In [28]:

    
p_match(intersection)









    



Under the null hypothesis, probability of 4 matches out of 21 with 1433 total descriptors is: 0.000166846

A few more matches, and enough to be significantly different from chance, but still not that impressive



In [141]:

    
# Optimize not on total correlation, but on average of correlation for each value of R (replacements).
def f2(weights,descriptors,alpha,tests,results):
    angles = {}
    fraction_correct = {}
    for test in tests.values():
        if test.N > 1:
            key = '%d_%d' % (test.N,test.r)
            if key not in fraction_correct:
                angles[key] = []
                fraction_correct[key] = []
            angles[key].append(test.angle(descriptors,weights=weights,method='sum'))
            fraction_correct[key].append(test.fraction_correct(results))
    r2s = []
    sses = []
    for key in angles:
        if key.split('_')[1] == '20':
            continue
        angles[key] = np.array(angles[key])
        fraction_correct[key] = np.array(fraction_correct[key])
        x = np.concatenate((angles[key],np.zeros(100)))
        y = np.concatenate((fraction_correct[key],0.33333*np.ones(100)))
        slope, intercept, _, _, _ = linregress(x,y)
        #x = angles[key]
        #y = fraction_correct[key]
        #a,n = fmin_tnc(f_hill,[0.1,2],approx_grad=True,args=(x,y),bounds=((0,99),(-99,99)))[0]
        slope, intercept, _ , _, _ = linregress(x,y)
        #p_d = 1.0 / (1.0 + (x/a)**(-n))
        #prediction = 1.0/3 + p_d*(2.0/3)
        prediction = np.array([intercept + slope*angle for angle in angles[key]])
        r2 = get_r2(prediction,fraction_correct[key])
        sse = np.mean((prediction-fraction_correct[key])**2)
        r2s.append(r2)
        sses.append(sse)
    obj = np.mean(sses) + alpha*np.abs(weights).sum()
    if np.random.randint(25) == 0:
        print('%.3g' % obj)#,len(np.where(np.abs(weights)>0.1)[0]),r2s)
    return obj



In [140]:

    
fitted_weights_R = np.ones(len(snitz_descriptors))
fitted_weights_R = fmin_l_bfgs_b(f2,fitted_weights_R,fprime=None,args=(CIDs_snitz,1e-4,tests,results),approx_grad=True,maxiter=1e6)[0]









    



0.505198980593 0.174516758536
0.207040191001 2.0210043491
0.353824371288 1.96790941487
0.22850010132 1.61611113516
1.48089143721e-05 -0.0195269612716
0.194195220079 1.44487256643
0.615590605868 0.699719634519
0.209676381692





    



/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/IPython/kernel/__main__.py:9: RuntimeWarning: divide by zero encountered in true_divide
/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/IPython/kernel/__main__.py:5: RuntimeWarning: divide by zero encountered in double_scalars






    



 0.40091115572
0.122972340616 1.39660782995
0.428417260881 0.625036217364
0.0323426950774 0.089656056357
0.223413737802 2.01372012129
0.539608768777 0.165936339731
0.207096711587 2.02101543452
0.353809780057 1.96791125967
0.228496950687 1.61609492668
0.319702351605 0.293134273469
0.32227636036 0.945921935419
0.436669247292 0.837752401578
0.209676381692 0.40091115572
0.122973647174 1.39658247003
0.517645403431 0.552803019128
0.0318261104401 0.0880556125639
0.223416140275 2.01371791777
0.544416187606 0.15909690004
0.207094293119 2.02101495381
0.353824370476 1.96790941161
0.228500651111 1.61610892199
0.405046230693 0.219760694208
0.194705842493 1.43930765651
0.610704479229 0.701688890725
0.207805510656 0.391778718091
0.122973645809 1.39658250609
0.399923538098 0.645752194872
0.0362449778816 0.10348201539
0.223521903372 2.01376002352
2.05402704552e-05 -0.0259123946993
0.207025465581 2.02100145955
0.353809779647 1.9679112613
0.228500662012 1.61612212369
0.405147789138 0.219593437756
0.3223462363 0.945783287576
0.495750890957 0.779115155217
0.197237293056 0.345893999281
0.122976624196 1.39649282704
0.484366327467 0.576427287318
0.0995450624352 0.0901243645749
0.0651050776954 27.1484277483
0.528165763644 0.168076682812
0.207099128453 2.02101590294
0.353809780052 1.96791126293
0.228500477455 1.61611467508
0.319701983466 0.293429155678
0.321632355166 0.946929583403
0.614531120901 0.700146816925
0.196017407011 0.341145006261
0.122979134102 1.39647573201
0.25583853148 0.943276430361
0.0217509125795 0.0712525046217
0.178491510717 2.23872217119
8.90461740255e-06 -0.0239497612087
0.207059755314 2.02100818787
0.353812733026 1.96791088786
0.228498089791 1.61614031496
0.319758926344 0.293367464256
0.19327633099 1.45461107355
0.615031531498 0.699886257666
0.255712983963 0.828720043137
0.122973784321 1.39657978543
0.517261615337 0.553246526133
0.0996806449219 0.104394232755
0.0707809300001 12.0200522038
0.53954761014 0.165949382035
0.207020637067 2.02100052402
0.353817567985 1.96791027668
0.228500690442 1.61611424524
0.31985426169 0.293164661331
0.194171814163 1.44500171815
0.61433084988 0.700070217538
0.194099138921 0.333878328752
0.122960509623 1.39684005162
0.51699759051 0.553368837536
0.0997738083512 0.114202621087
0.0750867846463 8.41677445523
0.539508908428 0.165953631774
0.207067236755 2.02100965011
0.353832159703 1.96790843177
0.228500801252 1.61608207767
0.319833515783 0.29323758045
0.194465096224 1.44190613731
0.459937796903 0.799851537269
0.260766855479 0.926166492815
0.122973694392 1.396581563
0.517232336732 0.553261928707
0.0310172704322 0.0857058921688
0.0650712472079 27.429522236
3.38326225346e-06 -0.0212577629862
0.207094293299 2.02101496612
0.353799128034 1.96791261128
0.228501883607 1.61589019485
0.308283199207 0.29814521994
0.224258553315 1.19587286658
0.506780673002 0.774866028888
0.194184897426 0.334198754677
0.122973841208 1.39657429279
0.517439515233 0.553122533266
0.0999165976188 0.135102845242
0.124794732732 3.00625926914
4.57158833456e-07 -0.0179634746153
0.207037776704 2.02100387504
0.353806826337 1.9679116331
0.22850063063 1.61611204976
1.05552425822e-06 -0.0154908847623
0.321519533523 0.946477313183
0.567530209362 0.723885964104
0.251574185514 0.761882844057
0.122998834146 1.39608737598
0.241933040085 0.942685798433
0.0344456970382 0.0868231135956
0.22395677272 2.01387838586
9.11551815364e-07 -0.0195150623238
0.207059752165 2.02100818172
0.35382634029 1.96790916588
0.228500717328 1.61610642606
0.319689803618 0.293444656016
0.322178701289 0.946103770583
0.495423010872 0.773831019367
0.20829312915 0.394126399951
0.122974129908 1.39657309476
0.522688871217 0.549775794057
0.0330114370698 0.0933109208496
0.0755874761865 7.54989518278
0.53114216559 0.167023533211
0.207045024511 2.02100529727
0.353827325816 1.96790904628
0.228500687458 1.61611158572
0.196884553983 1.38590740395
0.322294389777 0.945885191263
0.333717026787 0.981395020676
0.247505844515 0.262754672291
0.123005096701 1.39598533886
0.28384211391 0.836661424987
0.100880949334 0.220964186612
0.091260918897 4.9974627262
1.06361590113e-05 -0.02410600975
0.207074719508 2.02101113102
0.353819536067 1.96791002771
0.228500736651 1.61609977271
0.319679170634 0.293459679689
0.322361324023 0.945752239564
0.615324611231 0.699729189998
0.189055611029 0.315866515751
0.122969140697 1.39660857477
0.517200720826 0.553281748558
0.100162352397 0.147814842398
0.223957652679 2.01387763128
0.0612747636297 -0.287649379418
0.207099362755 2.02101597317
0.353844783785 1.96790683242
0.228501551739 1.61591115554
0.319372702704 0.293780633636
0.322206509991 0.945070841775
0.402296549636 0.859548932553
0.227938662349 0.149099278058
0.122972437289 1.39660670984
0.516932825595 0.553456616061
0.0673912494576 0.17290889957
0.223447335115 2.01373956812
0.539463212244 0.165967301623
0.207089455339 2.02101401707
0.35382633988 1.96790916751
0.228500701675 1.6161106681
0.354575457873 0.254461073898
0.32235720471 0.945766229989
0.342257668277 0.95761058098
0.196485744515 0.342956106482
0.122973091291 1.39659330911
0.51595464844 0.553814175836
0.10000189613 0.127882814855
0.0831988671357 6.09468375733
0.0570177325091 -0.203396173817
0.207109034425 2.02101784708
0.353788388355 1.9679139675
0.228501012072 1.6162162562
0.406333177173 0.219094637898
0.194276431676 1.44393468383
0.64102237937 0.687544070664
0.208647157374 0.395845566756
0.122974444163 1.39657372019
0.429035424149 0.617032425215
0.0374411468294 0.103807475024
0.223522746612 2.01375935407
0.539501283044 0.165958219208
0.20705008607 2.02100629109
0.353839948606 1.96790744372
0.22850070131 1.61611116257
0.319757358918 0.293359674605
0.322237602018 0.945966358497
0.666934410802 0.674437510838
0.231707962022 0.153322375489
0.122977198175 1.39651907465
0.511593615317 0.556238281778
0.100775805751 0.206972262295
0.16557072942 2.34785530853
0.549247139406 0.162497764791
0.207069653912 2.02101012443
0.353811748833 1.96791101071
0.22850069748 1.61611254845
0.312203877492 0.305126472991
0.322398418457 0.945688115964
0.614953787884 0.699930451332
0.210609304274 0.405594039889
0.122973661705 1.396582188
0.51633259452 0.553841144148
0.0346525258851 0.0960943710985
0.164833977723 2.35674660958
0.47138766823 0.18217330513
0.207086807986 2.02101349699
0.353818551821 1.96791015056
0.228504031765 1.6163951576
0.084924148907 -0.243747107875
0.311379497718 0.965094514502
0.614995928344 0.699899902981
0.194740502719 0.336281126099
0.12299452975 1.39617453923
0.51728861562 0.553195657847
0.0997672389199 0.106048079574
0.15277128052 2.47497982481
0.496329892807 0.175672928287
0.207109037485 2.02101784707
0.353796174568 1.96791298142
0.228500731648 1.61610228976
0.404692068743 0.219785455876
0.322178942229 0.946105779867
0.479836473397 0.785663471124
0.279358576634 1.57989720632
0.122972486483 1.3966050331
0.521284944667 0.550943198104
0.00832947459574 0.0691947874998
0.126257635116 2.95548709286
0.538775885186 0.16610372853
0.207087040522 2.02101355489
0.353826341103 1.96790916751
0.228500681898 1.61611641757
0.0880081964646 -0.260110080198
0.317447323604 0.956680717493
0.54126754357 0.741805965425
0.221032919118 0.464652683303
0.122974123619 1.39657328609
0.51718539964 0.553292434535
0.0427647321673 0.105520979715
0.227863259944 2.01491337919
0.53938225809 0.165984543735
0.207042608466 2.02100482317
0.353838964196 1.96790756986
0.22850666067 1.61603398205
0.319701442316 0.293430299675
0.322864471661 0.943916612614
0.608231334171 0.703717301399
0.20972081432 0.401132124845
0.122982025863 1.39641993878
0.516557090376 0.553739382023
0.100178255713 0.14802130836
0.223608914482 2.01377990135
0.199997968754 0.428599782325
0.207054920385 2.02100723329
0.353831175363 1.96790855301
0.22850191519 1.61617346045
2.69493799721e-06 -0.0168349780911
0.322265936607 0.945939210607
0.306211271902 1.01165999654
0.209676381692 0.40091115572
0.122973329825 1.39658876772
0.517106411373 0.553343474557
0.0107743659444 0.0740100364304
0.228135198151 2.01497799355
0.361085062349 0.234657305402
0.208272300899 2.02096877049
0.353881937025 1.96871510778
0.22910402835 1.73901293564
0.231196340485 0.588342919476
0.192938980162 1.46728286791
1.10079493208 0.55291356545
0.243451753369 0.246689838014
0.131110799813 1.26101909726
0.574120763777 0.522145677113
0.0982449721252 0.105404627463
0.115900524976 4.80575780331
0.352874299509 0.238885987602
0.208302380196 2.02097459585
0.353903033019 1.96871250815
0.229104037695 1.73902019588
0.231196434468 0.588342079372
0.357616072176 0.888987732712
0.255906324546 1.19996710953
0.270652089737 0.200116227913
0.131085536203 1.261384862
0.537947494602 0.540758030163
0.0984294976585 0.11580919668
0.15583716787 2.56499086002
0.368795657216 0.232686085737
0.208299937912 2.02097412317
0.353882881453 1.96871499145
0.229104142636 1.73900850415
0.323153595745 0.32765849419
0.356996691772 0.88972394884
0.772691632228 0.633820796548
0.188261965834 0.337739268337
0.131124554762 1.26081886444
0.477053138774 0.578333825618
0.105921248051 0.0237632640527
0.236051355307 2.02044181922
0.552414751879 0.167734446153
0.208295058486 2.02097319029
0.353903977604 1.96871239181
0.229104713908 1.73925039172
0.24575719815 0.415421535544
0.35348910386 0.89485330155
1.38353837124 0.507868100498
0.243668241465 0.251997508416
0.131101536872 1.26116338042
0.341036632937 0.750737629114
0.0982143309075 0.103688519432
0.169104623611 2.42486837174
0.357663653901 0.236911834867
0.20834019941 2.02098192079
0.353899148027 1.96871299015
0.22910413785 1.73900565086
0.400921027028 0.242231130868
0.34853226914 0.901420813957
0.547893479806 0.744426402942
0.187320462914 0.334069551423
0.131096288189 1.26123984634
0.245351058141 1.02516949491
0.0982764187838 0.107203741335
0.235954930871 2.02046198554
0.408363265982 0.200781318546
0.20831254348 2.02097657035
0.353886656822 1.96871453094
0.22910405322 1.7390027718
0.231194743788 0.588355923164
0.19069102112 1.49316782215
0.256289098692 1.19713292323
0.261843902338 0.244918042892
0.131099128271 1.26118518675
0.281626561138 0.893583619211
0.0979316396078 0.0746700767982
0.136997574661 2.90171847855
0.368516901027 0.232844460389
0.208382933676 2.02099019761
0.35390586599 1.96871216071
0.229104847342 1.73898472799
0.231197090674 0.588339402308
0.22495047174 1.18561519195
0.875724733276 0.595356589043
0.261408701921 0.256299387355
0.133599326088 1.22660042742
0.435314824238 0.612199515462
0.0978084584698 0.0665247089366
0.0645483765374 20.5276411994
0.36822539678 0.232385327413
0.208269864782 2.0209683105
0.353893374926 1.96871370159
0.22910394419 1.73904930367
0.258794629462 0.387775063425
0.357144633659 0.889388352423
0.275601416445 1.14479711549
0.225163075699 0.165165803772
0.131089196997 1.2613312452
0.519145862329 0.554020673188
0.0983065556626 0.10887281557
0.236122316782 2.02045765181
0.602862558288 0.158274409125
0.208295055307 2.02097318407
0.353900091779 1.96871287541
0.22910407335 1.73899484882
0.40193060349 0.24299559213
0.251936154978 1.14565468585
0.272782892303 1.15948437509
0.243429626662 0.249812762175
0.131098321902 1.26120246403
0.639308085287 0.492874436796
0.0979472941485 0.0788132334117
0.138847893515 2.875704058
0.55792103428 0.167056472613
0.208302375555 2.02097459586
0.35389243082 1.96871381954
0.229104108247 1.73974131565
0.252213416773 0.472422023868
0.357564088649 0.889059383196
0.758311636735 0.638174219924
0.225583877121 0.119952186875
0.131108010779 1.26111271889
0.574089200352 0.522159572627
0.00995901232159 0.074966890991
0.0713404708541 11.4637595293
0.396826468428 0.216422068938
0.208370311541 2.02098773479
0.353884769124 1.9687147604
0.229103336253 1.73883425675
0.231183852693 0.586306177964
0.350838956482 0.894432012631
0.721438063752 0.653240592626
0.243451755129 0.246689843906
0.131099546386 1.26118071777
0.574210197166 0.522103484648
0.0983678341097 0.112316516853
0.238941666662 2.02036894639
0.445886375516 0.190527066113
0.208295058572 2.02097319651
0.353876165105 1.96871582395
0.229101902011 1.73943266681
0.231201561027 0.588302282788
0.35709420153 0.889755818425
0.410288651076 0.843954072366
0.186536889036 0.331062075024
0.131101887428 1.26114698075
0.406990654303 0.637212138909
0.0980941189735 0.0827736257556
0.239013015875 2.02035076456
0.361549245085 0.238067188491
0.208350369793 2.02098389666
0.353884769524 1.968714762
0.229104088082 1.73901383725
0.301172434732 0.347293746613
0.357593051103 0.889018734556
0.310470778692 0.994362640081
0.184574548802 0.323709764581
0.131099351596 1.26118352387
0.517973013472 0.55469488538
0.0981829518161 0.101898767795
0.235967477859 2.02045292139
0.556694164968 0.1668335305
0.208325147252 2.02097901189
0.353884769123 1.968714762
0.229101962492 1.73873618852
0.231169725323 0.588468201882
0.357456092471 0.889220224291
0.938596186305 0.585715665375
0.190258166104 0.345728762475
0.131081427034 1.26143955323
0.574487760131 0.521465025966
0.0983982655632 0.114021537467
0.239000069603 2.02036108781
0.368377876301 0.232927718094
0.208292220541 2.02097263417
0.353885712768 1.96871464728
0.229104014562 1.73901840994
0.231171694661 0.58856625701
0.192960704207 1.46707889186
0.831601150935 0.615371842318
0.243549228204 0.249385430148
0.131098547158 1.26118833856
0.549191815622 0.533917456849
0.0979185073683 0.0694397115775
0.175582350282 2.36560826257
0.371096020467 0.23143698297
0.208327589039 2.02097947858
0.353887710007 1.968714398
0.229104053566 1.7390026628
0.231196585495 0.588341197537
0.357394417058 0.88931026338
0.921585814112 0.585536272666
0.183685887387 0.320460151782
0.131099188156 1.26118589755
0.557229969711 0.530945445258
0.0973488572713 0.0174899646278
0.141015415075 2.82847395735
0.0202
0.439289537113 0.192190597433
0.208302380109 2.02097458963
0.353891486325 1.96871393266
0.229104575325 1.73947198078
0.240264687874 0.541999984528
0.431724530638 0.803835394921
0.981487291123 0.572856041237
0.210381472456 0.446384926449
0.131099317842 1.26118383267
0.571838637085 0.523332227201
0.0978995162922 0.0688878433375
0.238808446665 2.02036951691
0.368347326149 0.23294337149
0.208307658019 2.0209756186
0.353890542635 1.9687140506
0.229104078655 1.73899348445
0.231230103054 0.588077646658
0.358204190342 0.887502574116
0.864256673456 0.598871836354
0.2515260521 0.924679247099
0.131098858715 1.2611819378
0.589439335592 0.51437416707
0.0470095549236 0.115639278254
0.0882715858401 5.55792001639
0.211837709773 0.469708762588
0.208307661369 2.02097563726
0.353900092583 1.9687128738
0.229104284673 1.73892670058
0.231194101473 0.58832950535
0.357409903554 0.889245737936
0.655322451832 0.667827339362
0.183941938362 0.321392546912
0.131100093008 1.26117845769
0.552343738367 0.534707087795
0.0983984028731 0.114035825822
0.235971183409 2.0204478654
0.0202
0.451508515532 0.192018725682
0.208342641874 2.02098238761
0.353898204282 1.96871310489
0.229104088356 1.73898777219
0.231212176827 0.588151658004
0.357609976529 0.888991667165
0.27459133646 1.14960526376
0.21503477815 0.0732555258796
0.13110743169 1.26106626982
0.274352230936 0.881937019083
0.0982755732831 0.107118461254
0.0814585760198 6.64738807272
0.445904699554 0.190587280116
0.208325150434 2.02097901811
0.353900092581 1.96871287541
0.229104110502 1.73898097718
0.231192347064 0.588375122201
0.301485101586 1.00647338748
0.271925114905 1.16479812144
0.227051189632 0.136149369955
0.131093454175 1.26129863564
0.574117184787 0.522147480993
0.0982451158427 0.105418715027
0.239008243687 2.02035774611
0.352996343852 0.23832133772
0.208317424566 2.02097750975
0.353889598953 1.96871416694
0.22910407094 1.73900542578
0.231213430657 0.588213279367
0.195664669148 1.43774508555
0.948611313114 0.583246076501
0.208372275583 0.0478184440795
0.131099251733 1.26118497413
0.57049051252 0.524172174558
0.108646530161 0.00628887531765
0.149549024905 2.68541158872
0.554322870268 0.166489916558
0.208365029241 2.02098671709
0.353903977202 1.96871239181
0.229104047219 1.73900584156
0.231196570456 0.588340718062
0.35759909039 0.889008045952
0.27488118379 1.1484622628
0.270652089737 0.200116227913
0.131099843086 1.26117627713
0.275871931796 0.895430919524
0.0985061735887 0.131028342327
0.0939565095879 4.96340678018
1.63430146074e-05 -0.0252322781597
0.207879311869 2.02100026438
0.354311691809 1.96807126728
0.229460926856 1.66023871819
0.273746362474 0.389859181512
0.332558923551 0.930600758989
0.372080756834 0.897889471075
0.250036376373 0.767062669867
0.125550403651 1.35701025098
0.690941623287 0.473290583499
0.0356419287812 0.0895222430787
0.0856791192166 5.82373194177
3.35597339784e-06 -0.0219192676122
0.207935743429 2.02101123801
0.354351362269 1.96806628986
0.229461788052 1.6602020353
0.273795343338 0.389764261034
0.25554272252 1.13821039273
0.456761199607 0.798357817479
0.250036376373 0.767062669867
0.125543015394 1.35710518356
0.291121653527 0.860621004687
0.0336094593685 0.0925159564695
0.226378124734 2.01540706484
0.525062548479 0.171289230357
0.207862225447 2.02099695532
0.354320402306 1.9680701753
0.229461372975 1.66023223601
0.219096524258 0.629337437026
0.193263139499 1.46453785609
0.742440852193 0.644864239164
0.20627067421 0.391862378112
0.125540183155 1.35715439492
0.500533800864 0.567610999565
0.0630858531181 0.163062681968
0.171059940362 2.34971649126
4.11732302936e-05 -0.0281133381786
0.207913843796 2.02100699839
0.354319431067 1.96807029729
0.229460760287 1.66028368379
0.273649688344 0.390151246402
0.326880770365 0.94038350235
0.254996622763 1.20868001834
0.25019941288 0.769558664454
0.12554226103 1.3571186117
0.52677824176 0.549331943766
0.0553151379868 0.136801296813
0.0703758633802 11.8046133031
2.29727773871e-06 -0.0206800813048
0.207859826222 2.020996477
0.354320402706 1.96807017854
0.229460646642 1.6603181806
0.0695507918594 -0.154396336411
0.331884803317 0.931789778077
0.724513097621 0.65046443439
0.254142138436 0.83471365981
0.125540078619 1.35715389545
0.52842652707 0.548225922162
0.0998848223409 0.0995361948836
0.226683887682 2.01547786257
0.607171569507 0.156079273693
0.207886829895 2.02100174552
0.354296216722 1.96807321195
0.229460826113 1.66026148485
0.273833193439 0.389656482635
0.332338629421 0.930970193437
0.555412923919 0.734520820171
0.251137072217 0.784203554936
0.125540043091 1.35715946091
0.519509562106 0.553702346357
0.0997469307678 0.0787979040871
0.226849605115 2.01551321543
3.05735404993e-07 -0.0174807824122
0.207822940798 2.0209893022
0.354317487802 1.96807054127
0.229469131512 1.65995555319
0.303843682662 0.320926856629
0.316295773016 0.961439492533
0.806298888583 0.622956606876
0.219795038918 0.0842815507174
0.129755350358 1.30571209296
0.510026069387 0.560115357839
0.0659857117315 0.159528941685
0.0727700074817 10.396774644
0.0201
0.678905851769 0.144572296776






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-140-4fc246001a0b> in <module>()
      1 fitted_weights_R = np.ones(len(snitz_descriptors))
----> 2 fitted_weights_R = fmin_l_bfgs_b(f2,fitted_weights_R,fprime=None,args=(CIDs_snitz,1e-4,tests,results),approx_grad=True,maxiter=1e6)[0]

/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/scipy/optimize/lbfgsb.py in fmin_l_bfgs_b(func, x0, fprime, args, approx_grad, bounds, m, factr, pgtol, epsilon, iprint, maxfun, maxiter, disp, callback)
    184 
    185     res = _minimize_lbfgsb(fun, x0, args=args, jac=jac, bounds=bounds,
--> 186                            **opts)
    187     d = {'grad': res['jac'],
    188          'task': res['message'],

/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/scipy/optimize/lbfgsb.py in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, **unknown_options)
    312                 # minimization routine wants f and g at the current x
    313                 # Overwrite f and g:
--> 314                 f, g = func_and_grad(x)
    315         elif task_str.startswith(b'NEW_X'):
    316             # new iteration

/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/scipy/optimize/lbfgsb.py in func_and_grad(x)
    259         def func_and_grad(x):
    260             f = fun(x, *args)
--> 261             g = approx_fprime(x, fun, epsilon, *args)
    262             return f, g
    263     else:

/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/scipy/optimize/optimize.py in approx_fprime(xk, f, epsilon, *args)
    608         ei[k] = 1.0
    609         d = epsilon * ei
--> 610         grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]
    611         ei[k] = 0.0
    612 

/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/scipy/optimize/optimize.py in function_wrapper(*wrapper_args)
    279     def function_wrapper(*wrapper_args):
    280         ncalls[0] += 1
--> 281         return function(*(wrapper_args + args))
    282 
    283     return ncalls, function_wrapper

<ipython-input-139-9924729d60f1> in f2(weights, descriptors, alpha, tests, results)
     23         x = angles[key]
     24         y = fraction_correct[key]
---> 25         a,n = fmin_tnc(f_hill,[0.1,2],approx_grad=True,args=(x,y),bounds=((0,99),(-99,99)))[0]
     26         print(a,n)
     27         #slope, intercept, _ , _, _ = linregress(x,y)

/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/scipy/optimize/tnc.py in fmin_tnc(func, x0, fprime, args, approx_grad, bounds, epsilon, scale, offset, messages, maxCGit, maxfun, eta, stepmx, accuracy, fmin, ftol, xtol, pgtol, rescale, disp, callback)
    261             'disp': False}
    262 
--> 263     res = _minimize_tnc(fun, x0, args, jac, bounds, callback=callback, **opts)
    264 
    265     return res['x'], res['nfev'], res['status']

/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/scipy/optimize/tnc.py in _minimize_tnc(fun, x0, args, jac, bounds, eps, scale, offset, mesg_num, maxCGit, maxiter, eta, stepmx, accuracy, minfev, ftol, xtol, gtol, rescale, disp, callback, **unknown_options)
    396                                         offset, messages, maxCGit, maxfun,
    397                                         eta, stepmx, accuracy, fmin, ftol,
--> 398                                         xtol, pgtol, rescale, callback)
    399 
    400     funv, jacv = func_and_grad(x)

/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/scipy/optimize/tnc.py in func_and_grad(x)
    354         def func_and_grad(x):
    355             f = fun(x, *args)
--> 356             g = approx_fprime(x, fun, epsilon, *args)
    357             return f, g
    358     else:

/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/scipy/optimize/optimize.py in approx_fprime(xk, f, epsilon, *args)
    608         ei[k] = 1.0
    609         d = epsilon * ei
--> 610         grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]
    611         ei[k] = 0.0
    612 

<ipython-input-133-017710b546aa> in f_hill(params, x, y)
      7 def f_hill(params,x,y):
      8     a,n = params
----> 9     p_d = 1.0 / (1.0 + (x/a)**(-n))
     10     p_c = 1.0/3 + p_d*(2.0/3)
     11     ll = binom.logpmf((y*26).astype(int),26,p_c).sum() # 26 subjects.

KeyboardInterrupt:



In [138]:

    
fitted_weights_R









    Out[138]:





array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])



In [137]:

    
fitted_angles_R, fraction_correct = compute_angles(tests,results,CIDs_snitz,weights=fitted_weights_R)
plot_correct_vs_angles(fitted_angles_R, fraction_correct, color_d=True)



In [40]:

    
fitted_angles_R, fraction_correct = compute_angles(tests,results,CIDs_snitz,weights=fitted_weights_R)
plot_correct_vs_angles(fitted_angles_R, fraction_correct, color_d=True)



In [17]:

    
fitted_weights_nmf_200_R = np.ones(200)
fitted_weights_nmf_200_R = fmin_l_bfgs_b(f2,fitted_weights_nmf_200_R,fprime=None,args=(CIDs_nmf_200,1e-4,tests,results),approx_grad=True,maxiter=1e6)[0]









    



/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/IPython/kernel/__main__.py:5: RuntimeWarning: invalid value encountered in double_scalars
/Users/rgerkin/Dropbox/python3/lib/python3.4/site-packages/IPython/kernel/__main__.py:5: RuntimeWarning: divide by zero encountered in double_scalars






    



0.0384
0.0384
0.0384
0.0384
0.0384
0.0384
0.0384
0.0384
0.0384
0.0384
0.0384
0.0384
0.0365
0.0365
0.0365
0.0365
0.0365
0.0321
0.0321
0.0321
0.0321
0.0321
0.0321
0.0321
0.0284
0.0284
0.0284
0.0284
0.0284
0.0284
0.0284
0.0284
0.0284
0.0284
0.036
0.036
0.036
0.036
0.036
0.036
0.036
0.036
0.036
0.036
0.036
0.0226
0.0226
0.0226
0.0226
0.0226
0.0226
0.0226
0.0226
0.0226
0.0226
0.0226
0.0226
0.0705
0.0705
0.0705
0.0705
0.0705
0.0705
0.0705
0.0705
0.0259
0.0259
0.0259
0.0259
0.0259
0.0259
0.0222
0.0222
0.0222
0.0215
0.0215
0.0215
0.0215
0.0215
0.0215
0.0215
0.0215
0.0215
0.0215
0.0215
0.0215
0.0215
0.0215
0.0215
0.0215
0.0215
0.0215
0.0215
0.0215
0.0209
0.0209
0.0209
0.0209
0.0209
0.0209
0.0209
0.0209
0.0209
0.0204
0.0204
0.0204
0.0204
0.0204
0.0204
0.0204
0.0199
0.0199
0.0199
0.0199
0.0199
0.0199
0.0199
0.0195
0.0195
0.0195
0.0195
0.0195
0.0195
0.0195
0.019
0.019
0.019
0.019
0.019
0.019
0.019
0.019
0.019
0.019
0.019
0.019
0.019
0.019
0.019
0.019
0.0184
0.0184
0.0184
0.0184
0.0184
0.0184
0.0184
0.0184
0.0184
0.0184
0.0184
0.0184
0.0184
0.0187
0.0187
0.0187
0.0187
0.0187
0.0187
0.0187
0.0187
0.0182
0.0182
0.0182
0.0182
0.0182
0.0182
0.0182
0.0182
0.0182
0.0183
0.0183
0.0183
0.0183
0.0183
0.0183
0.0183
0.0183
0.0183
0.0183
0.0181
0.0181
0.0181
0.0181
0.0181
0.0181
0.018
0.018
0.018
0.018
0.018
0.018
0.0178
0.0178
0.0178
0.0178
0.0178
0.0178
0.0178
0.0177
0.0177
0.0177
0.0177
0.0176
0.0176
0.0176
0.0176
0.0176
0.0176
0.0176
0.0176
0.0176
0.0176
0.0176
0.0176
0.0176
0.0176
0.0175
0.0175
0.0175
0.0175
0.0175
0.0175
0.0175
0.0175
0.0175
0.0173
0.0173
0.0173
0.0173
0.0173
0.0173
0.0173
0.0173
0.0172
0.0172
0.0172
0.0172
0.0172
0.0172
0.0172
0.0172
0.0172
0.0172
0.017
0.017
0.017
0.017
0.017
0.0169
0.0169
0.0169
0.0169
0.0169
0.0169
0.0169
0.0169
0.0167
0.0167
0.0167
0.0167
0.0167
0.0167
0.0167
0.0167
0.0167
0.0167
0.0167
0.0167
0.0167
0.0167
0.0167
0.0167
0.0167
0.0167
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0166
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0165
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0164
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0163
0.0162
0.0162
0.0162
0.0162
0.0162
0.0162
0.0162
0.0162
0.0162
0.0162
0.0162
0.0162
0.0162
0.0162
0.0162
0.0162
0.0162
0.0162
0.0162
0.0162
0.0162
0.0162
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.0161
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.016
0.0159
0.0159
0.0159
0.0159
0.0159
0.0159
0.0159
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0158
0.0157
0.0157
0.0157
0.0157
0.0157
0.0157
0.0157
0.0157
0.0157
0.0157
0.0157
0.0157
0.0157
0.0157
0.0157
0.0157
0.0157



In [18]:

    
fitted_angles_nmf_200_R, fraction_correct = compute_angles(tests,results,CIDs_nmf_200,weights=fitted_weights_nmf_200_R)
plot_correct_vs_angles(fitted_angles_nmf_200_R, fraction_correct, color_d=True)



In [134]:

    
fitted_angles_nmf_200_R, fraction_correct = compute_angles(tests,results,CIDs_nmf_200,
                                                           weights=fitted_weights_nmf_200_R,
                                                           method='sum',
                                                           method_param=1.0)
plot_correct_vs_angles(fitted_angles_nmf_200_R, fraction_correct, color_d=True)



In [119]:

    
p_c[7],(y*26)[7]









    Out[119]:





(0.85051715761393876, 14.999999999999998)



In [ ]: