In [1]:
!pip install plinkio
##Load data:
import os
import re
import numpy as np
import pandas as pd
from plinkio import plinkfile
import time
import sklearn.covariance as Cov
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from statsmodels.discrete.discrete_model import Probit
#from scipy.linalg.blas import dsyrk 
    #--can't find a way to get this working. Perhaps blas routines are missing.
    
data_path = '/home/jovyan/work/LEAP/leap/regression/dataset1'
os.chdir(data_path)


Requirement already satisfied (use --upgrade to upgrade): plinkio in /opt/conda/lib/python3.4/site-packages

In [ ]:
"""
author: gene burinskiy

Goal: 
Finding a set of individuals who are related to other individuals in the study. 
LEAP employs a greedy algorithm to find a small subset of such individuals, 
such that after their exclusion, there are no related individuals in the study. 
These individuals are excluded from the analysis in stages 3 and 4 below, 
but after fitting a model in stage 4, their liabilities are estimated along with 
other indviduals. All individuals are considered in the GWAS stage (stage 5).

source for algorithm and methods. References are cited in functions. 
https://github.com/omerwe/LEAP/blob/master/leap/regression/Leap_example.ipynb

I, in some cases, heavily modify or write new code to perform the tasks in the
Goal. 

NOTES: this is an exploration file and includes code that would otherwise be redundant. 
"""

In [2]:
##Load data:
bed = plinkfile.open("dataset1")

loci = bed.get_loci()
print("Length of locuses", len(loci))
chromosomes = np.unique([x.chromosome for x in loci])
print("# of chromosomes in data:",chromosomes)

samples = bed.get_samples()
print("Number of individuals in data:", len(samples))


Length of locuses 10499
# of chromosomes in data: [ 1  2  3  4  5  6  7  8  9 10]
Number of individuals in data: 1000

In [3]:
##Place data into a dataframe:
mat = np.zeros((len(loci),len(samples)), dtype='int16') #1/4 of the taken up space by using int16

##don't know a faster method of extracting the data from the bed file.
i=0
for row in bed:
    mat[i,:] = np.array([snp for snp in row])
    i+=1
    
#this matrix is equivalent to transposed bed.val
print("Data type:", mat.dtype)
print("Size of bed matrix: %4.0fmb\n" %(mat.nbytes/(1024**2)))

#create a multi-indexed column space
tuples = [(x.chromosome,x.name) for x in loci]
ml_index = pd.MultiIndex.from_tuples(tuples, names = ['chromosome', 'snp'])

df = pd.DataFrame(mat.transpose(), columns=ml_index, index = [x.iid for x in bed.get_samples()]) 
df.info()
df.iloc[:5,:5]


Data type: int16
Size of bed matrix:   20mb

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, person1 to person1000
Columns: 10499 entries, (1, csnp18) to (10, snp10483)
dtypes: int16(10499)
memory usage: 20.0+ MB
Out[3]:
chromosome 1
snp csnp18 csnp35 csnp59 csnp78 csnp85
person1 2 1 1 2 1
person2 1 0 2 2 2
person3 0 2 2 2 2
person4 2 1 2 2 1
person5 0 1 2 1 2

In [4]:
##compute covariance matrix between individuals, remove those who are too close to each other.
#they LEAP code uses dsyrk which halves the computational time. Alas, we can't use it y

df = df.astype('float64')-df.astype('float64').mean() 
df.info()

dot_cov = np.dot(df, df.transpose())/(df.shape[1]-1) #having difficulties with scipy's linalg module
#note that the above takes more than half the time of np.cov
print("\nCovariance shape:" , dot_cov.shape)
print("Covariance memory usage in mb:", dot_cov.nbytes/(1024**2))
dot_cov[:5,:5]


<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, person1 to person1000
Columns: 10499 entries, (1, csnp18) to (10, snp10483)
dtypes: float64(10499)
memory usage: 80.1+ MB

Covariance shape: (1000, 1000)
Covariance memory usage in mb: 7.62939453125
Out[4]:
array([[ 0.36816713,  0.00128849, -0.00865588, -0.00119474,  0.0038927 ],
       [ 0.00128849,  0.35826201,  0.0033948 ,  0.00228287,  0.00136917],
       [-0.00865588,  0.0033948 ,  0.36285412,  0.00443604, -0.00057367],
       [-0.00119474,  0.00228287,  0.00443604,  0.36310693,  0.00183888],
       [ 0.0038927 ,  0.00136917, -0.00057367,  0.00183888,  0.37099567]])

In [5]:
##trying to use a shrinkage estimator for the covariance matrix. Inspired by the computeCovar
##function in https://github.com/omerwe/LEAP/blob/master/leap/leapUtils.py
oa_fit = Cov.OAS().fit(df.transpose())
lw_fit = Cov.LedoitWolf().fit(df.transpose())

In [37]:
cutoff = .05
oa_cov = oa_fit.covariance_
bool_arr =  np.tril(oa_cov, k=-1)>cutoff
y_idx,_ = np.where(bool_arr)
print("Under Oracle shrinkage:")
print("shape of y:",np.unique( y_idx).shape)
print(np.unique(y_idx))


Under Oracle shrinkage:
shape of y: (45,)
[ 20  37  47  62  66  76  82  89 104 112 128 142 151 156 160 161 163 164
 165 172 173 189 194 196 197 199 205 209 210 211 213 214 217 221 222 232
 233 234 241 242 244 246 355 439 499]

In [38]:
np_cov = np.cov(df, ddof=0)
bool_arr =  np.tril(np_cov, k=-1)>cutoff
y_idx,_ = np.where(bool_arr)
print("Under numpy cov:")
print("shape of y:", np.unique(y_idx).shape)
print(np.unique(y_idx))


Under numpy cov:
shape of y: (47,)
[ 20  37  47  62  66  76  82  89 104 112 128 142 151 156 160 161 162 163
 164 165 172 173 187 189 194 196 197 199 205 209 210 211 213 214 217 221
 222 232 233 234 241 242 244 246 355 439 499]

In [41]:
lw_cov = lw_fit.covariance_
bool_arr =  np.tril(lw_cov, k=-1)>cutoff
y_idx,_ = np.where(bool_arr)
print("Under Ledoit-Wolf estimate")
print("shape of y:", np.unique(y_idx).shape)
print(y_idx)


Under Ledoit-Wolf estimate
shape of y: (37,)
[ 20  37  66  76  82  89 104 112 128 142 156 161 163 164 172 173 189 194
 197 199 205 209 210 211 213 214 217 221 222 232 233 234 242 244 246 246
 355 439]

In [50]:
np_s, np_U = np.linalg.eigh(np_cov, 'L')
oa_s,oa_U = np.linalg.eigh(oa_cov, 'L')
lw_s, lw_U = np.linalg.eigh(lw_cov, 'L')
dot_s, dot_U = np.linalg.eigh(dot_cov, 'L')

In [51]:
print(np_s[:3])
print(oa_s[-3:])
print(lw_s[:3])
print(dot_s[:3])


[  1.05943703e-12   1.30036478e-01   1.31650207e-01]
[ 0.57159555  0.57757169  2.35200119]
[ 0.1773261   0.24439076  0.24522302]
[ -1.49183313e-06   1.30050436e-01   1.31666958e-01]

In [5]:
cutoff = .05
bool_arr =  np.tril(dot_cov, k=-1)>cutoff
y_idx,_ = np.where(bool_arr)
print("shape of y:", y_idx.shape)
print("\nremoving %d individuals" %y_idx.shape[0])

#note, they marked 54 so we marked more peeps, we effectively remove 56 rows. Something doesn't line up.
indxToKeep = set(range(dot_cov.shape[0]))
print("Original num of ppl:", len(indxToKeep))

[indxToKeep.remove(i) for i in np.unique(y_idx)]
keepArr = np.array(list(indxToKeep))
print("num of kept ppl:", keepArr.shape[0])


shape of y: (56,)

removing 56 individuals
Original num of ppl: 1000
num of kept ppl: 953

In [10]:
#exploring different ways to exclude individuals found above.
cov_m = np.ma.array(cov,mask=False)
cov_m.mask[y_idx,:] = True
cov_m.mask[:,y_idx] = True

print(cov_m.sum())

cov_c = np.delete(np.delete(cov, y_idx, axis=0), y_idx, axis=1)
print(cov_c.sum())


22.203
22.2029

In [6]:
##Our calc_h2 function for Step 3
#uses the calc_h2.calc_h2 functions
from sklearn.linear_model import LogisticRegression
from scipy import stats

#read in phenofile:
phenos = pd.read_csv("dataset1.phe", sep=' ', header=None, engine='c')
phenos.columns = ['fam', 'person', 'pheno']
phenos.set_index(keys = 'person', inplace=True)
phenos.iloc[:5,:5]


Out[6]:
fam pheno
person
person1 FAM1 0
person2 FAM1 0
person3 FAM1 0
person4 FAM1 0
person5 FAM1 0

In [7]:
def calcLiabThresholds_3xx(U,s, keepArr, phe, numRemovePCs=10, prevalence = .001, covar=None): 
    """
    INPUTS:
        1. U - left eigenvectors of covariance matrix (ie kinship matrix)
        2. S - eigenvalues of covariance matrix (ie kinship matrix)
        3. keepArr - np.array of indexes that exclude highly related individuals
        4. phe - np.array of phenotypes (binary only)
        5. covar - god knows. specified in author functions but remains undefined. 
    OUTPUT:
        1. probs - probability estimates from a regularized logistic regression
        2. threshold - no idea what this is, I assume they're estimated liabilities?
    NOTES:
        original code can be found on:
        https://github.com/omerwe/LEAP/blob/master/leap/calc_h2.py
    """
    ##------------------------------------ CalcLiabThreshold -----------------------------------
    ##probs, thresholds = calcLiabThreholds(U, S, keepArr, phe, numRemovePCs, prevalence, covar) 

    #This is equivalent to an SVD decomposition; note their covar parameter is defaulted to None
    G = U[:, -numRemovePCs:] * np.sqrt(s[-numRemovePCs:])

    #perform a regularized logistic regression. I trust their parameter settings.
    Logreg = LogisticRegression(penalty='l2', C=500000, fit_intercept=True)
    Logreg.fit(G[keepArr, :], phe.iloc[keepArr])

    #Compute individual thresholds
    probs = Logreg.predict_proba(G)[:,1]
    
    #Compute thresholds
    prev = prevalence
    P = np.sum(phe==1) / float(phe.shape[0])
    #K = prev --why, why in the (insert explicative) hell do they do this?
    Ki = prev*(1-prev) / (P*(1-prev)) * probs / (1 + prev*(1-prev) / (P*(1-prev))*probs - probs)
    thresholds = stats.norm(0,1).isf(Ki)
    thresholds[Ki>=1.] = -999999999
    thresholds[Ki<=0.] = 999999999
    
    return([probs, thresholds])

In [10]:
def calcH2Binary(XXT_o, phe_o, probs_o, thresholds_o, keepArr_o, prev, h2coeff):
    """
    INPUT:
        1. XXT - covariance matrix (kinship matrix) * number of snps
        2. phe - np.array of phenotypes. In our case, they're binary.
        3. probs - np.array of probabilities
        4. thresholds - np.array of something (I believe they're estimated liabilities)
        5. keepArr - np.array of indexes that exclude highly related individuals.
        6. prev - prevalence
        7. h2coeff - no idea. they set it to 1.0 for synthetic data. .875 otherwise
    NOTES:
        Many items have been removed for sake of more compact code. Namely, the actions if
        thresholds is None. 
        Original code can be found on:
        https://github.com/omerwe/LEAP/blob/master/leap/calc_h2.py
    """
    K = prev
    P = np.sum(phe_o>0) / float(phe_o.shape[0])
    
    #index out individuals we do not want. In order to avoid reassining variables,
    #I assign the input objects to new objects which are views.
    XXT = XXT_o[np.ix_(keepArr, keepArr)]
    phe = phe_o[keepArr]

    probs = probs_o[keepArr]
    thresholds = thresholds_o[keepArr]
    
    Ki = K*(1-P) / (P*(1-K)) * probs / (1 + K*(1-P) / (P*(1-K))*probs - probs)
    phit = stats.norm(0,1).pdf(thresholds)
    probsInvOuter = np.outer(probs*(1-probs), probs*(1-probs))
    y = np.outer(phe-probs, phe-probs) / np.sqrt(probsInvOuter)	
    sumProbs = np.tile(np.column_stack(probs).T, (1,probs.shape[0])) + np.tile(probs, (probs.shape[0], 1))
    Atag0 = np.outer(phit, phit) * (1 - (sumProbs)*(P-K)/(P*(1-K)) + np.outer(probs, probs)*(((P-K)/(P*(1-K)))**2)) / np.sqrt(probsInvOuter)
    B0 = np.outer(Ki + (1-Ki)*(K*(1-P))/(P*(1-K)), Ki + (1-Ki)*(K*(1-P))/(P*(1-K)))
    x = (Atag0 / B0 * h2coeff) * XXT

    y = y[np.triu_indices(y.shape[0], 1)]
    x = x[np.triu_indices(x.shape[0], 1)]

    slope, intercept, rValue, pValue, stdErr = stats.linregress(x,y)
    
    return slope

In [80]:
import numpy as np
import sklearn.linear_model
import scipy.optimize as opt

def evalProbitReg(beta, X, cases, controls, thresholds, invRegParam, normPDF, h2):
    """
    NOTES: not much to do here as everything is in numpy. 
    """
    XBeta = np.ravel(X.dot(beta)) - thresholds
    phiXBeta = normPDF.pdf(XBeta)
    PhiXBeta = normPDF.cdf(XBeta)

    logLik = np.sum(np.log(PhiXBeta[cases])) + np.sum(np.log(1-PhiXBeta[controls]))	
    w = np.zeros(X.shape[0])
    w[cases] = -phiXBeta[cases] / PhiXBeta[cases]
    w[controls] = phiXBeta[controls] / (1-PhiXBeta[controls])
    grad = X.T.dot(w)

    #regularize
    logLik -= 0.5*invRegParam * beta.dot(beta)	#regularization	
    grad += invRegParam * beta
    return (-logLik, grad)

def probitRegHessian(beta, X, cases, controls, thresholds, invRegParam, normPDF, h2):
    """
    NOTES: not much to do here as everything is in numpy. Though, I precalculated
    PhiXBeta and then subset that because it was originally done for each subset. It is, trivially,
    faster to precompute the element-wise squaring and then subset. 
    """
    XBeta = np.ravel(X.dot(beta)) - thresholds
    phiXBeta = normPDF.pdf(XBeta)
    PhiXBeta = normPDF.cdf(XBeta)

    XbetaScaled = XBeta #/(1-h2)

    #PhiXBeta2 = np.square(PhiXBeta)
    R = np.zeros(X.shape[0])
    R[cases] = (XbetaScaled[cases]*PhiXBeta[cases] + phiXBeta[cases]) / PhiXBeta[cases]**2
    R[controls] = (-XbetaScaled[controls]*(1-PhiXBeta[controls]) + phiXBeta[controls]) / (1 - PhiXBeta[controls])**2

    R *= phiXBeta
    H = (X.T * R).dot(X)
    H += invRegParam
    return H


def probitRegression(X, y, thresholds, numSNPs, numFixedFeatures, h2, useHess, maxFixedIters, epsilon, nofail):
    """
    NOTE: If I had more time, I would probably use PyMC3 for this ... eventually. For now, just removed superfluous
    parts.
    0. print statement parantheses added. 
    1. Num of Fixed effects = 0 => delete fixed effect estimation code. 
    """
    regParam = h2 /  float(numSNPs)	
    Linreg = sklearn.linear_model.Ridge(alpha=1.0/(2*regParam), fit_intercept=False, normalize=False, solver='lsqr')
    Linreg.fit(X, y)
    initBeta = Linreg.coef_
    np.random.seed(1234)

    normPDF = stats.norm(0, np.sqrt(1-h2))
    invRegParam = 1.0/regParam
    controls = (y==0)
    cases = (y==1)
    funcToSolve = evalProbitReg
    hess =(probitRegHessian if useHess else None)
    jac= True
    method = 'Newton-CG'
    args = (X, cases, controls, thresholds, invRegParam, normPDF, h2)
    print('Beginning Probit regression...')
    t0 = time.time()
    optObj = opt.minimize(funcToSolve, x0=initBeta, args=args, jac=jac, method=method, hess=hess)
    print('Done in', '%0.2f'%(time.time()-t0), 'seconds')
    if (not optObj.success):
        print('Optimization status:', optObj.status)
        print(optObj.message)
        if (nofail == 0): raise Exception('Probit regression failed with message: ' + optObj.message)
    beta = optObj.x

    return beta

def probit_3xx(df, phe, h2, prev, U,s, keepArr, thresholds=None,  covar=None,  nofail=0, outFile = None,
    numSkipTopPCs=10, mineig=1e-3, hess=1, recenter=1, maxFixedIters=1e2, epsilon=1e-3, treatFixedAsRandom=False):
    """
    INPUT:
        1. df - pandas data frame of normalized snp values. df excludes current chromosome.
        2. ph - pandas data frame of phenotypes
        4. h2 - np.array of calculated something
        5. prev - prevalence
        6. U,S - left eigenvectors and eigenvalues.
        7. thresholds - calculated thresholds. 
    Modifications:
        1. No longer read in the bed, phenotype file, 
        2. no longer set binary phenotype cases.
        3. get U and s directly, not from eigen dictionary
        5. removed covar statement.
        6. commented out saving of file as the current approach seems slow and requires interaction with the disk.
            there are faster ways of saving these things BUT no time to fix that now until later. Also, just for
            clarity, rewrote the code to adhere to modern Python practises. Finally, that section does not
            seem to pertain to the output. 
        7. They use "structure" instead of dictionary -> C world. 
        8. Instead of a dictionary, I return a Pandas DataFrame ... because we <3 Pandas. Also, header was set to
            None, so we remove it as output. Finally, we return directly, don't save it to LiabStructure or 
            liab_df in our case.
        9. Replaced a few probitRegression inputs to adhere to our data structures.
    Default parameters set from the argparse section in the original code. Original code can be found
    in:
    https://github.com/omerwe/LEAP/blob/master/leap/probit.py
    """
    ncols = df.shape[1]

    #run probit regression
    t = stats.norm(0,1).isf(prev)
    if (thresholds is not None): t = thresholds

    S = np.sqrt(s*ncols)
    goodS = (S>mineig)
    if (numSkipTopPCs > 0): goodS[-numSkipTopPCs:] = False
    if (np.sum(~goodS) > 0): print('\t\tRemoving', np.sum(~goodS), 'PCs with low variance')
    G = U[:, goodS]*S[goodS]
    
    #Recenter G	to only consider the unrelated individuals
    if recenter: G -= np.mean(G[keepArr, :], axis=0)
    else: G -= np.mean(G, axis=0)
    
    #Run Probit regression
    numFixedFeatures = 0
    probitThresh = (t if thresholds is None else t[keepArr])
    
    #I believe bed.sid.shape is the sid_count. In our data, that is sid_count = df.shape[1] 
    beta = probitRegression(G[keepArr, :], phe[keepArr], probitThresh, ncols, numFixedFeatures, h2, hess, maxFixedIters, epsilon, nofail)
    
    #predict probabilies
    meanLiab = G.dot(beta)
    liab = meanLiab.copy()
    indsToFlip = np.logical_or( np.logical_and((liab <= t), (phe>0.5)), np.logical_and((liab > t),(phe<0.5)))
    liab[indsToFlip] = stats.norm(0,1).isf(prev)
    
    #Return phenotype dictionary with liabilities
    return pd.DataFrame( {'vals':liab,'person':df.index})

In [82]:
#with multi-index, we index by using the number of the chromosome. 
#This avoids copying of data -> we use views on the data. Immeasurably more efficient

prevalence = .001
numRemovePCs = 10
#chrom = chromosomes[2]
for chrom in chromosomes[:3]:
    print("Working on chromosome: %s" %chrom)

    exclude_chrom = set(chromosomes)
    exclude_chrom.remove(chrom) #set all chromosomes except current
    exclude_chrom = list(exclude_chrom)

    t0 = time.time()
    #Note that the original code puts cov, s, U into a dictionary called "eigen"
    #They do not actually perform an SVD decomposition. Instead, they compute
    #the covariance matrix, decompose that and use an equivalence relation between
    #SVD and the decomposition of the covariance matrix. 

    #XXT = np.dot(df[exclude_chrom], df[exclude_chrom].transpose())
    #s,U = np.linalg.eigh(XXT, 'L') #would use scipy except -again- can't get it to load.

    XXT = Cov.OAS().fit(df[exclude_chrom].transpose()).covariance_
    s,U = np.linalg.eigh(XXT, 'L')

    #calc_h2 function
    if numRemovePCs>0:
        t_XXT = XXT -  (U[:,-numRemovePCs:]*s[-numRemovePCs:]).dot(U[:,-numRemovePCs:].transpose())

    pheUnique = np.unique(phenos.pheno)
    isCaseControl = pheUnique.shape[0] == 2 #trivial condition for us

    if ~np.all(pheUnique == np.array([0,1])):
        pheMean = phenos.pheno.mean()
        phenos.pheno[phenos.pheno <= pheMean] = 0
        phenos.pheno[phenos.pheno> pheMean] = 1

    probs, thresholds= calcLiabThresholds_3xx(U,s, keepArr, phenos.pheno)
    #h2coeff = .875 but set to 1.0 for synthetic data. 
    h2 = calcH2Binary(t_XXT, phenos.pheno, probs, thresholds, keepArr, prev=prevalence, h2coeff=1.0)
    print("\th2 values: %f" %h2)
    
    #liabs = probit_3xx(df, phenos.pheno, h2, prevalence, U,s, keepArr)

    print("\t Took %.2f seconds" %(time.time()-t0))


Working on chromosome: 1
	h2 values: 0.438676
	 Took 0.83 seconds
Working on chromosome: 2
	h2 values: 0.426759
	 Took 0.80 seconds
Working on chromosome: 3
	h2 values: 0.446566
	 Took 0.70 seconds

In [ ]:


In [33]:
%%timeit
np.dot(df, df.transpose())/df.shape[1]


1 loop, best of 3: 154 ms per loop

In [32]:
%%timeit
np.cov(df)


1 loop, best of 3: 354 ms per loop