Implementation of the LEAP algorithm
In [1]:
!pip install plinkio
import os
import re
import numpy as np
import pandas as pd
from plinkio import plinkfile
import time
#from scipy.linalg.blas import dsyrk
#--can't find a way to get this working; perhaps the BLAS routines are missing. A possible call is sketched below.
data_path = 'dataset1'
os.chdir(data_path)
In [2]:
##Load data:
bed = plinkfile.open("dataset1")
loci = bed.get_loci()
print("Length of locuses", len(loci))
chromosomes = np.unique([x.chromosome for x in loci])
print("# of chromosomes in data:",chromosomes)
samples = bed.get_samples()
print("Number of individuals in data:", len(samples))
In [3]:
##Place data into a dataframe:
mat = np.zeros((len(loci), len(samples)), dtype='int16') #int16 takes a quarter of the space of the default int64
##don't know a faster method of extracting the data from the bed file.
for i, row in enumerate(bed):
    mat[i, :] = np.array([snp for snp in row])
#this matrix is equivalent to transposed bed.val
print("Data type:", mat.dtype)
print("Size of bed matrix: %4.0fmb\n" %(mat.nbytes/(1024**2)))
#create a multi-indexed column space
tuples = [(x.chromosome,x.name) for x in loci]
ml_index = pd.MultiIndex.from_tuples(tuples, names = ['chromosome', 'snp'])
df = pd.DataFrame(mat.transpose(), columns=ml_index, index = [x.iid for x in bed.get_samples()])
df.info()
df.iloc[:5,:5]
Out[3]:
$1$. Find and exclude related individuals (kinship coeff > 0.05)
In [4]:
##compute covariance matrix between individuals, remove those who are too close to each other.
#the LEAP code uses dsyrk, which halves the computation time. Alas, we can't use it here.
df = df.astype('float32')
df -= df.mean() #center each SNP
print(df.iloc[:5,:5])
df.info() #roughly doubled memory usage though still not the 80mb it was earlier
cov = np.dot(df, df.transpose())/df.shape[1] #having difficulties with scipy's linalg module
#note that the above takes more than half the time of np.cov
print("\nCovariance shape:" , cov.shape)
print("Covariance memory usage in mb:", cov.nbytes/(1024**2))
cov[:5,:5]
Out[4]:
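The dsyrk route mentioned above would look roughly like the following (an untested sketch: scipy.linalg.blas.dsyrk computes only one triangle of the symmetric product, which is where the factor-of-two saving comes from).
In [ ]:
#Hedged sketch: compute the kinship via the symmetric rank-k BLAS routine.
#dsyrk fills only one triangle; the other triangle of the returned array stays zero.
from scipy.linalg.blas import dsyrk
A = df.values.astype('float64')                          #dsyrk is the double-precision variant
half = dsyrk(alpha=1.0/A.shape[1], a=A)                  #one triangle of A.dot(A.T)/m
cov_syrk = half + half.T - np.diag(half.diagonal())      #mirror into a full symmetric matrix
print(np.allclose(cov_syrk, cov))                        #should agree with the np.dot result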
In [5]:
cutoff = .05
bool_arr = np.tril(cov, k=-1)>cutoff
y_idx,_ = np.where(bool_arr)
print("shape of y:", y_idx.shape)
print("\nremoving %d individuals" %y_idx.shape[0])
#note: the LEAP authors mark 54 individuals; we mark more pairs yet effectively remove only 47 people. Something doesn't line up.
keepArr = np.setdiff1d(np.arange(cov.shape[0]), np.unique(y_idx))
keepArr.shape
Out[5]:
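As a quick check (a sketch, not in the original run): every violating pair had its larger index marked, so no pair among the kept individuals should exceed the cutoff.
In [ ]:
#Sanity check: the kept submatrix should have no off-diagonal entry above the cutoff.
sub = cov[np.ix_(keepArr, keepArr)]
print(np.tril(sub, k=-1).max(), "<=", cutoff)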
In [6]:
# Keep nonrelated individuals
df = df.iloc[keepArr] #.ix is deprecated; index positionally
df.shape
Out[6]:
$2$. Compute an eigendecomposition of kinship matrix
In [7]:
import scipy.linalg as la
def eigendecomp(cov):
    s, U = la.eigh(cov)
    s[s < 0] = 0                 #clip tiny negative eigenvalues caused by numerical error
    ind = np.argsort(s)
    ind = ind[s[ind] > 1e-12]    #drop effectively-zero eigenvalues
    U = U[:, ind]
    s = s[ind]
    return s, U
In [8]:
s, U = eigendecomp(cov) #store the factors for the later steps
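A quick reconstruction check (a sketch, assuming only negligible eigenvalues were dropped): multiplying the factors back together should recover the covariance matrix.
In [ ]:
#U * s broadcasts s across the columns, i.e. U.dot(np.diag(s)).dot(U.T) without forming diag(s)
recon = (U * s).dot(U.T)
print("Max reconstruction error:", np.abs(recon - cov).max())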
$3$. Compute heritability (h2) using the method of Golan et al.
In [9]:
from sklearn.linear_model import LogisticRegression
from scipy import stats
#read in phenofile:
phenos = pd.read_csv("dataset1.phe", sep=' ', header=None, engine='c')
phenos.columns = ['fam', 'person', 'pheno']
phenos.set_index(keys = 'person', inplace=True)
phenos.iloc[:5,:5]
Out[9]:
In [10]:
def calcH2Binary(XXT_o, phe_o, probs_o, thresholds_o, keepArr_o, prev, h2coeff):
    """
    INPUT:
    1. XXT - covariance matrix (kinship matrix)
    2. phe - np.array of phenotypes. In our case, they're binary.
    3. probs - np.array of case probabilities
    4. thresholds - np.array of per-individual liability thresholds
    5. keepArr - np.array of indexes that exclude highly related individuals.
    6. prev - disease prevalence
    7. h2coeff - scaling coefficient; the LEAP code sets it to 1.0
    NOTES:
    Several branches have been removed for the sake of more compact code, namely the
    actions taken when thresholds is None.
    Original code can be found at:
    https://github.com/omerwe/LEAP/blob/master/leap/calc_h2.py
    """
    K = prev
    P = np.sum(phe_o>0) / float(phe_o.shape[0])
    #index out the individuals we do not want. To avoid reassigning the inputs,
    #I bind the subset views to new, unsuffixed names.
    XXT = XXT_o[np.ix_(keepArr_o, keepArr_o)]
    phe = phe_o[keepArr_o]
    probs = probs_o[keepArr_o]
    thresholds = thresholds_o[keepArr_o]
    Ki = K*(1-P) / (P*(1-K)) * probs / (1 + K*(1-P) / (P*(1-K))*probs - probs)
    phit = stats.norm(0,1).pdf(thresholds)
    probsInvOuter = np.outer(probs*(1-probs), probs*(1-probs))
    y = np.outer(phe-probs, phe-probs) / np.sqrt(probsInvOuter)
    sumProbs = np.tile(np.column_stack(probs).T, (1,probs.shape[0])) + np.tile(probs, (probs.shape[0], 1))
    Atag0 = np.outer(phit, phit) * (1 - (sumProbs)*(P-K)/(P*(1-K)) + np.outer(probs, probs)*(((P-K)/(P*(1-K)))**2)) / np.sqrt(probsInvOuter)
    B0 = np.outer(Ki + (1-Ki)*(K*(1-P))/(P*(1-K)), Ki + (1-Ki)*(K*(1-P))/(P*(1-K)))
    x = (Atag0 / B0 * h2coeff) * XXT
    #regress the pairwise phenotype products on the scaled kinship entries (upper triangle only)
    y = y[np.triu_indices(y.shape[0], 1)]
    x = x[np.triu_indices(x.shape[0], 1)]
    slope, intercept, rValue, pValue, stdErr = stats.linregress(x, y)
    return slope
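In short, this is the moment estimator of Golan et al. (PCGC regression): for each pair $(i,j)$, the standardized product of phenotype residuals $y_{ij} = \frac{(\phi_i - p_i)(\phi_j - p_j)}{\sqrt{p_i(1-p_i)\,p_j(1-p_j)}}$ (where $\phi_i$ is the 0/1 phenotype and $p_i$ the fitted case probability) is regressed on the correspondingly rescaled kinship entry $x_{ij}$, and the slope of that regression is the $h^2$ estimate.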
In [11]:
def calcLiabThresholds_3xx(U, s, keepArr, phe, numRemovePCs=10, prevalence=.001, covar=None):
    """
    INPUTS:
    1. U - eigenvectors of the covariance matrix (i.e. kinship matrix)
    2. s - eigenvalues of the covariance matrix (i.e. kinship matrix)
    3. keepArr - np.array of indexes that exclude highly related individuals
    4. phe - phenotypes (binary only)
    5. covar - covariates; present in the authors' signature but unused here (defaults to None)
    OUTPUT:
    1. probs - probability estimates from a regularized logistic regression
    2. thresholds - per-individual liability thresholds
    NOTES:
    original code can be found at:
    https://github.com/omerwe/LEAP/blob/master/leap/calc_h2.py
    """
    #another part of the calc_h2 function
    prev = prevalence
    #numRemovePCs=10 is their default value; as far as I'm aware, they never pass a different one
    if numRemovePCs > 0:
        #remove the top PCs from the kinship matrix (uses the global cov); in the original
        #LEAP code this adjusted matrix is the XXT that gets passed to calcH2Binary
        t_cov = cov - (U[:,-numRemovePCs:]*s[-numRemovePCs:]).dot(U[:,-numRemovePCs:].transpose())
    pheUnique = np.unique(phe)
    isCaseControl = pheUnique.shape[0] == 2 #trivially true for our data
    if not np.all(pheUnique == np.array([0,1])):
        pheMean = phe.mean()
        phe[phe <= pheMean] = 0
        phe[phe > pheMean] = 1
    #probs, thresholds = calcLiabThreholds(U, S, keepArr, phe, numRemovePCs, prevalence, covar)
    #This is equivalent to an SVD; note the authors' covar parameter defaults to None
    G = U[:, -numRemovePCs:] * np.sqrt(s[-numRemovePCs:])
    #perform a regularized logistic regression. I trust their parameter settings.
    Logreg = LogisticRegression(penalty='l2', C=500000, fit_intercept=True)
    Logreg.fit(G[keepArr, :], phe.iloc[keepArr])
    #Compute individual probabilities
    probs = Logreg.predict_proba(G)[:,1]
    #Compute thresholds
    P = np.sum(phe==1) / float(phe.shape[0])
    #the original aliases K = prev before the next line; prev is simply inlined here
    Ki = prev*(1-P) / (P*(1-prev)) * probs / (1 + prev*(1-P) / (P*(1-prev))*probs - probs)
    thresholds = stats.norm(0,1).isf(Ki)
    thresholds[Ki >= 1.] = -999999999
    thresholds[Ki <= 0.] = 999999999
    return probs, thresholds
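Putting steps 2 and 3 together (a minimal sketch, not part of the original run: prev=0.001 follows the default above, the full cov is passed where the original uses the PC-adjusted matrix, and the phenotype is re-indexed positionally so that keepArr lines up):
In [ ]:
#Hedged end-to-end sketch for the heritability estimate.
phe = phenos['pheno'].reset_index(drop=True)   #positional index so keepArr indexes correctly
probs, thresholds = calcLiabThresholds_3xx(U, s, keepArr, phe, prevalence=0.001)
#(the original passes the PC-adjusted t_cov as the kinship here; cov is used for simplicity)
h2 = calcH2Binary(cov, phe.values, probs, thresholds, keepArr, prev=0.001, h2coeff=1.0)
print("Estimated h2:", h2)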
$4$. Estimate liabilities
In [12]:
import numpy as np
import sklearn.linear_model
import scipy.optimize as opt
# Adapted from the LEAP source code; kept as a quoted string and not executed here
'''
def evalProbitReg(beta, X, cases, controls, thresholds, invRegParam, normPDF, h2):
    """
    NOTES: not much to do here as everything is in numpy.
    """
    XBeta = np.ravel(X.dot(beta)) - thresholds
    phiXBeta = normPDF.pdf(XBeta)
    PhiXBeta = normPDF.cdf(XBeta)
    logLik = np.sum(np.log(PhiXBeta[cases])) + np.sum(np.log(1-PhiXBeta[controls]))
    w = np.zeros(X.shape[0])
    w[cases] = -phiXBeta[cases] / PhiXBeta[cases]
    w[controls] = phiXBeta[controls] / (1-PhiXBeta[controls])
    grad = X.T.dot(w)
    #regularize
    logLik -= 0.5*invRegParam * beta.dot(beta) #regularization
    grad += invRegParam * beta
    return [-logLik, grad]
def probitRegHessian(beta, X, cases, controls, thresholds, invRegParam, normPDF, h2):
    """
    NOTES: not much to do here as everything is in numpy. Though, I precalculate the
    squared terms once and then subset them, because the squaring was originally done
    separately for each subset.
    """
    XBeta = np.ravel(X.dot(beta)) - thresholds
    phiXBeta = normPDF.pdf(XBeta)
    PhiXBeta = normPDF.cdf(XBeta)
    XbetaScaled = XBeta #/(1-h2)
    PhiXBeta2 = np.square(PhiXBeta)          #Phi^2, precomputed once
    OneMinusPhi2 = np.square(1 - PhiXBeta)   #(1-Phi)^2, precomputed once
    R = np.zeros(X.shape[0])
    R[cases] = (XbetaScaled[cases]*PhiXBeta[cases] + phiXBeta[cases]) / PhiXBeta2[cases]
    R[controls] = (-XbetaScaled[controls]*(1-PhiXBeta[controls]) + phiXBeta[controls]) / OneMinusPhi2[controls]
    R *= phiXBeta
    H = (X.T * R).dot(X)
    H += invRegParam
    return H
def probitRegression(X, y, thresholds, numSNPs, numFixedFeatures, h2, useHess, maxFixedIters, epsilon, nofail):
    """
    If I had more time, I would probably use PyMC3 for this ... eventually. For now, I just
    removed superfluous parts. The loop in "Fit fixed effects" could also be cythonized -- for later.
    """
    regParam = h2 / float(numSNPs)
    Linreg = sklearn.linear_model.Ridge(alpha=1.0/(2*regParam), fit_intercept=False, normalize=False, solver='lsqr')
    Linreg.fit(X, y)
    initBeta = Linreg.coef_
    np.random.seed(1234)
    normPDF = stats.norm(0, np.sqrt(1-h2))
    invRegParam = 1.0/regParam
    controls = (y==0)
    cases = (y==1)
    funcToSolve = evalProbitReg
    hess = (probitRegHessian if useHess else None)
    jac = True
    method = 'Newton-CG'
    args = (X, cases, controls, thresholds, invRegParam, normPDF, h2)
    print('Beginning Probit regression...')
    t0 = time.time()
    optObj = opt.minimize(funcToSolve, x0=initBeta, args=args, jac=jac, method=method, hess=hess)
    print('Done in', '%0.2f'%(time.time()-t0), 'seconds')
    if (not optObj.success):
        print('Optimization status:', optObj.status)
        print(optObj.message)
        if (nofail == 0): raise Exception('Probit regression failed with message: ' + optObj.message)
    beta = optObj.x
    #Fit fixed effects
    if (numFixedFeatures > 0):
        thresholdsEM = np.zeros(X.shape[0]) + thresholds
        for i in range(maxFixedIters):
            print('Beginning fixed effects iteration', i+1)
            t0 = time.time()
            prevBeta = beta.copy()
            #Learn fixed effects
            thresholdsTemp = thresholdsEM - X[:, numFixedFeatures:].dot(beta[numFixedFeatures:])
            args = (X[:, :numFixedFeatures], cases, controls, thresholdsTemp, 0, normPDF, h2)
            optObj = opt.minimize(funcToSolve, x0=beta[:numFixedFeatures], args=args, jac=True, method=method, hess=hess)
            if (not optObj.success): print(optObj.message) #raise Exception('Learning failed with message: ' + optObj.message)
            beta[:numFixedFeatures] = optObj.x
            #Learn random effects
            thresholdsTemp = thresholdsEM - X[:, :numFixedFeatures].dot(beta[:numFixedFeatures])
            args = (X[:, numFixedFeatures:], cases, controls, thresholdsTemp, invRegParam, normPDF, h2)
            optObj = opt.minimize(funcToSolve, x0=beta[numFixedFeatures:], args=args, jac=True, method=method, hess=hess)
            if (not optObj.success): print(optObj.message) #raise Exception('Learning failed with message: ' + optObj.message)
            beta[numFixedFeatures:] = optObj.x
            diff = np.sqrt(np.mean((beta[:numFixedFeatures] - prevBeta[:numFixedFeatures])**2))
            print('Done in', '%0.2f'%(time.time()-t0), 'seconds')
            print('Diff:', '%0.4e'%diff)
            if (diff < epsilon): break
    return beta
def probit(bed, pheno, h2, prev, eigen, outFile, keepArr, thresholds, covar=None, nofail=0,
           numSkipTopPCs=10, mineig=1e-3, hess=1, recenter=1, maxFixedIters=100, epsilon=1e-3, treatFixedAsRandom=False):
    """
    This version no longer reads in the bed file.
    Default parameters are taken from the argparse section of the original code, which can be
    found at:
    https://github.com/omerwe/LEAP/blob/master/leap/probit.py
    """
    #Extract phenotype
    if isinstance(pheno, dict): phe = pheno['vals']
    else: phe = pheno
    if (len(phe.shape)==2):
        if (phe.shape[1]==1): phe=phe[:,0]
        else: raise Exception('More than one phenotype found')
    if (keepArr is None): keepArr = np.ones(phe.shape[0], dtype=bool)
    S = eigen['arr_1'] * bed.sid.shape[0]
    U = eigen['arr_0']
    S = np.sqrt(S)
    goodS = (S>mineig)
    if (numSkipTopPCs > 0): goodS[-numSkipTopPCs:] = False
    if (np.sum(~goodS) > 0): print('Removing', np.sum(~goodS), 'PCs with low variance')
    G = U[:, goodS]*S[goodS]
    #Set binary vector
    pheUnique = np.unique(phe)
    if (pheUnique.shape[0] != 2): raise Exception('phenotype file has more than two values')
    pheMean = phe.mean()
    cases = (phe>pheMean)
    phe[~cases] = 0
    phe[cases] = 1
    #run probit regression
    t = stats.norm(0,1).isf(prev)
    if (thresholds is not None): t = thresholds
    #Recenter G to only consider the unrelated individuals
    if recenter: G -= np.mean(G[keepArr, :], axis=0)
    else: G -= np.mean(G, axis=0)
    numFixedFeatures = 0
    if (covar is not None):
        covar -= covar.mean()
        covar /= covar.std()
        covar *= np.mean(np.std(G, axis=0))
        G = np.concatenate((covar, G), axis=1)
        if (not treatFixedAsRandom): numFixedFeatures += covar.shape[1]
    #Run Probit regression
    probitThresh = (t if thresholds is None else t[keepArr])
    beta = probitRegression(G[keepArr, :], phe[keepArr], probitThresh, bed.sid.shape[0], numFixedFeatures, h2, hess, maxFixedIters, epsilon, nofail)
    #Predict liabilities for all individuals
    meanLiab = G.dot(beta)
    liab = meanLiab.copy()
    indsToFlip = ((liab <= t) & (phe>0.5)) | ((liab > t) & (phe<0.5))
    liab[indsToFlip] = stats.norm(0,1).isf(prev)
    if (outFile is not None):
        #save liabilities
        f = open(outFile+'.liabs', 'w')
        for ind_i,[fid,iid] in enumerate(bed.iid): f.write(' '.join([fid, iid, '%0.3f'%liab[ind_i]]) + '\n')
        f.close()
        #save liabilities after regressing out the fixed effects
        if (numFixedFeatures > 0):
            liab_nofixed = liab - G[:, :numFixedFeatures].dot(beta[:numFixedFeatures])
            f = open(outFile+'.liab_nofixed', 'w')
            for ind_i,[fid,iid] in enumerate(bed.iid): f.write(' '.join([fid, iid, '%0.3f'%liab_nofixed[ind_i]]) + '\n')
            f.close()
            liab_nofixed2 = meanLiab - G[:, :numFixedFeatures].dot(beta[:numFixedFeatures])
            indsToFlip = ((liab_nofixed2 <= t) & (phe>0.5)) | ((liab_nofixed2 > t) & (phe<0.5))
            liab_nofixed2[indsToFlip] = stats.norm(0,1).isf(prev)
            f = open(outFile+'.liab_nofixed2', 'w')
            for ind_i,[fid,iid] in enumerate(bed.iid): f.write(' '.join([fid, iid, '%0.3f'%liab_nofixed2[ind_i]]) + '\n')
            f.close()
    #Return phenotype struct with liabilities
    liabsStruct = {
        'header':[None],
        'vals':liab,
        'iid':bed.iid
    }
    return liabsStruct
'''
Out[12]:
In [ ]:
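The quoted block above is kept only as a reference and is not executed in this notebook. As a rough illustration of the core idea (an assumption-laden sketch, not LEAP's full probit: it fits a plain probit regression of case status on the top 10 PCs with the population liability threshold as an offset, assuming a prevalence of 0.001 and none of LEAP's regularization or variance rescaling):
In [ ]:
#Hedged sketch of liability estimation via probit regression on the top PCs.
t = stats.norm(0, 1).isf(0.001)                 #liability threshold for prevalence 0.001
G_pc = U[:, -10:] * np.sqrt(s[-10:])            #top-10 PC features, as in step 3
y = phenos['pheno'].values
y = (y > y.mean()).astype(float)                #binarize case status, as the LEAP code does

def negloglik(beta):
    #probit log-likelihood of case status, offset by the threshold t
    z = G_pc[keepArr].dot(beta) - t
    return -(y[keepArr]*stats.norm.logcdf(z) + (1-y[keepArr])*stats.norm.logsf(z)).sum()

res = opt.minimize(negloglik, x0=np.zeros(G_pc.shape[1]))
liab_mean = G_pc.dot(res.x)                     #rough mean liabilities for all individuals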
$5$. Test for associations
In [13]:
# The paper uses the fastlmm.association.single_snp function,
# but fastlmm depends on Python 2.7, so we will use pylmm instead.
In [14]:
# Read in estimated liabilities
liabs = pd.read_csv("dataset1.phe.liab", sep=' ', header=None, engine='c')
liabs.columns = ['fam', 'person', 'liab']
liabs.set_index(keys = 'person', inplace=True)
liabs = liabs.iloc[keepArr] #.ix is deprecated; index positionally
liabs.iloc[:5,:5]
Out[14]:
In [30]:
# Merge liabilities with snps
snps_estliabs = pd.concat([liabs, df], axis = 1)
snps_estliabs.iloc[:5,:5]
Out[30]:
In [32]:
Y = snps_estliabs.iloc[:, 1]     #the 'liab' column
snps = snps_estliabs.iloc[:, 2:] #the SNP columns
In [31]:
! pip install git+https://github.com/nickFurlotte/pylmm
In [34]:
from pylmm import lmm
kin = cov[np.ix_(keepArr, keepArr)] #subset the kinship so its dimensions match Y and snps
TS, PS = lmm.GWAS(Y.values, snps.values, kin, REML=True, refit=True)
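To inspect the output (a sketch: TS and PS are pylmm's per-SNP test statistics and p-values, and pylmm may return None for SNPs it fails to fit):
In [ ]:
#Assemble per-SNP results; the index reuses the (chromosome, snp) MultiIndex built earlier.
results = pd.DataFrame({'tstat': TS, 'pvalue': PS}, index=snps.columns)
results.sort_values('pvalue').head(10)   #top hits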
Run through the LEAP pipeline for each chromosome (parts 2-5)
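A skeleton of that loop (a sketch under the assumption that steps 2-5 are wrapped into the functions above; LEAP excludes the tested chromosome when building the kinship matrix):
In [ ]:
#Hypothetical leave-one-chromosome-out driver, not part of the original run.
for chrom in chromosomes:
    loco = df.drop(chrom, axis=1, level='chromosome')   #all SNPs except the tested chromosome
    k_loco = np.dot(loco, loco.T) / loco.shape[1]       #kinship without that chromosome
    s_c, U_c = eigendecomp(k_loco)                      #step 2
    #steps 3-4: estimate probs/thresholds, h2, and probit liabilities with k_loco
    #step 5: run lmm.GWAS on the SNPs of `chrom` against those liabilities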
In [ ]: