In [1]:
!pip install plinkio
##Load data:
import os
import re
import numpy as np
import pandas as pd
from plinkio import plinkfile
import time
import sklearn.covariance as Cov
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from statsmodels.discrete.discrete_model import Probit
#from scipy.linalg.blas import dsyrk
#--can't find a way to get this working. Perhaps blas routines are missing.
data_path = '/home/jovyan/work/LEAP/leap/regression/dataset1'
os.chdir(data_path)
In [ ]:
"""
author: gene burinskiy
Goal:
Finding a set of individuals who are related to other individuals in the study.
LEAP employs a greedy algorithm to find a small subset of such individuals,
such that after their exclusion, there are no related individuals in the study.
These individuals are excluded from the analysis in stages 3 and 4 below,
but after fitting a model in stage 4, their liabilities are estimated along with
other indviduals. All individuals are considered in the GWAS stage (stage 5).
source for algorithm and methods. References are cited in functions.
https://github.com/omerwe/LEAP/blob/master/leap/regression/Leap_example.ipynb
I, in some cases, heavily modify or write new code to perform the tasks in the
Goal.
NOTES: this is an exploration file and includes code that would otherwise be redundant.
"""
In [2]:
##Load data:
bed = plinkfile.open("dataset1")
loci = bed.get_loci()
print("Length of locuses", len(loci))
chromosomes = np.unique([x.chromosome for x in loci])
print("# of chromosomes in data:",chromosomes)
samples = bed.get_samples()
print("Number of individuals in data:", len(samples))
In [4]:
##Place data into a dataframe:
mat = np.zeros((len(loci),len(samples)), dtype='int16') #int16 takes 1/4 of the space of the default 64-bit dtype
##don't know a faster method of extracting the data from the bed file.
for i, row in enumerate(bed):
    mat[i,:] = np.array([snp for snp in row])
#this matrix is equivalent to transposed bed.val
print("Data type:", mat.dtype)
print("Size of bed matrix: %4.0fmb\n" %(mat.nbytes/(1024**2)))
#create a multi-indexed column space
tuples = [(x.chromosome,x.name) for x in loci]
ml_index = pd.MultiIndex.from_tuples(tuples, names = ['chromosome', 'snp'])
df = pd.DataFrame(mat.transpose(), columns=ml_index, index = [x.iid for x in bed.get_samples()])
df.info()
df.iloc[:5,:5]
Out[4]:
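In [ ]:
## Quick check of the multi-indexed columns (my addition): selecting by a
## top-level key pulls out all SNP columns of one chromosome, and a list of
## keys pulls out several chromosomes at once. The per-chromosome loop below
## relies on this to drop the current chromosome from the kinship computation.
first_chrom = chromosomes[0]
print("SNP columns on chromosome %s:" % first_chrom, df[first_chrom].shape[1])
other_chroms = [c for c in chromosomes if c != first_chrom]
print("SNP columns on the remaining chromosomes:", df[other_chroms].shape[1])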
In [5]:
##compute covariance matrix between individuals, remove those who are too close to each other.
#the LEAP code uses dsyrk, which halves the computational time. Alas, we can't get it to load here (a dsyrk sketch follows this cell).
df = df.astype('float64')-df.astype('float64').mean()
df.info()
dot_cov = np.dot(df, df.transpose())/(df.shape[1]-1) #having difficulties with scipy's linalg module
#note that the above takes more than half the time of np.cov
print("\nCovariance shape:" , dot_cov.shape)
print("Covariance memory usage in mb:", dot_cov.nbytes/(1024**2))
dot_cov[:5,:5]
Out[5]:
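In [ ]:
## dsyrk sketch (my addition): how a single BLAS syrk call could compute one
## triangle of X X^T, roughly halving the work of the full np.dot above. This
## assumes scipy.linalg.blas.dsyrk imports cleanly, which it did not in this
## environment (see the commented-out import in the first cell).
try:
    from scipy.linalg.blas import dsyrk
    A = np.asarray(df, dtype=np.float64)
    #dsyrk(alpha, a) fills the upper triangle of alpha * A A^T; the other triangle stays zero
    C_upper = dsyrk(1.0/(A.shape[1]-1), A)
    #mirror the upper triangle to recover the full symmetric covariance
    syrk_cov = np.triu(C_upper) + np.triu(C_upper, 1).T
    print("max |dsyrk cov - dot_cov|:", np.abs(syrk_cov - dot_cov).max())
except ImportError:
    print("scipy.linalg.blas.dsyrk is not importable in this environment")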
In [5]:
##trying to use a shrinkage estimator for the covariance matrix. Inspired by the computeCovar
##function in https://github.com/omerwe/LEAP/blob/master/leap/leapUtils.py
oa_fit = Cov.OAS().fit(df.transpose())
lw_fit = Cov.LedoitWolf().fit(df.transpose())
In [37]:
cutoff = .05
oa_cov = oa_fit.covariance_
bool_arr = np.tril(oa_cov, k=-1)>cutoff
y_idx,_ = np.where(bool_arr)
print("Under Oracle shrinkage:")
print("shape of y:",np.unique( y_idx).shape)
print(np.unique(y_idx))
In [38]:
np_cov = np.cov(df, ddof=0)
bool_arr = np.tril(np_cov, k=-1)>cutoff
y_idx,_ = np.where(bool_arr)
print("Under numpy cov:")
print("shape of y:", np.unique(y_idx).shape)
print(np.unique(y_idx))
In [41]:
lw_cov = lw_fit.covariance_
bool_arr = np.tril(lw_cov, k=-1)>cutoff
y_idx,_ = np.where(bool_arr)
print("Under Ledoit-Wolf estimate")
print("shape of y:", np.unique(y_idx).shape)
print(np.unique(y_idx))
In [50]:
np_s, np_U = np.linalg.eigh(np_cov, 'L')
oa_s,oa_U = np.linalg.eigh(oa_cov, 'L')
lw_s, lw_U = np.linalg.eigh(lw_cov, 'L')
dot_s, dot_U = np.linalg.eigh(dot_cov, 'L')
In [51]:
print(np_s[:3])
print(oa_s[-3:])
print(lw_s[:3])
print(dot_s[:3])
In [6]:
cutoff = .05
bool_arr = np.tril(dot_cov, k=-1)>cutoff
y_idx,_ = np.where(bool_arr)
print("shape of y:", y_idx.shape)
print("\nremoving %d individuals" %y_idx.shape[0])
#note: the original LEAP code marks 54 individuals while this simple cutoff removes 56, so something doesn't line up (see the greedy sketch after this cell).
indxToKeep = set(range(dot_cov.shape[0]))
print("Original num of ppl:", len(indxToKeep))
indxToKeep.difference_update(np.unique(y_idx))
keepArr = np.array(list(indxToKeep))
print("num of kept ppl:", keepArr.shape[0])
In [10]:
#exploring different ways to exclude individuals found above.
cov_m = np.ma.array(dot_cov, mask=False)
cov_m.mask[y_idx,:] = True
cov_m.mask[:,y_idx] = True
print(cov_m.sum())
cov_c = np.delete(np.delete(dot_cov, y_idx, axis=0), y_idx, axis=1)
print(cov_c.sum())
In [7]:
##Our calc_h2 function for Step 3
#uses the calc_h2.calc_h2 functions
from sklearn.linear_model import LogisticRegression
from scipy import stats
#read in phenofile:
phenos = pd.read_csv("dataset1.phe", sep=' ', header=None, engine='c')
phenos.columns = ['fam', 'person', 'pheno']
phenos.set_index(keys = 'person', inplace=True)
phenos.iloc[:5,:5]
Out[7]:
In [8]:
def calcLiabThresholds_3xx(U,s, keepArr, phe, numRemovePCs=10, prevalence = .001, covar=None):
"""
INPUTS:
1. U - left eigenvectors of covariance matrix (ie kinship matrix)
2. S - eigenvalues of covariance matrix (ie kinship matrix)
3. keepArr - np.array of indexes that exclude highly related individuals
4. phe - np.array of phenotypes (binary only)
    5. covar - covariates; specified in the original functions but unused here (defaults to None)
OUTPUT:
1. probs - probability estimates from a regularized logistic regression
    2. thresholds - per-individual liability thresholds (I believe: inverse-normal transforms of the adjusted case probabilities)
NOTES:
original code can be found on:
https://github.com/omerwe/LEAP/blob/master/leap/calc_h2.py
"""
##------------------------------------ CalcLiabThreshold -----------------------------------
##probs, thresholds = calcLiabThreholds(U, S, keepArr, phe, numRemovePCs, prevalence, covar)
#This is equivalent to an SVD decomposition; note their covar parameter is defaulted to None
G = U[:, -numRemovePCs:] * np.sqrt(s[-numRemovePCs:])
#perform a regularized logistic regression. I trust their parameter settings.
Logreg = LogisticRegression(penalty='l2', C=500000, fit_intercept=True)
Logreg.fit(G[keepArr, :], phe.iloc[keepArr])
#Compute individual thresholds
probs = Logreg.predict_proba(G)[:,1]
#Compute thresholds
prev = prevalence
P = np.sum(phe==1) / float(phe.shape[0])
    #K = prev -- the original code aliases prevalence to K for no obvious reason; prev is used directly here
    Ki = prev*(1-P) / (P*(1-prev)) * probs / (1 + prev*(1-P) / (P*(1-prev))*probs - probs)
thresholds = stats.norm(0,1).isf(Ki)
thresholds[Ki>=1.] = -999999999
thresholds[Ki<=0.] = 999999999
return([probs, thresholds])
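In [ ]:
## Quick numerical check (my addition) of the SVD/eigendecomposition equivalence
## noted in calcLiabThresholds_3xx and again in the main loop below: if
## X = U_svd S Vt, then eigh(X X^T) returns eigenvectors equal to U_svd (up to
## sign) and eigenvalues equal to S**2, so the top columns of U scaled by
## sqrt(s) reproduce the leading principal-component scores.
rng = np.random.RandomState(0)
X_small = rng.normal(size=(20, 50))
U_svd, S_svd, Vt = np.linalg.svd(X_small, full_matrices=False)
s_eig, U_eig = np.linalg.eigh(X_small.dot(X_small.T))   #ascending eigenvalues
print("eigenvalues match squared singular values:", np.allclose(s_eig[::-1], S_svd**2))
#PC scores via the eigen route vs. the SVD route (column signs may differ)
G_eig = U_eig[:, -3:] * np.sqrt(s_eig[-3:])
G_svd = U_svd[:, :3] * S_svd[:3]
print("top-3 scores match up to sign:", np.allclose(np.abs(G_eig[:, ::-1]), np.abs(G_svd)))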
In [9]:
def calcH2Binary(XXT_o, phe_o, probs_o, thresholds_o, keepArr_o, prev, h2coeff):
"""
INPUT:
1. XXT - covariance matrix (kinship matrix) * number of snps
2. phe - np.array of phenotypes. In our case, they're binary.
3. probs - np.array of probabilities
    4. thresholds - np.array of per-individual liability thresholds (from calcLiabThresholds_3xx)
    5. keepArr - np.array of indexes that exclude highly related individuals.
    6. prev - prevalence
    7. h2coeff - correction coefficient; the LEAP authors use 1.0 for synthetic data and 0.875 otherwise
NOTES:
Many items have been removed for sake of more compact code. Namely, the actions if
thresholds is None.
Original code can be found on:
https://github.com/omerwe/LEAP/blob/master/leap/calc_h2.py
"""
K = prev
P = np.sum(phe_o>0) / float(phe_o.shape[0])
    #index out individuals we do not want. In order to avoid reassigning the inputs,
    #I assign the subsets to new local names (note that fancy indexing returns copies, not views).
XXT = XXT_o[np.ix_(keepArr, keepArr)]
    phe = phe_o.iloc[keepArr]
probs = probs_o[keepArr]
thresholds = thresholds_o[keepArr]
Ki = K*(1-P) / (P*(1-K)) * probs / (1 + K*(1-P) / (P*(1-K))*probs - probs)
phit = stats.norm(0,1).pdf(thresholds)
probsInvOuter = np.outer(probs*(1-probs), probs*(1-probs))
y = np.outer(phe-probs, phe-probs) / np.sqrt(probsInvOuter)
    sumProbs = probs[:, np.newaxis] + probs[np.newaxis, :]   #pairwise sums probs[i] + probs[j]
Atag0 = np.outer(phit, phit) * (1 - (sumProbs)*(P-K)/(P*(1-K)) + np.outer(probs, probs)*(((P-K)/(P*(1-K)))**2)) / np.sqrt(probsInvOuter)
B0 = np.outer(Ki + (1-Ki)*(K*(1-P))/(P*(1-K)), Ki + (1-Ki)*(K*(1-P))/(P*(1-K)))
x = (Atag0 / B0 * h2coeff) * XXT
y = y[np.triu_indices(y.shape[0], 1)]
x = x[np.triu_indices(x.shape[0], 1)]
slope, intercept, rValue, pValue, stdErr = stats.linregress(x,y)
return slope
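In [ ]:
## Toy illustration (my addition) of the regression idea behind calcH2Binary:
## regress pairwise phenotype products on pairwise kinship entries; the slope
## estimates h2 (Haseman-Elston style). A continuous phenotype is simulated here
## for simplicity; calcH2Binary layers the case-control/threshold corrections on
## top of this idea. All names and sizes below are made up for the toy example.
rng = np.random.RandomState(0)
n_toy, m_toy = 500, 2000
X_toy = rng.binomial(2, 0.3, size=(n_toy, m_toy)).astype(float)
X_toy -= X_toy.mean(axis=0)
X_toy /= X_toy.std(axis=0)
K_toy = X_toy.dot(X_toy.T) / m_toy                 #kinship estimate
h2_true = 0.5
g_toy = X_toy.dot(rng.normal(scale=np.sqrt(h2_true/m_toy), size=m_toy))
y_toy = g_toy + rng.normal(scale=np.sqrt(1-h2_true), size=n_toy)
y_toy = (y_toy - y_toy.mean()) / y_toy.std()
pair_prod = np.outer(y_toy, y_toy)[np.triu_indices(n_toy, 1)]
pair_kin = K_toy[np.triu_indices(n_toy, 1)]
slope, intercept, rValue, pValue, stdErr = stats.linregress(pair_kin, pair_prod)
print("toy h2 estimate: %.3f (true value %.2f)" % (slope, h2_true))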
In [10]:
import numpy as np
import sklearn.linear_model
import scipy.optimize as opt
def evalProbitReg(beta, X, cases, controls, thresholds, invRegParam, normPDF, h2):
"""
NOTES: not much to do here as everything is in numpy.
"""
XBeta = np.ravel(X.dot(beta)) - thresholds
phiXBeta = normPDF.pdf(XBeta)
PhiXBeta = normPDF.cdf(XBeta)
logLik = np.sum(np.log(PhiXBeta[cases])) + np.sum(np.log(1-PhiXBeta[controls]))
w = np.zeros(X.shape[0])
w[cases] = -phiXBeta[cases] / PhiXBeta[cases]
w[controls] = phiXBeta[controls] / (1-PhiXBeta[controls])
grad = X.T.dot(w)
#regularize
logLik -= 0.5*invRegParam * beta.dot(beta) #regularization
grad += invRegParam * beta
return (-logLik, grad)
def probitRegHessian(beta, X, cases, controls, thresholds, invRegParam, normPDF, h2):
"""
NOTES: not much to do here as everything is in numpy. Though, I precalculated
PhiXBeta and then subset that because it was originally done for each subset. It is, trivially,
faster to precompute the element-wise squaring and then subset.
"""
XBeta = np.ravel(X.dot(beta)) - thresholds
phiXBeta = normPDF.pdf(XBeta)
PhiXBeta = normPDF.cdf(XBeta)
XbetaScaled = XBeta #/(1-h2)
#PhiXBeta2 = np.square(PhiXBeta)
R = np.zeros(X.shape[0])
R[cases] = (XbetaScaled[cases]*PhiXBeta[cases] + phiXBeta[cases]) / PhiXBeta[cases]**2
R[controls] = (-XbetaScaled[controls]*(1-PhiXBeta[controls]) + phiXBeta[controls]) / (1 - PhiXBeta[controls])**2
R *= phiXBeta
H = (X.T * R).dot(X)
H += invRegParam
return H
def probitRegression(X, y, thresholds, numSNPs, numFixedFeatures, h2, useHess, maxFixedIters, epsilon, nofail):
"""
NOTE: If I had more time, I would probably use PyMC3 for this ... eventually. For now, just removed superfluous
parts.
    0. print statement parentheses added.
1. Num of Fixed effects = 0 => delete fixed effect estimation code.
"""
regParam = h2 / float(numSNPs)
Linreg = sklearn.linear_model.Ridge(alpha=1.0/(2*regParam), fit_intercept=False, normalize=False, solver='lsqr')
Linreg.fit(X, y)
initBeta = Linreg.coef_
np.random.seed(1234)
normPDF = stats.norm(0, np.sqrt(1-h2))
invRegParam = 1.0/regParam
controls = (y==0)
cases = (y==1)
funcToSolve = evalProbitReg
hess =(probitRegHessian if useHess else None)
jac= True
method = 'Newton-CG'
args = (X, cases, controls, thresholds, invRegParam, normPDF, h2)
print('Beginning Probit regression...')
t0 = time.time()
optObj = opt.minimize(funcToSolve, x0=initBeta, args=args, jac=jac, method=method, hess=hess)
print('Done in', '%0.2f'%(time.time()-t0), 'seconds')
if (not optObj.success):
print('Optimization status:', optObj.status)
print(optObj.message)
if (nofail == 0): raise Exception('Probit regression failed with message: ' + optObj.message)
beta = optObj.x
return beta
def probit_3xx(df, phe, h2, prev, U,s, keepArr, thresholds=None, covar=None, nofail=0, outFile = None,
numSkipTopPCs=10, mineig=1e-3, hess=1, recenter=1, maxFixedIters=1e2, epsilon=1e-3, treatFixedAsRandom=False):
"""
INPUT:
1. df - pandas data frame of normalized snp values. df excludes current chromosome.
2. ph - pandas data frame of phenotypes
    4. h2 - heritability estimate (the slope returned by calcH2Binary)
5. prev - prevalence
6. U,S - left eigenvectors and eigenvalues.
7. thresholds - calculated thresholds.
Modifications:
1. No longer read in the bed, phenotype file,
2. no longer set binary phenotype cases.
3. get U and s directly, not from eigen dictionary
5. removed covar statement.
    6. commented out the saving of the output file, as the current approach seems slow and hits the disk.
       There are faster ways of saving these things, but there is no time to fix that for now. Also, just for
       clarity, rewrote the code to adhere to modern Python practices. Finally, that section does not
       seem to pertain to the output.
7. They use "structure" instead of dictionary -> C world.
8. Instead of a dictionary, I return a Pandas DataFrame ... because we <3 Pandas. Also, header was set to
None, so we remove it as output. Finally, we return directly, don't save it to LiabStructure or
liab_df in our case.
9. Replaced a few probitRegression inputs to adhere to our data structures.
Default parameters set from the argparse section in the original code. Original code can be found
in:
https://github.com/omerwe/LEAP/blob/master/leap/probit.py
"""
ncols = df.shape[1]
#run probit regression
t = stats.norm(0,1).isf(prev)
if (thresholds is not None): t = thresholds
S = np.sqrt(s*ncols)
goodS = (S>mineig)
if (numSkipTopPCs > 0): goodS[-numSkipTopPCs:] = False
if (np.sum(~goodS) > 0): print('\t\tRemoving', np.sum(~goodS), 'PCs with low variance')
G = U[:, goodS]*S[goodS]
#Recenter G to only consider the unrelated individuals
if recenter: G -= np.mean(G[keepArr, :], axis=0)
else: G -= np.mean(G, axis=0)
#Run Probit regression
numFixedFeatures = 0
probitThresh = (t if thresholds is None else t[keepArr])
#I believe bed.sid.shape is the sid_count. In our data, that is sid_count = df.shape[1]
    beta = probitRegression(G[keepArr, :], phe.iloc[keepArr], probitThresh, ncols, numFixedFeatures, h2, hess, maxFixedIters, epsilon, nofail)
    #predict liabilities for all individuals
meanLiab = G.dot(beta)
liab = meanLiab.copy()
indsToFlip = np.logical_or( np.logical_and((liab <= t), (phe>0.5)), np.logical_and((liab > t),(phe<0.5)))
liab[indsToFlip] = stats.norm(0,1).isf(prev)
#Return phenotype dictionary with liabilities
return pd.DataFrame( {'vals':liab,'person':df.index})
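In [ ]:
## Sanity check (my addition, not part of LEAP): compare the analytic gradient
## returned by evalProbitReg against a numerical gradient on tiny random inputs.
## scipy's check_grad wants separate objective and gradient callables, so the
## (negative log-likelihood, gradient) tuple is wrapped below. All sizes and
## names here are made up for the check only.
rng = np.random.RandomState(0)
n_chk, k_chk = 40, 5
X_chk = rng.normal(size=(n_chk, k_chk))
y_chk = rng.binomial(1, 0.5, size=n_chk)
cases_chk, controls_chk = (y_chk == 1), (y_chk == 0)
thr_chk = np.zeros(n_chk)
h2_chk = 0.25
args_chk = (X_chk, cases_chk, controls_chk, thr_chk, 1.0, stats.norm(0, np.sqrt(1-h2_chk)), h2_chk)
obj = lambda b: evalProbitReg(b, *args_chk)[0]
grad = lambda b: evalProbitReg(b, *args_chk)[1]
print("gradient check error: %.2e" % opt.check_grad(obj, grad, np.zeros(k_chk)))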
In [11]:
#With the multi-indexed columns we select all SNPs of the non-current chromosomes by their top-level keys.
#Note: selecting a list of column labels copies the data in pandas, but the selection itself stays concise.
prevalence = .001
numRemovePCs = 10
#chrom = chromosomes[2]
for chrom in chromosomes[:3]:
print("Working on chromosome: %s" %chrom)
exclude_chrom = set(chromosomes)
exclude_chrom.remove(chrom) #set all chromosomes except current
exclude_chrom = list(exclude_chrom)
t0 = time.time()
#Note that the original code puts cov, s, U into a dictionary called "eigen"
#They do not actually perform an SVD decomposition. Instead, they compute
#the covariance matrix, decompose that and use an equivalence relation between
#SVD and the decomposition of the covariance matrix.
#XXT = np.dot(df[exclude_chrom], df[exclude_chrom].transpose())
#s,U = np.linalg.eigh(XXT, 'L') #would use scipy except -again- can't get it to load.
XXT = Cov.OAS().fit(df[exclude_chrom].transpose()).covariance_
s,U = np.linalg.eigh(XXT, 'L')
#calc_h2 function
if numRemovePCs>0:
t_XXT = XXT - (U[:,-numRemovePCs:]*s[-numRemovePCs:]).dot(U[:,-numRemovePCs:].transpose())
pheUnique = np.unique(phenos.pheno)
isCaseControl = pheUnique.shape[0] == 2 #trivial condition for us
    if not np.all(pheUnique == np.array([0,1])):
        pheMean = phenos.pheno.mean()
        phenos.loc[phenos.pheno <= pheMean, 'pheno'] = 0
        phenos.loc[phenos.pheno > pheMean, 'pheno'] = 1
probs, thresholds= calcLiabThresholds_3xx(U,s, keepArr, phenos.pheno)
#h2coeff = .875 but set to 1.0 for synthetic data.
h2 = calcH2Binary(t_XXT, phenos.pheno, probs, thresholds, keepArr, prev=prevalence, h2coeff=1.0)
print("\th2 values: %f" %h2)
liabs = probit_3xx(df, phenos.pheno, h2, prevalence, U,s, keepArr)
print("\t Took %.2f seconds" %(time.time()-t0))
In [ ]:
In [33]:
%%timeit
np.dot(df, df.transpose())/df.shape[1]
In [32]:
%%timeit
np.cov(df)