Implementation of the LEAP algorithm
In [1]:
!pip install plinkio
import os
import re
import numpy as np
import pandas as pd
from plinkio import plinkfile
import time
#from scipy.linalg.blas import dsyrk
#--can't find a way to get this working; perhaps the BLAS routines are missing. A possible call is sketched below.
data_path = 'dataset1'
os.chdir(data_path)
In [2]:
##Load data:
bed = plinkfile.open("dataset1")
loci = bed.get_loci()
print("Length of locuses", len(loci))
chromosomes = np.unique([x.chromosome for x in loci])
print("# of chromosomes in data:",chromosomes)
samples = bed.get_samples()
print("Number of individuals in data:", len(samples))
In [3]:
##Place data into a dataframe:
mat = np.zeros((len(loci), len(samples)), dtype='int16') #int16 takes a quarter of the space of the default int64
##don't know a faster method of extracting the data from the bed file.
for i, row in enumerate(bed):
    mat[i, :] = np.array([snp for snp in row])
#this matrix is equivalent to transposed bed.val
print("Data type:", mat.dtype)
print("Size of bed matrix: %4.0fmb\n" %(mat.nbytes/(1024**2)))
#create a multi-indexed column space
tuples = [(x.chromosome,x.name) for x in loci]
ml_index = pd.MultiIndex.from_tuples(tuples, names = ['chromosome', 'snp'])
df = pd.DataFrame(mat.transpose(), columns=ml_index, index = [x.iid for x in bed.get_samples()])
df.info()
df.iloc[:5,:5]
Out[3]:
$1$. Find and exclude related individuals (kinship coeff > 0.05)
In [4]:
##compute covariance matrix between individuals, remove those who are too close to each other.
#the LEAP code uses dsyrk, which halves the computation time. Alas, we can't use it here.
df = df.astype('float32')
df -= df.mean() #center each SNP
print(df.iloc[:5,:5])
df.info() #roughly doubled memory usage though still not the 80mb it was earlier
cov = np.dot(df, df.transpose())/df.shape[1] #having difficulties with scipy's linalg module
#note that the above takes more than half the time of np.cov
print("\nCovariance shape:" , cov.shape)
print("Covariance memory usage in mb:", cov.nbytes/(1024**2))
cov[:5,:5]
Out[4]:
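The dsyrk route mentioned above would look roughly like the following (an untested sketch: scipy.linalg.blas.dsyrk computes only one triangle of the symmetric product, which is where the factor-of-two saving comes from).
In [ ]:
#Hedged sketch: compute the kinship via the symmetric rank-k BLAS routine.
#dsyrk fills only one triangle; the other triangle of the returned array stays zero.
from scipy.linalg.blas import dsyrk
A = df.values.astype('float64')                          #dsyrk is the double-precision variant
half = dsyrk(alpha=1.0/A.shape[1], a=A)                  #one triangle of A.dot(A.T)/m
cov_syrk = half + half.T - np.diag(half.diagonal())      #mirror into a full symmetric matrix
print(np.allclose(cov_syrk, cov))                        #should agree with the np.dot result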
In [5]:
cutoff = .05
bool_arr = np.tril(cov, k=-1)>cutoff
y_idx,_ = np.where(bool_arr)
print("shape of y:", y_idx.shape)
print("\nremoving %d individuals" %y_idx.shape[0])
#note: the LEAP authors mark 54 individuals; we mark more pairs yet effectively remove only 47 people. Something doesn't line up.
keepArr = np.setdiff1d(np.arange(cov.shape[0]), np.unique(y_idx))
keepArr.shape
Out[5]:
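As a quick check (a sketch, not in the original run): every violating pair had its larger index marked, so no pair among the kept individuals should exceed the cutoff.
In [ ]:
#Sanity check: the kept submatrix should have no off-diagonal entry above the cutoff.
sub = cov[np.ix_(keepArr, keepArr)]
print(np.tril(sub, k=-1).max(), "<=", cutoff)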
In [6]:
# Keep nonrelated individuals
df = df.iloc[keepArr] #.ix is deprecated; index positionally
df.shape
Out[6]:
$2$. Compute an eigendecomposition of kinship matrix
In [7]:
import scipy.linalg as la
def eigendecomp(cov):
    s, U = la.eigh(cov)
    s[s < 0] = 0                 #clip tiny negative eigenvalues caused by numerical error
    ind = np.argsort(s)
    ind = ind[s[ind] > 1e-12]    #drop effectively-zero eigenvalues
    U = U[:, ind]
    s = s[ind]
    return s, U
In [8]:
s, U = eigendecomp(cov) #store the factors for the later steps
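A quick reconstruction check (a sketch, assuming only negligible eigenvalues were dropped): multiplying the factors back together should recover the covariance matrix.
In [ ]:
#U * s broadcasts s across the columns, i.e. U.dot(np.diag(s)).dot(U.T) without forming diag(s)
recon = (U * s).dot(U.T)
print("Max reconstruction error:", np.abs(recon - cov).max())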
$3$. Compute heritability (h2) using the method of Golan et al.
In [9]:
from sklearn.linear_model import LogisticRegression
from scipy import stats
#read in phenofile:
phenos = pd.read_csv("dataset1.phe", sep=' ', header=None, engine='c')
phenos.columns = ['fam', 'person', 'pheno']
phenos.set_index(keys = 'person', inplace=True)
phenos.iloc[:5,:5]
Out[9]:
In [10]:
def calcH2Binary(XXT_o, phe_o, probs_o, thresholds_o, keepArr_o, prev, h2coeff):
    """
    INPUT:
    1. XXT - covariance matrix (kinship matrix)
    2. phe - np.array of phenotypes. In our case, they're binary.
    3. probs - np.array of case probabilities
    4. thresholds - np.array of per-individual liability thresholds
    5. keepArr - np.array of indexes that exclude highly related individuals.
    6. prev - disease prevalence
    7. h2coeff - scaling coefficient; the LEAP code sets it to 1.0
    NOTES:
    Several branches have been removed for the sake of more compact code, namely the
    actions taken when thresholds is None.
    Original code can be found at:
    https://github.com/omerwe/LEAP/blob/master/leap/calc_h2.py
    """
    K = prev
    P = np.sum(phe_o>0) / float(phe_o.shape[0])
    #index out the individuals we do not want. To avoid reassigning the inputs,
    #I bind the subset views to new, unsuffixed names.
    XXT = XXT_o[np.ix_(keepArr_o, keepArr_o)]
    phe = phe_o[keepArr_o]
    probs = probs_o[keepArr_o]
    thresholds = thresholds_o[keepArr_o]
    Ki = K*(1-P) / (P*(1-K)) * probs / (1 + K*(1-P) / (P*(1-K))*probs - probs)
    phit = stats.norm(0,1).pdf(thresholds)
    probsInvOuter = np.outer(probs*(1-probs), probs*(1-probs))
    y = np.outer(phe-probs, phe-probs) / np.sqrt(probsInvOuter)
    sumProbs = np.tile(np.column_stack(probs).T, (1,probs.shape[0])) + np.tile(probs, (probs.shape[0], 1))
    Atag0 = np.outer(phit, phit) * (1 - (sumProbs)*(P-K)/(P*(1-K)) + np.outer(probs, probs)*(((P-K)/(P*(1-K)))**2)) / np.sqrt(probsInvOuter)
    B0 = np.outer(Ki + (1-Ki)*(K*(1-P))/(P*(1-K)), Ki + (1-Ki)*(K*(1-P))/(P*(1-K)))
    x = (Atag0 / B0 * h2coeff) * XXT
    #regress the pairwise phenotype products on the scaled kinship entries (upper triangle only)
    y = y[np.triu_indices(y.shape[0], 1)]
    x = x[np.triu_indices(x.shape[0], 1)]
    slope, intercept, rValue, pValue, stdErr = stats.linregress(x, y)
    return slope
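In short, this is the moment estimator of Golan et al. (PCGC regression): for each pair $(i,j)$, the standardized product of phenotype residuals $y_{ij} = \frac{(\phi_i - p_i)(\phi_j - p_j)}{\sqrt{p_i(1-p_i)\,p_j(1-p_j)}}$ (where $\phi_i$ is the 0/1 phenotype and $p_i$ the fitted case probability) is regressed on the correspondingly rescaled kinship entry $x_{ij}$, and the slope of that regression is the $h^2$ estimate.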
In [11]:
def calcLiabThresholds_3xx(U, s, keepArr, phe, numRemovePCs=10, prevalence=.001, covar=None):
    """
    INPUTS:
    1. U - eigenvectors of the covariance matrix (i.e. kinship matrix)
    2. s - eigenvalues of the covariance matrix (i.e. kinship matrix)
    3. keepArr - np.array of indexes that exclude highly related individuals
    4. phe - phenotypes (binary only)
    5. covar - covariates; present in the authors' signature but unused here (defaults to None)
    OUTPUT:
    1. probs - probability estimates from a regularized logistic regression
    2. thresholds - per-individual liability thresholds
    NOTES:
    original code can be found at:
    https://github.com/omerwe/LEAP/blob/master/leap/calc_h2.py
    """
    #another part of the calc_h2 function
    prev = prevalence
    #numRemovePCs=10 is their default value; as far as I'm aware, they never pass a different one
    if numRemovePCs > 0:
        #remove the top PCs from the kinship matrix (uses the global cov); in the original
        #LEAP code this adjusted matrix is the XXT that gets passed to calcH2Binary
        t_cov = cov - (U[:,-numRemovePCs:]*s[-numRemovePCs:]).dot(U[:,-numRemovePCs:].transpose())
    pheUnique = np.unique(phe)
    isCaseControl = pheUnique.shape[0] == 2 #trivially true for our data
    if not np.all(pheUnique == np.array([0,1])):
        pheMean = phe.mean()
        phe[phe <= pheMean] = 0
        phe[phe > pheMean] = 1
    #probs, thresholds = calcLiabThreholds(U, S, keepArr, phe, numRemovePCs, prevalence, covar)
    #This is equivalent to an SVD; note the authors' covar parameter defaults to None
    G = U[:, -numRemovePCs:] * np.sqrt(s[-numRemovePCs:])
    #perform a regularized logistic regression. I trust their parameter settings.
    Logreg = LogisticRegression(penalty='l2', C=500000, fit_intercept=True)
    Logreg.fit(G[keepArr, :], phe.iloc[keepArr])
    #Compute individual probabilities
    probs = Logreg.predict_proba(G)[:,1]
    #Compute thresholds
    P = np.sum(phe==1) / float(phe.shape[0])
    #the original aliases K = prev before the next line; prev is simply inlined here
    Ki = prev*(1-P) / (P*(1-prev)) * probs / (1 + prev*(1-P) / (P*(1-prev))*probs - probs)
    thresholds = stats.norm(0,1).isf(Ki)
    thresholds[Ki >= 1.] = -999999999
    thresholds[Ki <= 0.] = 999999999
    return probs, thresholds
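Putting steps 2 and 3 together (a minimal sketch, not part of the original run: prev=0.001 follows the default above, the full cov is passed where the original uses the PC-adjusted matrix, and the phenotype is re-indexed positionally so that keepArr lines up):
In [ ]:
#Hedged end-to-end sketch for the heritability estimate.
phe = phenos['pheno'].reset_index(drop=True)   #positional index so keepArr indexes correctly
probs, thresholds = calcLiabThresholds_3xx(U, s, keepArr, phe, prevalence=0.001)
#(the original passes the PC-adjusted t_cov as the kinship here; cov is used for simplicity)
h2 = calcH2Binary(cov, phe.values, probs, thresholds, keepArr, prev=0.001, h2coeff=1.0)
print("Estimated h2:", h2)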
$4$. Estimate liabilities
In [12]:
import numpy as np
import sklearn.linear_model
import scipy.optimize as opt
# Adapted from the LEAP source code; kept as a quoted string and not executed here
'''
def evalProbitReg(beta, X, cases, controls, thresholds, invRegParam, normPDF, h2):
    """
    NOTES: not much to do here as everything is in numpy.
    """
    XBeta = np.ravel(X.dot(beta)) - thresholds
    phiXBeta = normPDF.pdf(XBeta)
    PhiXBeta = normPDF.cdf(XBeta)
    logLik = np.sum(np.log(PhiXBeta[cases])) + np.sum(np.log(1-PhiXBeta[controls]))
    w = np.zeros(X.shape[0])
    w[cases] = -phiXBeta[cases] / PhiXBeta[cases]
    w[controls] = phiXBeta[controls] / (1-PhiXBeta[controls])
    grad = X.T.dot(w)
    #regularize
    logLik -= 0.5*invRegParam * beta.dot(beta) #regularization
    grad += invRegParam * beta
    return [-logLik, grad]
def probitRegHessian(beta, X, cases, controls, thresholds, invRegParam, normPDF, h2):
    """
    NOTES: not much to do here as everything is in numpy. Though, I precalculate the
    squared terms once and then subset them, because the squaring was originally done
    separately for each subset.
    """
    XBeta = np.ravel(X.dot(beta)) - thresholds
    phiXBeta = normPDF.pdf(XBeta)
    PhiXBeta = normPDF.cdf(XBeta)
    XbetaScaled = XBeta #/(1-h2)
    PhiXBeta2 = np.square(PhiXBeta)          #Phi^2, precomputed once
    OneMinusPhi2 = np.square(1 - PhiXBeta)   #(1-Phi)^2, precomputed once
    R = np.zeros(X.shape[0])
    R[cases] = (XbetaScaled[cases]*PhiXBeta[cases] + phiXBeta[cases]) / PhiXBeta2[cases]
    R[controls] = (-XbetaScaled[controls]*(1-PhiXBeta[controls]) + phiXBeta[controls]) / OneMinusPhi2[controls]
    R *= phiXBeta
    H = (X.T * R).dot(X)
    H += invRegParam
    return H
def probitRegression(X, y, thresholds, numSNPs, numFixedFeatures, h2, useHess, maxFixedIters, epsilon, nofail):
    """
    If I had more time, I would probably use PyMC3 for this ... eventually. For now, I just
    removed superfluous parts. The loop in "Fit fixed effects" could also be cythonized -- for later.
    """
    regParam = h2 / float(numSNPs)
    Linreg = sklearn.linear_model.Ridge(alpha=1.0/(2*regParam), fit_intercept=False, normalize=False, solver='lsqr')
    Linreg.fit(X, y)
    initBeta = Linreg.coef_
    np.random.seed(1234)
    normPDF = stats.norm(0, np.sqrt(1-h2))
    invRegParam = 1.0/regParam
    controls = (y==0)
    cases = (y==1)
    funcToSolve = evalProbitReg
    hess = (probitRegHessian if useHess else None)
    jac = True
    method = 'Newton-CG'
    args = (X, cases, controls, thresholds, invRegParam, normPDF, h2)
    print('Beginning Probit regression...')
    t0 = time.time()
    optObj = opt.minimize(funcToSolve, x0=initBeta, args=args, jac=jac, method=method, hess=hess)
    print('Done in', '%0.2f'%(time.time()-t0), 'seconds')
    if (not optObj.success):
        print('Optimization status:', optObj.status)
        print(optObj.message)
        if (nofail == 0): raise Exception('Probit regression failed with message: ' + optObj.message)
    beta = optObj.x
    #Fit fixed effects
    if (numFixedFeatures > 0):
        thresholdsEM = np.zeros(X.shape[0]) + thresholds
        for i in range(maxFixedIters):
            print('Beginning fixed effects iteration', i+1)
            t0 = time.time()
            prevBeta = beta.copy()
            #Learn fixed effects
            thresholdsTemp = thresholdsEM - X[:, numFixedFeatures:].dot(beta[numFixedFeatures:])
            args = (X[:, :numFixedFeatures], cases, controls, thresholdsTemp, 0, normPDF, h2)
            optObj = opt.minimize(funcToSolve, x0=beta[:numFixedFeatures], args=args, jac=True, method=method, hess=hess)
            if (not optObj.success): print(optObj.message) #raise Exception('Learning failed with message: ' + optObj.message)
            beta[:numFixedFeatures] = optObj.x
            #Learn random effects
            thresholdsTemp = thresholdsEM - X[:, :numFixedFeatures].dot(beta[:numFixedFeatures])
            args = (X[:, numFixedFeatures:], cases, controls, thresholdsTemp, invRegParam, normPDF, h2)
            optObj = opt.minimize(funcToSolve, x0=beta[numFixedFeatures:], args=args, jac=True, method=method, hess=hess)
            if (not optObj.success): print(optObj.message) #raise Exception('Learning failed with message: ' + optObj.message)
            beta[numFixedFeatures:] = optObj.x
            diff = np.sqrt(np.mean((beta[:numFixedFeatures] - prevBeta[:numFixedFeatures])**2))
            print('Done in', '%0.2f'%(time.time()-t0), 'seconds')
            print('Diff:', '%0.4e'%diff)
            if (diff < epsilon): break
    return beta
def probit(bed, pheno, h2, prev, eigen, outFile, keepArr, thresholds, covar=None, nofail=0,
           numSkipTopPCs=10, mineig=1e-3, hess=1, recenter=1, maxFixedIters=100, epsilon=1e-3, treatFixedAsRandom=False):
    """
    This version no longer reads in the bed file.
    Default parameters are taken from the argparse section of the original code, which can be
    found at:
    https://github.com/omerwe/LEAP/blob/master/leap/probit.py
    """
    #Extract phenotype
    if isinstance(pheno, dict): phe = pheno['vals']
    else: phe = pheno
    if (len(phe.shape)==2):
        if (phe.shape[1]==1): phe=phe[:,0]
        else: raise Exception('More than one phenotype found')
    if (keepArr is None): keepArr = np.ones(phe.shape[0], dtype=bool)
    S = eigen['arr_1'] * bed.sid.shape[0]
    U = eigen['arr_0']
    S = np.sqrt(S)
    goodS = (S>mineig)
    if (numSkipTopPCs > 0): goodS[-numSkipTopPCs:] = False
    if (np.sum(~goodS) > 0): print('Removing', np.sum(~goodS), 'PCs with low variance')
    G = U[:, goodS]*S[goodS]
    #Set binary vector
    pheUnique = np.unique(phe)
    if (pheUnique.shape[0] != 2): raise Exception('phenotype file has more than two values')
    pheMean = phe.mean()
    cases = (phe>pheMean)
    phe[~cases] = 0
    phe[cases] = 1
    #run probit regression
    t = stats.norm(0,1).isf(prev)
    if (thresholds is not None): t = thresholds
    #Recenter G to only consider the unrelated individuals
    if recenter: G -= np.mean(G[keepArr, :], axis=0)
    else: G -= np.mean(G, axis=0)
    numFixedFeatures = 0
    if (covar is not None):
        covar -= covar.mean()
        covar /= covar.std()
        covar *= np.mean(np.std(G, axis=0))
        G = np.concatenate((covar, G), axis=1)
        if (not treatFixedAsRandom): numFixedFeatures += covar.shape[1]
    #Run Probit regression
    probitThresh = (t if thresholds is None else t[keepArr])
    beta = probitRegression(G[keepArr, :], phe[keepArr], probitThresh, bed.sid.shape[0], numFixedFeatures, h2, hess, maxFixedIters, epsilon, nofail)
    #Predict liabilities for all individuals
    meanLiab = G.dot(beta)
    liab = meanLiab.copy()
    indsToFlip = ((liab <= t) & (phe>0.5)) | ((liab > t) & (phe<0.5))
    liab[indsToFlip] = stats.norm(0,1).isf(prev)
    if (outFile is not None):
        #save liabilities
        f = open(outFile+'.liabs', 'w')
        for ind_i,[fid,iid] in enumerate(bed.iid): f.write(' '.join([fid, iid, '%0.3f'%liab[ind_i]]) + '\n')
        f.close()
        #save liabilities after regressing out the fixed effects
        if (numFixedFeatures > 0):
            liab_nofixed = liab - G[:, :numFixedFeatures].dot(beta[:numFixedFeatures])
            f = open(outFile+'.liab_nofixed', 'w')
            for ind_i,[fid,iid] in enumerate(bed.iid): f.write(' '.join([fid, iid, '%0.3f'%liab_nofixed[ind_i]]) + '\n')
            f.close()
            liab_nofixed2 = meanLiab - G[:, :numFixedFeatures].dot(beta[:numFixedFeatures])
            indsToFlip = ((liab_nofixed2 <= t) & (phe>0.5)) | ((liab_nofixed2 > t) & (phe<0.5))
            liab_nofixed2[indsToFlip] = stats.norm(0,1).isf(prev)
            f = open(outFile+'.liab_nofixed2', 'w')
            for ind_i,[fid,iid] in enumerate(bed.iid): f.write(' '.join([fid, iid, '%0.3f'%liab_nofixed2[ind_i]]) + '\n')
            f.close()
    #Return phenotype struct with liabilities
    liabsStruct = {
        'header':[None],
        'vals':liab,
        'iid':bed.iid
    }
    return liabsStruct
'''
Out[12]:
In [ ]:
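The quoted block above is kept only as a reference and is not executed in this notebook. As a rough illustration of the core idea (an assumption-laden sketch, not LEAP's full probit: it fits a plain probit regression of case status on the top 10 PCs with the population liability threshold as an offset, assuming a prevalence of 0.001 and none of LEAP's regularization or variance rescaling):
In [ ]:
#Hedged sketch of liability estimation via probit regression on the top PCs.
t = stats.norm(0, 1).isf(0.001)                 #liability threshold for prevalence 0.001
G_pc = U[:, -10:] * np.sqrt(s[-10:])            #top-10 PC features, as in step 3
y = phenos['pheno'].values
y = (y > y.mean()).astype(float)                #binarize case status, as the LEAP code does

def negloglik(beta):
    #probit log-likelihood of case status, offset by the threshold t
    z = G_pc[keepArr].dot(beta) - t
    return -(y[keepArr]*stats.norm.logcdf(z) + (1-y[keepArr])*stats.norm.logsf(z)).sum()

res = opt.minimize(negloglik, x0=np.zeros(G_pc.shape[1]))
liab_mean = G_pc.dot(res.x)                     #rough mean liabilities for all individuals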
$5$. Test for associations
In [13]:
# The paper uses the fastlmm.association.single_snp function,
# but fastlmm depends on Python 2.7, so we will use pylmm instead.
In [14]:
# Read in estimated liabilities
liabs = pd.read_csv("dataset1.phe.liab", sep=' ', header=None, engine='c')
liabs.columns = ['fam', 'person', 'liab']
liabs.set_index(keys = 'person', inplace=True)
liabs = liabs.iloc[keepArr] #.ix is deprecated; index positionally
liabs.iloc[:5,:5]
Out[14]:
In [30]:
# Merge liabilities with snps
snps_estliabs = pd.concat([liabs, df], axis = 1)
snps_estliabs.iloc[:5,:5]
Out[30]:
In [32]:
Y = snps_estliabs.iloc[:, 1]     #the 'liab' column
snps = snps_estliabs.iloc[:, 2:] #the SNP columns
In [31]:
! pip install git+https://github.com/nickFurlotte/pylmm
In [34]:
from pylmm import lmm
kin = cov[np.ix_(keepArr, keepArr)] #subset the kinship so its dimensions match Y and snps
TS, PS = lmm.GWAS(Y.values, snps.values, kin, REML=True, refit=True)
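To inspect the output (a sketch: TS and PS are pylmm's per-SNP test statistics and p-values, and pylmm may return None for SNPs it fails to fit):
In [ ]:
#Assemble per-SNP results; the index reuses the (chromosome, snp) MultiIndex built earlier.
results = pd.DataFrame({'tstat': TS, 'pvalue': PS}, index=snps.columns)
results.sort_values('pvalue').head(10)   #top hits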
Run through the LEAP pipeline for each chromosome (parts 2-5)
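A skeleton of that loop (a sketch under the assumption that steps 2-5 are wrapped into the functions above; LEAP excludes the tested chromosome when building the kinship matrix):
In [ ]:
#Hypothetical leave-one-chromosome-out driver, not part of the original run.
for chrom in chromosomes:
    loco = df.drop(chrom, axis=1, level='chromosome')   #all SNPs except the tested chromosome
    k_loco = np.dot(loco, loco.T) / loco.shape[1]       #kinship without that chromosome
    s_c, U_c = eigendecomp(k_loco)                      #step 2
    #steps 3-4: estimate probs/thresholds, h2, and probit liabilities with k_loco
    #step 5: run lmm.GWAS on the SNPs of `chrom` against those liabilities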
In [ ]: