notebook.community

Edit and run



In [ ]:

    
##Author: Gene Burinskiy

!pip install plinkio
#!pip install h5py --for some reason, h5py doesn't install :/
#!pip install tables --since h5py can't be installed, neither can tables



In [1]:

    
import os
import re
import numpy as np
import pandas as pd
from plinkio import plinkfile
os.getcwd()









    Out[1]:





'/home/jovyan/work/STA-663-Final-Project/data'



In [2]:

    
#working with original dataset
data_path = '/home/jovyan/work/LEAP/leap/regression/dataset1'
os.chdir(data_path)
os.listdir()









    Out[2]:





['extracts',
 'dataset1.db',
 'dataset1-Copy1.bed',
 'dataset1.bed',
 'dataset1.cov',
 '.pversion',
 'dataset1.bim',
 'dataset1.fam',
 'dataset1.phe.liab',
 'dataset1.phe']



In [3]:

    
bed = plinkfile.open("dataset1")



In [4]:

    
loci = bed.get_loci()
len(loci)









    Out[4]:





10499



In [13]:

    
locus = loci[0]



In [14]:

    
locus.name









    Out[14]:





'csnp18'



In [15]:

    
locus.chromosome









    Out[15]:





1



In [16]:

    
np.unique([x.chromosome for x in loci])









    Out[16]:





array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])



In [5]:

    
samples = bed.get_samples()
print("Object of type: ",type(samples), "and length:" ,len(samples))









    



Object of type:  <class 'list'> and length: 1000



In [6]:

    
sample = samples[500]



In [7]:

    
print(sample.fid, sample.father_iid, sample.iid, sample.phenotype, sample.sex)









    



FAM1 0 person501 3.591670036315918 1



In [8]:

    
h = [row for row in bed][0]



In [26]:

    
phenos = pd.read_csv("dataset1.phe", sep=' ', header=0, names=['fam', 'person', 'pheno'], skiprows=0)
phenos.iloc[:5]



In [ ]:

    
len([x for x in h])



In [ ]:

    
##each row in bed is of length 10000 and there are 10,499 rows thus to make a matrix:
mat = np.zeros((10499,1000), dtype='int16') #1/4 of the taken up space

i=0
for row in bed:
    mat[i,:] = np.array([snp for snp in row])
    i+=1



In [ ]:

    
#this matrix is equivalent to transposed bed.val
print("Data type:", mat.dtype)
print(mat[:2,:5])
print("Size of bed matrix: %4.0fmb" %(mat.nbytes/(1024**2)))



In [ ]:

    
df = pd.DataFrame(mat.transpose()) #normally, it reads them in as floats which is a huge waste of space
df.columns = [x.name for x in loci]
df.index = [x.iid for x in bed.get_samples()] #could also double index on chromosomes
df.iloc[:5,:5]



In [ ]:

    
np.unique(df.dtypes)



In [ ]:

    
df = df.astype('float32')-df.astype('float32').mean() #this gets us pretty close to their normalization stuff
df.iloc[:5,:5]



In [ ]:

    
##Save the file to sql db - not feasible for this data
#from sqlalchemy import create_engine

#engine = create_engine('sqlite:///dataset1.db', echo=False)
#df.transpose().to_sql(name='dataset1', con=engine, if_exists = 'replace', index=True)



In [ ]:

    
%%timeit
np.cov(df)



In [ ]:

    
cov = np.cov(df)
print("Shave of covariance matrix:", cov.shape)
cov[:5,:5]



In [ ]:

    
"""
A straightforward solution of the optimization problem presented above is difficult owing to high dimensionality, which is equal to the number of genotyped
variants. Fortunately, the problem can be reformulated as a lower-dimensional problem, with dimensionality equal to the number of individuals.
The equivalence stems from the fact that the genotypes
matrix X can be represented in terms of the eigenvectors of its covariance
"""



In [ ]:

    
%%timeit
U,s,V = np.linalg.svd(df,full_matrices=False )



In [ ]:

    
import scipy



In [ ]:

    
%%timeit
U,s,V, = scipy.linalg.svd(df, full_matrices=False)



In [ ]:

    
%%timeit

U,s,V = scipy.linalg.svd(df, full_matrices=False)



In [ ]:

    
"""
Dependencies don't quite exist. 
from pandas import HDFStore

hdf = HDFStore('dataset1.h5')
# put the dataset in the storage
hdf.put('dataset1', df, format='table', data_columns=True)

hdf.append('d1', DataFrame(np.random.rand(5,3), 
           columns=('A','B','C')), 
           format='table', data_columns=True)
hdf.close() # closes the file

hdf = read_hdf('storage.h5', 'd1',
               where=['A>.5'], columns=['A','B'])
"""



In [ ]:

    
U,s,V = scipy.linalg.svd(df, full_matrices=False)
print("Shapes of U,s,and V respectively")
print(U.shape, s.shape, V.shape)



In [ ]:

	fam	person
0	FAM1	person2
1	FAM1	person3
2	FAM1	person4
3	FAM1	person5
4	FAM1	person6