In [ ]:
##Author: Gene Burinskiy
!pip install plinkio
#!pip install h5py --for some reason, h5py doesn't install :/
#!pip install tables --since h5py can't be installed, neither can tables
In [1]:
# Core dependencies: plinkio reads PLINK binary genotype filesets (.bed/.bim/.fam)
# NOTE(review): `re` appears unused in the visible cells — confirm before removing.
import os
import re
import numpy as np
import pandas as pd
from plinkio import plinkfile
os.getcwd()  # show the starting working directory before we chdir below
Out[1]:
In [2]:
#working with original dataset
# NOTE(review): absolute path is specific to this JupyterHub image (/home/jovyan);
# consider a configurable DATA_DIR so the notebook runs elsewhere.
data_path = '/home/jovyan/work/LEAP/leap/regression/dataset1'
os.chdir(data_path)
os.listdir()  # sanity-check that the dataset files are present
Out[2]:
In [3]:
# Open the PLINK binary fileset (dataset1.bed/.bim/.fam) relative to data_path
bed = plinkfile.open("dataset1")
In [4]:
# Locus (SNP) records from the .bim file; length = number of variants
loci = bed.get_loci()
len(loci)
Out[4]:
In [13]:
# Inspect a single locus record to see what metadata plinkio exposes
locus = loci[0]
In [14]:
locus.name  # variant identifier
Out[14]:
In [15]:
locus.chromosome  # chromosome code for this variant
Out[15]:
In [16]:
# Which chromosomes are represented across all loci
np.unique([x.chromosome for x in loci])
Out[16]:
In [5]:
# Sample records (one per individual, from the .fam file)
samples = bed.get_samples()
print("Object of type: ",type(samples), "and length:" ,len(samples))
In [6]:
sample = samples[500]  # arbitrary individual picked for inspection
In [7]:
# Family id, father id, individual id, phenotype value, and sex code
print(sample.fid, sample.father_iid, sample.iid, sample.phenotype, sample.sex)
In [8]:
# First row of the bed file (one row per locus).
# NOTE(review): the list comprehension iterates the WHOLE file; confirm plinkio
# allows re-iteration, otherwise the matrix-building loop below reads nothing.
h = [row for row in bed][0]
In [26]:
# Phenotype file: whitespace-separated, three columns.
# NOTE(review): header=0 together with names= makes pandas treat the FIRST file
# row as a header and discard it; if dataset1.phe has no header line this silently
# drops one record — header=None would be correct. Confirm against the file.
phenos = pd.read_csv("dataset1.phe", sep=' ', header=0, names=['fam', 'person', 'pheno'], skiprows=0)
phenos.iloc[:5]
Out[26]:
In [ ]:
len([x for x in h])  # genotype calls in one row — presumably one per sample; verify against len(samples)
In [ ]:
##each row in bed is of length 10000 and there are 10,499 rows thus to make a matrix:
# BUG FIX: the array was allocated with only 1,000 columns, which cannot hold
# rows of length 10,000 — the assignment below would raise a broadcast error.
mat = np.zeros((10499, 10000), dtype='int16') #1/4 of the taken up space
# NOTE(review): an earlier cell already iterated `bed` once; confirm plinkio
# supports re-iteration (or reopen the file) before running this loop.
for i, row in enumerate(mat if False else bed):  # enumerate replaces the manual counter
    mat[i, :] = np.array([snp for snp in row])
In [ ]:
#this matrix is equivalent to transposed bed.val
# Confirm the compact dtype survived, preview a corner, and report memory in MiB
print("Data type:", mat.dtype)
print(mat[:2,:5])
print("Size of bed matrix: %4.0fmb" %(mat.nbytes/(1024**2)))
In [ ]:
# Build a samples x SNPs DataFrame: rows indexed by individual id, columns by SNP name
df = pd.DataFrame(mat.transpose()) #normally, it reads them in as floats which is a huge waste of space
df.columns = [x.name for x in loci]
df.index = [x.iid for x in bed.get_samples()] #could also double index on chromosomes
df.iloc[:5,:5]
In [ ]:
np.unique(df.dtypes)  # confirm the int16 dtype survived the DataFrame construction
In [ ]:
# Center each SNP column at zero: df.mean() is column-wise, so every SNP has its
# per-column mean subtracted; float32 halves memory versus the default float64.
df = df.astype('float32')-df.astype('float32').mean() #this gets us pretty close to their normalization stuff
df.iloc[:5,:5]
In [ ]:
##Save the file to sql db - not feasible for this data
#from sqlalchemy import create_engine
#engine = create_engine('sqlite:///dataset1.db', echo=False)
#df.transpose().to_sql(name='dataset1', con=engine, if_exists = 'replace', index=True)
In [ ]:
%%timeit
# Benchmark covariance; np.cov treats each row of df (an individual) as a variable
np.cov(df)
In [ ]:
# Covariance between individuals (rows of df are treated as variables by np.cov)
cov = np.cov(df)
# Fixed typo in the printed label: "Shave" -> "Shape"
print("Shape of covariance matrix:", cov.shape)
cov[:5,:5]
In [ ]:
"""
A straightforward solution of the optimization problem presented above is difficult owing to high dimensionality, which is equal to the number of genotyped
variants. Fortunately, the problem can be reformulated as a lower-dimensional problem, with dimensionality equal to the number of individuals.
The equivalence stems from the fact that the genotypes
matrix X can be represented in terms of the eigenvectors of its covariance
"""
In [ ]:
%%timeit
# Time numpy's SVD; full_matrices=False gives the economy-size decomposition
U,s,V = np.linalg.svd(df,full_matrices=False )
In [ ]:
# `import scipy` alone does not guarantee the `linalg` subpackage is loaded;
# import the submodule explicitly so scipy.linalg.svd below is always available.
import scipy.linalg
In [ ]:
%%timeit
U,s,V, = scipy.linalg.svd(df, full_matrices=False)
In [ ]:
%%timeit
# Repeat of the scipy SVD benchmark (same call as the previous cell)
U,s,V = scipy.linalg.svd(df, full_matrices=False)
In [ ]:
"""
Dependencies don't quite exist.
from pandas import HDFStore
hdf = HDFStore('dataset1.h5')
# put the dataset in the storage
hdf.put('dataset1', df, format='table', data_columns=True)
hdf.append('d1', DataFrame(np.random.rand(5,3),
columns=('A','B','C')),
format='table', data_columns=True)
hdf.close() # closes the file
hdf = read_hdf('storage.h5', 'd1',
where=['A>.5'], columns=['A','B'])
"""
In [ ]:
# Keep the decomposition for downstream use: U left singular vectors,
# s singular values, V right singular vectors (economy size)
U,s,V = scipy.linalg.svd(df, full_matrices=False)
print("Shapes of U,s,and V respectively")
print(U.shape, s.shape, V.shape)
In [ ]: