In [ ]:
##Author: Gene Burinskiy

!pip install plinkio
#!pip install h5py --for some reason, h5py doesn't install :/
#!pip install tables --since h5py can't be installed, neither can tables

In [1]:
import os
import re
import numpy as np
import pandas as pd
from plinkio import plinkfile
os.getcwd()


Out[1]:
'/home/jovyan/work/STA-663-Final-Project/data'

In [2]:
#working with original dataset
data_path = '/home/jovyan/work/LEAP/leap/regression/dataset1'
os.chdir(data_path)
os.listdir()


Out[2]:
['extracts',
 'dataset1.db',
 'dataset1-Copy1.bed',
 'dataset1.bed',
 'dataset1.cov',
 '.pversion',
 'dataset1.bim',
 'dataset1.fam',
 'dataset1.phe.liab',
 'dataset1.phe']

In [3]:
bed = plinkfile.open("dataset1")

In [4]:
loci = bed.get_loci()
len(loci)


Out[4]:
10499

In [13]:
locus = loci[0]

In [14]:
locus.name


Out[14]:
'csnp18'

In [15]:
locus.chromosome


Out[15]:
1

In [16]:
np.unique([x.chromosome for x in loci])


Out[16]:
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [5]:
samples = bed.get_samples()
print("Object of type: ",type(samples), "and length:" ,len(samples))


Object of type:  <class 'list'> and length: 1000

In [6]:
sample = samples[500]

In [7]:
print(sample.fid, sample.father_iid, sample.iid, sample.phenotype, sample.sex)


FAM1 0 person501 3.591670036315918 1

In [8]:
h = [row for row in bed][0]

In [26]:
phenos = pd.read_csv("dataset1.phe", sep=' ', header=0, names=['fam', 'person', 'pheno'], skiprows=0)
phenos.iloc[:5]


Out[26]:
fam person pheno
0 FAM1 person2 0
1 FAM1 person3 0
2 FAM1 person4 0
3 FAM1 person5 0
4 FAM1 person6 0

In [ ]:
len([x for x in h])

In [ ]:
##each row in bed is of length 10000 and there are 10,499 rows thus to make a matrix:
mat = np.zeros((10499,1000), dtype='int16') #1/4 of the taken up space

i=0
for row in bed:
    mat[i,:] = np.array([snp for snp in row])
    i+=1

In [ ]:
#this matrix is equivalent to transposed bed.val
print("Data type:", mat.dtype)
print(mat[:2,:5])
print("Size of bed matrix: %4.0fmb" %(mat.nbytes/(1024**2)))

In [ ]:
df = pd.DataFrame(mat.transpose()) #normally, it reads them in as floats which is a huge waste of space
df.columns = [x.name for x in loci]
df.index = [x.iid for x in bed.get_samples()] #could also double index on chromosomes
df.iloc[:5,:5]

In [ ]:
np.unique(df.dtypes)

In [ ]:
df = df.astype('float32')-df.astype('float32').mean() #this gets us pretty close to their normalization stuff
df.iloc[:5,:5]

In [ ]:
##Save the file to sql db - not feasible for this data
#from sqlalchemy import create_engine

#engine = create_engine('sqlite:///dataset1.db', echo=False)
#df.transpose().to_sql(name='dataset1', con=engine, if_exists = 'replace', index=True)

In [ ]:
%%timeit
np.cov(df)

In [ ]:
cov = np.cov(df)
print("Shave of covariance matrix:", cov.shape)
cov[:5,:5]

In [ ]:
"""
A straightforward solution of the optimization problem presented above is difficult owing to high dimensionality, which is equal to the number of genotyped
variants. Fortunately, the problem can be reformulated as a lower-dimensional problem, with dimensionality equal to the number of individuals.
The equivalence stems from the fact that the genotypes
matrix X can be represented in terms of the eigenvectors of its covariance
"""

In [ ]:
%%timeit
U,s,V = np.linalg.svd(df,full_matrices=False )

In [ ]:
import scipy

In [ ]:
%%timeit
U,s,V, = scipy.linalg.svd(df, full_matrices=False)

In [ ]:
%%timeit

U,s,V = scipy.linalg.svd(df, full_matrices=False)

In [ ]:
"""
Dependencies don't quite exist. 
from pandas import HDFStore

hdf = HDFStore('dataset1.h5')
# put the dataset in the storage
hdf.put('dataset1', df, format='table', data_columns=True)

hdf.append('d1', DataFrame(np.random.rand(5,3), 
           columns=('A','B','C')), 
           format='table', data_columns=True)
hdf.close() # closes the file

hdf = read_hdf('storage.h5', 'd1',
               where=['A>.5'], columns=['A','B'])
"""

In [ ]:
U,s,V = scipy.linalg.svd(df, full_matrices=False)
print("Shapes of U,s,and V respectively")
print(U.shape, s.shape, V.shape)

In [ ]: