In [1]:
# set some ipython notebook properties
%matplotlib inline
# set degree of verbosity (adapt to INFO for more verbose output)
import logging
logging.basicConfig(level=logging.WARNING)
# set figure sizes
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
In [2]:
import os
import numpy as np
from pysnptools.snpreader import Bed
snpreader = Bed("all.bed", count_A1=False)
# What is snpreader?
print snpreader
In [3]:
print snpreader.iid_count
print snpreader.sid_count
print snpreader.iid[:3]
print snpreader.sid
In [4]:
snpdata = snpreader.read()
#What is snpdata?
print snpdata
#What do the iids and sid of snprdata look like?
print snpdata.iid_count, snpdata.sid_count
print snpdata.iid[:3]
print snpdata.sid
In [5]:
print snpdata.val[:7,:7]
print np.mean(snpdata.val)
In [6]:
print np.mean(Bed("all.bed",count_A1=False).read().val)
In [7]:
from pysnptools.snpreader import SnpData
snpdata1 = SnpData(iid=[['f1','c1'],['f1','c2'],['f2','c1']],
sid=['snp1','snp2'],
val=[[0,1],[2,.5],[.5,np.nan]])
print np.nanmean(snpdata1.val)
see PowerPoint summary
In [8]:
snpreader = Bed("all.bed",count_A1=False)
snp0reader = snpreader[:,0]
print snp0reader
print snp0reader.iid_count, snp0reader.sid_count
print snp0reader.sid
In [9]:
print snpreader # Is not changed
In [10]:
snp0data = snp0reader.read()
print snp0data
print snp0data.iid_count, snp0data.sid_count
print snp0data.sid
print snp0data.val[:10,:]
In [11]:
print Bed("all.bed",count_A1=False)[9,:].read().val
In [12]:
snp55data = Bed("all.bed",count_A1=False)[:5,:5].read()
print snp55data
print snp55data.iid_count, snp55data.sid_count
print snp55data.iid
print snp55data.sid
print snp55data.val
In [13]:
snpreaderA = Bed("all.bed",count_A1=False) # read all
snpreaderB = snpreaderA[:,:250] #read first 250 sids
snpreaderC = snpreaderB[:10,:] # reader first 10 iids
snpreaderD = snpreaderC[::2,::2]
print snpreaderD
print snpreaderD.iid_count, snpreaderD.sid_count
print snpreaderD.iid
print snpreaderD.read().val[:10,:10] #only reads the final values desired (if practical)
In [14]:
# List of indexes (can use to reorder)
snpdata43210 = Bed("all.bed",count_A1=False)[[4,3,2,1,0],:].read()
print snpdata43210.iid
In [15]:
# List of booleans to select
snp43210B = snpdata43210[[False,True,True,False,False],:]
print snp43210B
print snp43210B.iid
In [16]:
print hasattr(snp43210B,'val')
Answer: No. It's a subset of a SnpData, so it will read from a SnpData, but it is not a SnpData. Use .read() to get the values.
In [17]:
snpdata4321B = snp43210B.read(view_ok=True) #view_ok means ok to share memory
print snpdata4321B.val
In [18]:
print Bed("all.bed",count_A1=False)[::-10,:].iid[:10]
see PowerPoint summary
In [19]:
print Bed("all.bed",count_A1=False).read().val.flags
In [20]:
snpdata32c = Bed("all.bed",count_A1=False).read(order='C',dtype=np.float32)
print snpdata32c.val.dtype
In [21]:
print snpdata32c.val.flags
In [22]:
print Bed("all.bed",count_A1=False).pos
In [23]:
snpreader = Bed("all.bed",count_A1=False)
print snpreader.pos[:,0]
In [24]:
chr5_bools = (snpreader.pos[:,0] == 5)
print chr5_bools
In [25]:
chr5reader = snpreader[:,chr5_bools]
print chr5reader
In [26]:
chr5data = chr5reader.read()
print chr5data.pos
In one line
In [27]:
chr5data = Bed("all.bed",count_A1=False)[:,snpreader.pos[:,0] == 5].read()
print chr5data.val
In [28]:
snpreader = Bed("all.bed",count_A1=False)
iid0 =[['cid499P1','cid499P1'],
['cid489P1','cid489P1'],
['cid479P1','cid479P1']]
indexes0 = snpreader.iid_to_index(iid0)
print indexes0
In [29]:
snpreader0 = snpreader[indexes0,:]
print snpreader0.iid
In [30]:
# more condensed
snpreader0 = snpreader[snpreader.iid_to_index(iid0),:]
print snpreader0.iid
In [31]:
snpdata0chr5 = snpreader[snpreader.iid_to_index(iid0),snpreader.pos[:,0] == 5].read()
print np.mean(snpdata0chr5.val)
see PowerPoint summary
In [32]:
from pysnptools.snpreader import Pheno
phenoreader = Pheno("pheno_10_causals.txt")
print phenoreader
print phenoreader.iid_count, phenoreader.sid_count
In [33]:
print phenoreader.sid
print phenoreader.pos
In [34]:
phenodata = phenoreader.read()
print phenodata.val[:10,:]
In [35]:
snpdata1010 = Bed("all.bed",count_A1=False)[:10,:10].read()
Pheno.write("deleteme1010.txt",snpdata1010)
print os.path.exists("deleteme1010.txt")
Write the snpdata to Bed format
In [36]:
Bed.write("deleteme1010.bed",snpdata1010)
print os.path.exists("deleteme1010.bim")
In [37]:
snpdata1 = SnpData(iid=[['f1','c1'],['f1','c2'],['f2','c1']],
sid=['snp1','snp2'],
val=[[0,1],[2,1],[1,np.nan]])
Bed.write("deleteme1.bed",snpdata1)
print os.path.exists("deleteme1.fam")
In [38]:
from pysnptools.snpreader import SnpNpz
SnpNpz.write("deleteme1010.snp.npz", snpdata1010)
print os.path.exists("deleteme1010.snp.npz")
Use SnpHdf5 for low-memory random-access reads, good speed and size, and compatiblity outside Python
In [39]:
from pysnptools.snpreader import SnpHdf5
SnpHdf5.write("deleteme1010.snp.hdf5", snpdata1010)
print os.path.exists("deleteme1010.snp.hdf5")
see PowerPoint summary
In [40]:
snpreader = Bed("all.bed",count_A1=False)
phenoreader = Pheno("pheno_10_causals.txt")[::-2,:] #half the iids, and in reverse order
print snpreader.iid_count, phenoreader.iid_count
print snpreader.iid[:5]
print phenoreader.iid[:5]
In [41]:
import pysnptools.util as pstutil
snpreader_i,phenoreader_i = pstutil.intersect_apply([snpreader,phenoreader])
print np.array_equal(snpreader_i.iid,phenoreader_i.iid)
In [42]:
snpdata_i = snpreader_i.read()
phenodata_i = phenoreader_i.read()
print np.array_equal(snpdata_i.iid,phenodata_i.iid)
print snpdata_i.val[:10,:]
print phenodata_i.val[:10,:]
In [43]:
weights = np.linalg.lstsq(snpdata_i.val, phenodata_i.val)[0] #usually would add a 1's column
predicted = snpdata_i.val.dot(weights)
import matplotlib.pyplot as plt
plt.plot(phenodata_i.val, predicted, '.', markersize=10)
plt.show() #Easy to 'predict' seen 250 cases with 5000 variables.
In [44]:
# How does it predict unseen cases?
phenoreader_unseen = Pheno("pheno_10_causals.txt")[-2::-2,:]
snpreader_u,phenoreader_u = pstutil.intersect_apply([snpreader,phenoreader_unseen])
snpdata_u = snpreader_u.read()
phenodata_u = phenoreader_u.read()
predicted_u = snpdata_u.val.dot(weights)
plt.plot(phenodata_u.val, predicted_u, '.', markersize=10)
plt.show() #Hard to predict unseen 250 cases with 5000 variables.
see PowerPoint summary
In [45]:
snpreader = Bed("all.bed",count_A1=False)
snpdata = snpreader.read()
snpdata = snpdata.standardize() #In place AND returns self
print snpdata.val[:,:5]
In [46]:
snpdata = Bed("all.bed",count_A1=False).read().standardize()
print snpdata.val[:,:5]
In [47]:
from pysnptools.standardizer import Beta
snpdataB = Bed("all.bed",count_A1=False).read().standardize(Beta(1,25))
print snpdataB.val[:,:4]
In [48]:
from pysnptools.standardizer import Unit
kerneldata = Bed("all.bed",count_A1=False).read_kernel(standardizer=Unit())
print kerneldata.val[:,:4]
In [49]:
kerneldata = Bed("all.bed",count_A1=False).read_kernel(standardizer=Unit(),block_size=500)
print kerneldata.val[:,:4]
see PowerPoint summary
In [50]:
from pysnptools.snpreader import Bed
pstreader = Bed("all.bed",count_A1=False)
print pstreader.row_count, pstreader.col_count
In [51]:
print pstreader.col_property
In [52]:
from pysnptools.pstreader import PstData
data1 = PstData(row=['a','b','c'],
col=['y','z'],
val=[[1,2],[3,4],[np.nan,6]],
row_property=['A','B','C'])
reader2 = data1[data1.row < 'c', data1.col_to_index(['z','y'])]
print reader2
In [53]:
print reader2.read().val
In [54]:
print reader2.row_property
In [55]:
print reader2.col_property.shape, reader2.col_property.dtype
In [56]:
from pysnptools.pstreader import PstNpz, PstHdf5
fnnpz = "delme.pst.npz"
PstNpz.write(fnnpz,data1)
data2 = PstNpz(fnnpz).read()
fnhdf5 = "delme.pst.hdf5"
PstHdf5.write(fnhdf5,data2)
data3 = PstHdf5(fnhdf5).read()
print data2, data3
In [57]:
print data2.val
print data3.val
see PowerPoint summary
In [58]:
from pysnptools.util import IntRangeSet
a = IntRangeSet("100:500,501:1000") # a is the set of integers from 100 to 500 (exclusive) and 501 to 1000 (exclusive)
b = IntRangeSet("-20,400:600") # b is the set of integers -20 and the range 400 to 600 (exclusive)
c = a | b # c is the union of a and b, namely -20 and 100 to 1000 (exclusive)
print c
print 200 in c
print -19 in c
In [59]:
from pysnptools.util import IntRangeSet
line = "chr15 29370 37380 29370,32358,36715 30817,32561,37380"
chr,trans_start,trans_last,exon_starts,exon_lasts = line.split() # split the line on white space
trans_start = int(trans_start)
trans_stop = int(trans_last) + 1 # add one to convert the inclusive "last" value into a Pythonesque exclusive "stop" value
int_range_set = IntRangeSet((trans_start,trans_stop)) # creates a IntRangeSet from 29370 (inclusive) to 37381 (exclusive)
print int_range_set # print at any time to see the current value
Parse the exon start and last lists from strings to lists of integers (converting ‘last’ to ‘stop’)
In [60]:
exon_starts = [int(start) for start in exon_starts.strip(",").split(',')]
exon_stops = [int(last)+1 for last in exon_lasts.strip(",").split(',')]
assert len(exon_starts) == len(exon_stops)
Zip together the two lists to create an iterable of exon_start,exon_stop tuples. Then ‘set subtract’ all these ranges from int_range_set.
In [61]:
from itertools import izip
int_range_set -= izip(exon_starts,exon_stops)
print int_range_set # See what it looks like
Create the desired output by iterating through each contiguous range of integers.
In [62]:
for start, stop in int_range_set.ranges():
print "{0} {1} {2}".format(chr, start, stop-1)
see PowerPoint summary
In [63]:
# import the algorithm
from fastlmm.association import single_snp_leave_out_one_chrom
from pysnptools.snpreader import Bed
# set up data
##############################
snps = Bed("all.bed",count_A1=False)
pheno_fn = "pheno_10_causals.txt"
cov_fn = "cov.txt"
# run gwas
###################################################################
results_df = single_snp_leave_out_one_chrom(snps, pheno_fn, covar=cov_fn)
# print head of results data frame
import pandas as pd
pd.set_option('display.width', 1000)
results_df.head(n=10)
Out[63]:
In [65]:
# manhattan plot
import pylab
import fastlmm.util.util as flutil
flutil.manhattan_plot(results_df[["Chr", "ChrPos", "PValue"]],pvalue_line=1e-5,xaxis_unit_bp=False)
pylab.show()
see PowerPoint summary