This demo illustrates how mtSet can be used for multi- and single-trait set tests. We consider a dataset of 192 samples and 3 flowering-time phenotypes in A. thaliana from Atwell et al. 2010 (Nature). Phenotypes were quantile-normalized to a Gaussian distribution beforehand.

Here we consider 3 different models (a sketch of the underlying mixed model is given after the list):

  • mtSet: multi-trait analysis where relatedness is accounted for as a random effect
  • stSet: single-trait analysis where relatedness is accounted for as a random effect
  • mtSet1VC: multi-trait analysis where relatedness is accounted for with the first 5 principal components of the kinship matrix as fixed effects
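
Roughly speaking, all three tests build on a multi-trait linear mixed model of the form (this is a sketch of the general idea, not the library's exact parameterization)

    vec(Y) ~ N( vec(F B),  C_r ⊗ R_r  +  C_g ⊗ R_g  +  C_n ⊗ I_N )

where Y is the N x P phenotype matrix, F B the fixed-effect part, R_r the local relatedness computed from the variants in the tested window, R_g the genome-wide relatedness matrix, and C_r, C_g, C_n are P x P trait covariance matrices of the region, background, and noise terms. The set test compares this model against the null model with C_r = 0 via a log-likelihood ratio (LLR). stSet runs the same test trait by trait (P = 1), while mtSet1VC drops the background term C_g ⊗ R_g and instead accounts for relatedness through principal components of R_g included in the fixed effects F.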

Setup environment


In [1]:
import ipdb
import sys
import limix
sys.path.append('./..')

import mtSet.pycore.modules.splitter as SPLIT
import mtSet.pycore.modules.multiTraitSetTest as MTST
import mtSet.pycore.modules.chi2mixture as C2M
from mtSet.pycore.utils.utils import smartAppend
from mtSet.pycore.utils.utils import smartDumpDictHdf5

import scipy as SP
import h5py
import pylab as PL
import copy
import os
import cPickle
import time as TIME

In [2]:
files = {}
files['data_dir'] = 'data'
files['data'] = 'data/arab107_preprocessed.hdf5'
files['data_url'] = 'http://www.ebi.ac.uk/~casale/arab107_preprocessed.hdf5'
files['out_file'] = 'data/results.hdf5'
files['split_cache'] = 'windows_split.hdf5'
files['mtSet_null_cache'] = 'mtSet_null_cache.hdf5'
files['stSet_null_cache'] = 'stSet_null_cache.hdf5'
files['mtSet1VC_null_cache'] = 'mtSet1VC_null_cache.hdf5'

Download the data file if it is not already present


In [1]:
import os
import urllib

if not os.path.exists(files['data']):
    if not os.path.exists(files['data_dir']):
        os.makedirs(files['data_dir'])
    print "file not found, downloading from %s" % files['data_url']
    testfile = urllib.URLopener()
    testfile.retrieve(files['data_url'], files['data'])

Settings for splitting the genome into regions (windows) and for the permutation procedure


In [3]:
settings = {}
settings['window_size']    = 1e4
settings['minNumberSnps']  = 4
settings['n_windows']      = 10
settings['n_permutations'] = 10

Import data

  • N = number of samples
  • P = number of phenotypes
  • V = number of variants
  • K = number of covariates


In [4]:
f  = h5py.File(files['data'],'r')
phenotype   = f['phenotype'][:]   # phenotype matrix (NxP)
phenotypeID = f['phenotypeID'][:] # phenotype ids (P-vector)
genotype    = f['genotype']       # genotype matrix (NxV)
relatedness = f['relatedness'][:] # relatedness matrix (NxN)
geno_pos    = f['geno_pos'][:]    # genotype positions (V-vector)
geno_chrom  = f['geno_chrom'][:]  # genotype chromosomes (V-vector)
covariates  = f['covariates'][:]  # covariate matrix (NxK)
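
As a quick sanity check (an optional sketch, not part of the original analysis), the dimensions of the loaded arrays can be verified against each other:

In [ ]:
# optional sanity check of the data dimensions (sketch)
N, P = phenotype.shape                       # samples x phenotypes
V = genotype.shape[1]                        # number of variants
K = covariates.shape[1]                      # number of covariates
assert relatedness.shape == (N, N)
assert genotype.shape[0] == N
assert geno_pos.shape[0] == V and geno_chrom.shape[0] == V
assert phenotypeID.shape[0] == P
print('N=%d, P=%d, V=%d, K=%d' % (N, P, V, K))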

Analysis with mtSet

Here we consider no covariates for mtSet and stSet, while for mtSet1VC we consider 6 covariates (an intercept term and the first 5 principal components of the relatedness matrix). A sketch of how such a covariate matrix could be constructed is given below.
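
The dataset already ships a precomputed covariates matrix, so the cell below is purely illustrative; the exact construction used to generate it is an assumption.

In [ ]:
# illustrative sketch: intercept term + first 5 PCs of the relatedness matrix
import scipy.linalg as LA
S, U = LA.eigh(relatedness)                   # eigenvalues in ascending order
pcs = U[:, ::-1][:, :5]                       # eigenvectors of the 5 largest eigenvalues
covariates_sketch = SP.concatenate([SP.ones((pcs.shape[0], 1)), pcs], 1)  # N x 6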

Multi-trait set test class


In [5]:
mtSet    = MTST.MultiTraitSetTest(phenotype,relatedness)
mtSet1VC = MTST.MultiTraitSetTest(phenotype,F=covariates)

Fit null models


In [6]:
mtSet_null_info   = mtSet.fitNull(cache=True,fname=files['mtSet_null_cache'],rewrite=True)
stSet_null_info   = mtSet.fitNullTraitByTrait(cache=True,fname=files['stSet_null_cache'],rewrite=True)
mtSet1VC_null_info = mtSet1VC.fitNull(cache=True,fname=files['mtSet1VC_null_cache'],rewrite=True)
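
The objects returned by the null fits can be inspected directly; their exact content depends on the mtSet version, so the cell below is only an inspection sketch:

In [ ]:
# inspect what the null fits return (available keys depend on the mtSet version)
print(sorted(mtSet_null_info.keys()))
print(sorted(stSet_null_info.keys()))
print(sorted(mtSet1VC_null_info.keys()))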

Precompute genotype windows


In [7]:
split = SPLIT.Splitter(pos=geno_pos,chrom=geno_chrom)
split.splitGeno(size=settings['window_size'],minSnps=settings['minNumberSnps'],cache=True,fname=files['split_cache'])
nWindows = split.get_nWindows()
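
Before running the scan, the windows can be listed with the same getWindow accessor used in the scan cell below (an optional sketch):

In [ ]:
# optional: list the windows that will be tested (sketch)
print('number of windows: %d' % nWindows)
for window_idx in range(min(settings['n_windows'], nWindows)):
    Iregion, rv_windows = split.getWindow(window_idx)
    n_snps = genotype[:, Iregion].shape[1]
    print('window %d: chrom %s, start %d, end %d, %d SNPs' %
          (window_idx, str(rv_windows['chrom'][0]), rv_windows['start'][0],
           rv_windows['end'][0], n_snps))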

Set test scan


In [8]:
RV = {}
for window_idx in range(settings['n_windows']):

    print '\t.. window %d'%window_idx

    # consider genetic region
    Iregion, rv_windows = split.getWindow(window_idx)
    region = genotype[:,Iregion]

    # fit models
    rv_mtSet = mtSet.optimize(region)
    rv_stSet = mtSet.optimizeTraitByTrait(region)
    rv_mtSet1VC = mtSet1VC.optimize(region)

    # store LLR (log likelihood ratios) and window positions
    smartAppend(RV,'window_chromosome',rv_windows['chrom'][0])
    smartAppend(RV,'window_start',rv_windows['start'][0])
    smartAppend(RV,'window_end',rv_windows['end'][0])
    smartAppend(RV,'llr_mtSet',rv_mtSet['LLR'][0])
    smartAppend(RV,'llr_stSet',SP.concatenate([rv_stSet[key]['LLR'] for key in rv_stSet.keys()]))
    smartAppend(RV,'llr_mtSet1VC',rv_mtSet1VC['LLR'][0])


	.. window 0
	.. window 1
	.. window 2
	.. window 3
	.. window 4
	.. window 5
	.. window 6
	.. window 7
	.. window 8
	.. window 9

Permutations


In [9]:
for permutation_i in range(settings['n_permutations']):
    print '.. permutation %d' % permutation_i

    # set seed and generate sample permutation
    SP.random.seed(permutation_i)
    permutation = SP.random.permutation(phenotype.shape[0])

    for window_idx in range(settings['n_windows']):

        print '\t.. window %d'%window_idx

        # consider genetic region and permute
        Iregion, rv_windows = split.getWindow(window_idx)
        region = genotype[:,Iregion]
        permuted_region = region[permutation,:]

        # fit models
        rv_mtSet = mtSet.optimize(permuted_region)
        rv_stSet = mtSet.optimizeTraitByTrait(permuted_region)
        rv_mtSet1VC = mtSet1VC.optimize(permuted_region)

        # store permutation LLRs
        smartAppend(RV,'permutation_llr_mtSet',rv_mtSet['LLR'][0])
        smartAppend(RV,'permutation_llr_stSet',SP.concatenate([rv_stSet[key]['LLR'] for key in rv_stSet.keys()]))
        smartAppend(RV,'permutation_llr_mtSet1VC',rv_mtSet1VC['LLR'][0])


.. permutation 0
	.. window 0
	.. window 1
	.. window 2
	.. window 3
	.. window 4
	.. window 5
	.. window 6
	.. window 7
	.. window 8
	.. window 9
.. permutation 1
	.. window 0
	.. window 1
	.. window 2
	.. window 3
	.. window 4
	.. window 5
	.. window 6
	.. window 7
	.. window 8
	.. window 9
.. permutation 2
	.. window 0
	.. window 1
	.. window 2
	.. window 3
	.. window 4
	.. window 5
	.. window 6
	.. window 7
	.. window 8
	.. window 9
.. permutation 3
	.. window 0
	.. window 1
	.. window 2
	.. window 3
	.. window 4
	.. window 5
	.. window 6
	.. window 7
	.. window 8
	.. window 9
.. permutation 4
	.. window 0
	.. window 1
	.. window 2
	.. window 3
	.. window 4
	.. window 5
	.. window 6
	.. window 7
	.. window 8
	.. window 9
.. permutation 5
	.. window 0
	.. window 1
	.. window 2
	.. window 3
	.. window 4
	.. window 5
	.. window 6
	.. window 7
	.. window 8
	.. window 9
.. permutation 6
	.. window 0
	.. window 1
	.. window 2
	.. window 3
	.. window 4
	.. window 5
	.. window 6
	.. window 7
	.. window 8
	.. window 9
.. permutation 7
	.. window 0
	.. window 1
	.. window 2
	.. window 3
	.. window 4
	.. window 5
	.. window 6
	.. window 7
	.. window 8
	.. window 9
.. permutation 8
	.. window 0
	.. window 1
	.. window 2
	.. window 3
	.. window 4
	.. window 5
	.. window 6
	.. window 7
	.. window 8
	.. window 9
.. permutation 9
	.. window 0
	.. window 1
	.. window 2
	.. window 3
	.. window 4
	.. window 5
	.. window 6
	.. window 7
	.. window 8
	.. window 9

In [10]:
# vectorize outputs
for key in RV.keys():   RV[key] = SP.array(RV[key])
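
The expected shapes of the vectorized arrays, inferred from how RV was filled above (a sketch for orientation):

In [ ]:
# expected shapes, inferred from how RV was filled in the scan and permutation cells
print(RV['llr_mtSet'].shape)               # (n_windows,)
print(RV['llr_stSet'].shape)               # (n_windows, P): one LLR per trait
print(RV['permutation_llr_mtSet'].shape)   # (n_permutations * n_windows,)
print(RV['permutation_llr_stSet'].shape)   # (n_permutations * n_windows, P)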

Calculate p-values

P-values are obtained by fitting a mixture of chi-squared distributions (the Chi2mixture class) to the permutation LLRs and evaluating its survival function at the observed LLRs. For accurate estimation of p-values, either the number of windows or the number of permutations should be increased.


In [11]:
c2m = C2M.Chi2mixture(tol=4e-3)

# obtain p-values for mtSet
c2m.estimate_chi2mixture(RV['permutation_llr_mtSet'])
RV['pv_mtSet'] = c2m.sf(RV['llr_mtSet'])
RV['permutation_pv_mtSet'] = c2m.sf(RV['permutation_llr_mtSet'])

# obtain p-values for stSet
RV['pv_stSet'] = SP.zeros_like(RV['llr_stSet'])
RV['permutation_pv_stSet'] = SP.zeros_like(RV['permutation_llr_stSet'])
for p in range(phenotype.shape[1]):
    c2m.estimate_chi2mixture(RV['permutation_llr_stSet'][:,p])
    RV['pv_stSet'][:,p] = c2m.sf(RV['llr_stSet'][:,p])
    RV['permutation_pv_stSet'][:,p] = c2m.sf(RV['permutation_llr_stSet'][:,p])

# obtain p-values for mtSet1VC
c2m.estimate_chi2mixture(RV['permutation_llr_mtSet1VC'])
RV['pv_mtSet1VC'] = c2m.sf(RV['llr_mtSet1VC'])
RV['permutation_pv_mtSet1VC'] = c2m.sf(RV['permutation_llr_mtSet1VC'])


/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/numpy/core/_methods.py:59: RuntimeWarning: Mean of empty slice.
  warnings.warn("Mean of empty slice.", RuntimeWarning)
/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/numpy/core/_methods.py:70: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
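
To get a quick visual impression of the results, the p-values can be plotted per window using pylab, which was imported above (an optional sketch; with only 10 windows and 10 permutations some p-values may be degenerate):

In [ ]:
# optional: -log10 p-values of the three tests across the scanned windows (sketch)
x = SP.arange(RV['pv_mtSet'].shape[0])
PL.plot(x, -SP.log10(RV['pv_mtSet']), 'o-', label='mtSet')
PL.plot(x, -SP.log10(RV['pv_stSet'].min(1)), 'o-', label='stSet (best trait)')  # min over traits, not multiple-testing corrected
PL.plot(x, -SP.log10(RV['pv_mtSet1VC']), 'o-', label='mtSet1VC')
PL.xlabel('window index')
PL.ylabel('-log10 p-value')
PL.legend(loc='best')
PL.show()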

Export results


In [12]:
fout = h5py.File(files['out_file'],'w')
smartDumpDictHdf5(RV,fout)
fout.close()
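
The exported file can later be read back with h5py, for example as below (a minimal sketch, assuming smartDumpDictHdf5 writes one dataset per key of RV, which is what its name suggests):

In [ ]:
# minimal sketch: read the exported results back
fin = h5py.File(files['out_file'], 'r')
print(sorted(fin.keys()))          # dataset names written by smartDumpDictHdf5 (assumed: one per RV key)
pv_mtSet = fin['pv_mtSet'][:]
fin.close()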
