In [3]:
%matplotlib inline
import matplotlib as mpl
mpl.use('agg')
import sys;sys.path.insert(1,'/home/arya/workspace/bio/')
import os
import numpy as np
import matplotlib.pyplot as plt
import sys,os
path='/'.join(os.getcwd().split('/')[:-4])
sys.path.insert(1,path)
import Utils.Util as utl
import Utils.Plots as pplt
import pandas as pd
pd.options.display.max_rows = 20;
pd.options.display.expand_frame_repr = True
from IPython.display import display
import seaborn as sns
import Scripts.KyrgysHAPH.Util as kutl
import Scripts.KyrgysHAPH.Plot as kplt
import Scripts.HLI.Kyrgyz.IBSScan.IBDScan as ibd
import Scripts.HLI.Kyrgyz.PBS as pbs
from glob import glob
pd.options.display.max_colwidth = 2000;
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 100 
path='/home/arya/storage/Data/Human/scan/selscan/' 
pathb='/home/arya/storage/Data/Human/CDTS/percentile/quantile/hg19/'
a=pd.read_pickle(path+'EUR.df')[('ihs','EUR','NA')].dropna().rename('EU')
a=((a.rank()/a.size) *100).apply(np.ceil).astype(int) 
one= lambda x: pd.read_csv(pathb+x,sep='\t',header=None,names=['CHROM','start','end']).applymap(utl.INT)
pad=1000;I.start-=pad;I.end+=pad
I=pd.concat(map(one,['AFR.low.EU.high.bed','AFR.low.EU.high.bed']),keys=[0,1]);I['len']=I.end-I.start
SFS=pd.read_pickle('/home/arya/Kyrgyz/scan/SFS/XP0.df').loc[50]
load=lambda x: pd.read_pickle(path+'{}.df'.format(x)).apply(lambda a: ((a.dropna().rank()/a.dropna().size) *100).apply(np.ceil).astype(int) )
d=SFS.xs('D',0,2).apply(lambda a: ((a.rank()/a.size) *100).apply(np.ceil).astype(int) )
eu=load('EUR')
af=load('AFR')

In [15]:
SFS.xs('m',0,2)[['AFR','EUR']]


Out[15]:
AFR EUR
CHROM POS
1 25000 70.0 30.0
35000 123.0 66.0
45000 129.0 63.0
55000 127.0 63.0
65000 123.0 61.0
75000 103.0 54.0
85000 50.0 18.0
95000 0.0 0.0
105000 0.0 0.0
115000 0.0 0.0
... ... ... ...
22 51125000 709.0 374.0
51135000 712.0 384.0
51145000 737.0 404.0
51155000 749.0 406.0
51165000 801.0 425.0
51175000 778.0 459.0
51185000 750.0 416.0
51195000 803.0 431.0
51205000 673.0 364.0
51215000 491.0 262.0

279260 rows × 2 columns


In [14]:
(SFS.xs('m',0,2)[['AFR','EUR']].rank()/SFS.shape[0]*100).apply(np.ceil).astype(int) 
# .drop_duplicates().plot.scatter(x='AFR',y='EUR')


Out[14]:
AFR EUR
CHROM POS
1 25000 2 1
35000 2 2
45000 2 2
55000 2 2
65000 2 2
75000 2 2
85000 1 1
95000 1 1
105000 1 1
115000 1 1
... ... ... ...
22 51125000 10 8
51135000 10 9
51145000 12 10
51155000 12 11
51165000 14 12
51175000 13 14
51185000 12 11
51195000 14 12
51205000 8 7
51215000 2 2

279260 rows × 2 columns


In [10]:
SFS.xs('D',0,2)[['AFR','EUR']].drop_duplicates().plot.scatter(x='AFR',y='EUR')


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa428a12ed0>

In [236]:
for stat in SFS.index.levels[2]:
    break
    d=SFS.xs(stat,0,2)[['AFR','EUR']].dropna().apply(lambda a: ((a.dropna().rank()/a.dropna().size) *100).apply(np.ceil).astype(int) )
    x=pd.concat([d.loc[i.CHROM].loc[utl.roundto(i.start-5000,10000)+5000].rename(x) for x,i in I.iterrows()],1).T
    d=d.applymap(lambda x: utl.roundto(x,5));d['a']=1
    d=d.set_index(['AFR','EUR']).groupby(level=[0,1]).sum().unstack('AFR').sort_index(ascending=False)['a']
    fig,ax=plt.subplots(1,2,figsize=(12,5))
    d[d==d.max().max()]=None
    sns.heatmap(d.applymap(np.log),ax=ax[0])
    x.loc[0].plot.scatter(x='AFR',y='EUR',c='r',ax=ax[1],label='AFR=1, EUR=100 (CDTS %ile)');
    x.loc[1].plot.scatter(x='AFR',y='EUR',c='b',ax=ax[1],label='AFR=100, EUR=1');
    plt.xlim([0,102]);plt.ylim([0,102])
    plt.suptitle(stat)



In [38]:
a=pd.read_csv('/home/arya/a.tsv',sep='\t',header=None,names=['CHROM','POS','CDTS']).set_index(['CHROM','POS']).CDTS
b=utl.scanGenome(a,winSize=1000,nsteps=5)
pplt.Manhattan(a);
pplt.Manhattan(b);


Chromosome 1

In [41]:
c=utl.scanGenome(a,winSize=2000,nsteps=5)
pplt.Manhattan(c);


Chromosome 1