In [70]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import sys,os
sys.path.insert(1,'/home/arya/workspace/bio')
import Utils.Util as utl
import pandas as pd
pd.options.display.max_rows = 20;
pd.options.display.expand_frame_repr = True
from IPython.display import display
import seaborn as sns

import Utils.Util as utl
import Utils.Plots as pplt
import Scripts.KyrgysHAPH.Plot as kplt
import Scripts.KyrgysHAPH.Util as kutl
def diff(a):
    """Return the 'No-HAPH' column of `a` minus its 'HAPH' column."""
    return a['No-HAPH']-a['HAPH']


def filterGap(a):
    """Run `utl.filterGap2` on every level-0 column group of `a`.

    Columns are grouped by their level-0 label; each group is passed to
    `utl.filterGap2` with `pad` derived from that label. Level 0 of the
    resulting column index is then dropped (done on the transposed frame
    with `reset_index(level=0, drop=True)`, then transposed back).
    """
    def _filter_one_group(x):
        # x.name is the level-0 column label of the current group.
        # NOTE(review): presumably a window size in kb (50, 500), so pad is
        # half a window in bp -- confirm. With an int label, `/` is floor
        # division under Python 2, which this notebook is written for.
        return utl.filterGap2(x,pad=x.name/2*1000)

    per_group = a.groupby(level=0,axis=1).apply(_filter_one_group)
    return per_group.T.reset_index(level=0,drop=True).T

# none=pd.Series(None,index=a.index.levels[3])
# filterSNPN=lambda a: a.groupby(level=0).apply(lambda b: b.loc[b.name].stack(level=[0,1]).reorder_levels([0,1,3,4,2]).groupby(level=range(4)).apply(lambda x: (none,x.loc[x.name])[x.loc[x.name].m>b.name/5]).unstack(level=[2,3]))
# Load the pickled scan table, keep the 'No-HAPH vs HAPH' comparison, take the
# (No-HAPH - HAPH) difference, move index levels 0 and 3 into the columns and
# gap-filter every level-0 column group.
scanXPSFS=filterGap(
    diff(pd.read_pickle(kutl.path+'scan/XP.SFS.df')['No-HAPH vs HAPH'])
    .unstack(level=[0,3])
)
# xp=filterGap(diff(pd.read_pickle(kutl.path+'scan/XP.SFS.df')['No-HAPH vs HAPH']))

# Accumulator that the per-method cells below append their interval tables to.
regions=[]

In [71]:
# Scan on the 'D' statistic for the No-HAPH vs HAPH comparison.
method='D'
# The label previously lacked a separating space (it printed "Dfor No-HAPH vs HAPH").
# A parenthesised single-argument print behaves the same under Python 2 and 3.
print(method + ' for No-HAPH vs HAPH')
# Select the `method` columns (column level 1), take the column labelled 50 and
# negate it, so that the descending sort below ranks the most negative
# differences first.
a=-scanXPSFS.xs(method,level=1,axis=1)[50]
pplt.Manhattan(a,top_k=100);
# Keep the 100 highest scores and turn them into intervals (padding=50000),
# ordered by descending interval score.
o=a.sort_values(ascending=False).iloc[:100]
o=utl.BED.getIntervals(o,padding=50000).sort_values('score',ascending=False).reset_index()
print('Top 100 regions {} intervals'.format(o.shape[0]))
display(o)
# Plot the SFS for the best-scoring interval.
kplt.plotSFS(interval=o.iloc[0])
# NOTE: this appends on every execution, so re-running the cell without
# re-running the setup cell leaves duplicate entries in `regions`.
regions+=[pd.concat([o],keys=[method])]


Dfor No-HAPH vs HAPH
Top 100 regions 35 intervals
CHROM start end score len
0 4 77205000 77395000 104.961 190000
1 2 225105000 225245000 64.789 140000
2 3 164345000 164475000 57.759 130000
3 19 54705000 54835000 56.732 130000
4 18 53895000 54025000 51.109 130000
5 11 39055000 39195000 50.707 140000
6 8 3345000 3465000 49.669 120000
7 14 46295000 46425000 49.320 130000
8 6 33005000 33125000 48.452 120000
9 1 219675000 219805000 48.155 130000
... ... ... ... ... ...
25 1 15125000 15255000 41.940 130000
26 4 132915000 133025000 41.389 110000
27 6 138325000 138435000 41.368 110000
28 4 95295000 95395000 40.256 100000
29 16 79735000 79835000 39.865 100000
30 2 81995000 82095000 39.748 100000
31 4 161025000 161125000 38.372 100000
32 5 66335000 66435000 38.149 100000
33 10 66385000 66485000 38.075 100000
34 7 4005000 4105000 38.022 100000

35 rows × 5 columns

4.0:77205K-77395K   (190Kb)

In [72]:
# Scan on the 'H' statistic for the No-HAPH vs HAPH comparison.
method='H'
# The label previously lacked a separating space (it printed "Hfor No-HAPH vs HAPH").
# A parenthesised single-argument print behaves the same under Python 2 and 3.
print(method + ' for No-HAPH vs HAPH')
# Select the `method` columns (column level 1), take the column labelled 50 and
# negate it, so that the descending sort below ranks the most negative
# differences first.
a=-scanXPSFS.xs(method,level=1,axis=1)[50]
pplt.Manhattan(a,top_k=100);
# Keep the 100 highest scores and turn them into intervals (padding=50000),
# ordered by descending interval score.
o=a.sort_values(ascending=False).iloc[:100]
o=utl.BED.getIntervals(o,padding=50000).sort_values('score',ascending=False).reset_index()
print('Top 100 regions {} intervals'.format(o.shape[0]))
display(o)
# Plot the SFS for the best-scoring interval.
kplt.plotSFS(interval=o.iloc[0])
# NOTE: this appends on every execution, so re-running the cell without
# re-running the setup cell leaves duplicate entries in `regions`.
regions+=[pd.concat([o],keys=[method])]


Hfor No-HAPH vs HAPH
Top 100 regions 35 intervals
CHROM start end score len
0 4 77215000 77395000 208.165 180000
1 2 225105000 225245000 148.129 140000
2 18 53885000 54025000 109.313 140000
3 7 144075000 144205000 107.471 130000
4 6 32995000 33125000 102.155 130000
5 14 46295000 46425000 99.487 130000
6 3 164345000 164475000 97.412 130000
7 11 39055000 39195000 97.073 140000
8 6 30085000 30195000 94.278 110000
9 8 3345000 3455000 93.409 110000
... ... ... ... ... ...
25 2 81995000 82115000 82.969 120000
26 1 15145000 15255000 82.715 110000
27 8 3835000 3935000 82.567 100000
28 6 138325000 138435000 82.419 110000
29 1 183595000 183695000 81.296 100000
30 5 29995000 30105000 79.454 110000
31 4 132915000 133025000 79.179 110000
32 7 4005000 4105000 78.586 100000
33 15 99785000 99885000 77.231 100000
34 18 57775000 57875000 77.125 100000

35 rows × 5 columns

4.0:77215K-77395K   (180Kb)

In [73]:
# Scan on the 'SFSelect' score for the No-HAPH vs HAPH comparison.
method='SFSelect'
# The label previously lacked a separating space (it printed "SFSelectfor No-HAPH vs HAPH").
# A parenthesised single-argument print behaves the same under Python 2 and 3.
print(method + ' for No-HAPH vs HAPH')
# Select the `method` columns (column level 1) and take the column labelled 50.
# Unlike the 'D' and 'H' cells, the values are NOT negated here, so the
# descending sort ranks the largest differences first.
a=scanXPSFS.xs(method,level=1,axis=1)[50]
pplt.Manhattan(a,top_k=100);
# Keep the 100 highest scores and turn them into intervals (padding=50000),
# ordered by descending interval score.
o=a.sort_values(ascending=False).iloc[:100]
o=utl.BED.getIntervals(o,padding=50000).sort_values('score',ascending=False).reset_index()
print('Top 100 regions {} intervals'.format(o.shape[0]))
display(o)
# Plot the SFS for the best-scoring interval.
kplt.plotSFS(interval=o.iloc[0])
# NOTE: this appends on every execution, so re-running the cell without
# re-running the setup cell leaves duplicate entries in `regions`.
regions+=[pd.concat([o],keys=[method])]


SFSelectfor No-HAPH vs HAPH
Top 100 regions 45 intervals
CHROM start end score len
0 21 44075000 44185000 5.491 110000
1 11 55345000 55445000 5.165 100000
2 6 94945000 95045000 5.165 100000
3 X 89435000 89545000 5.165 110000
4 5 69615000 69925000 5.051 310000
5 2 91705000 91805000 4.867 100000
6 5 70845000 71035000 4.867 190000
7 12 9485000 9585000 4.867 100000
8 X 47965000 48065000 4.545 100000
9 X 66825000 66925000 4.545 100000
... ... ... ... ... ...
35 16 30175000 30275000 3.593 100000
36 X 67685000 67785000 3.457 100000
37 X 128175000 128275000 3.422 100000
38 9 64545000 64675000 3.335 130000
39 5 70115000 70215000 3.278 100000
40 21 43065000 43175000 3.237 110000
41 X 141035000 141135000 3.213 100000
42 15 30145000 30245000 3.213 100000
43 X 109855000 109955000 3.208 100000
44 19 42495000 42595000 3.177 100000

45 rows × 5 columns

21:44075K-44185K   (110Kb)

In [78]:
# Display the raw pickled scan table (all comparisons, before differencing and
# gap filtering). Being the last expression of the cell, it is rendered as the
# cell output, truncated by pd.options.display.max_rows.
pd.read_pickle(
    kutl.path+'scan/XP.SFS.df'
)


Out[78]:
No-HAPH vs HAPH No-HAPH vs Sick Healthy vs Sick Normo vs Hyper
HAPH No-HAPH No-HAPH Sick Healthy Sick Hyper Normo
CHROM POS level_1
50 1 2415000 D -0.600442 -0.632207 -0.896802 -0.711543 -0.813691 -0.934368 -0.823001 -0.513089
H 0.489706 0.426176 0.451985 0.466452 0.546429 0.491613 0.385000 0.558529
SFSelect 3.460436 1.682556 2.182970 3.934019 4.262200 4.262200 3.460436 1.904135
m 4.000000 4.000000 5.000000 5.000000 6.000000 6.000000 4.000000 4.000000
pi 0.562500 0.530735 0.556875 0.530000 0.633214 0.555484 0.434808 0.649853
w 1.162942 1.162942 1.453677 1.241543 1.446905 1.489852 1.257809 1.162942
2425000 D -3.108542 -2.685012 -3.163248 -3.038287 -2.338411 -4.226203 -6.872399 -0.314129
H 4.846765 5.482059 5.582647 2.449032 6.333429 2.743226 -0.710769 8.428235
SFSelect 0.178544 0.107247 0.110003 0.253535 0.098821 0.277799 0.791776 0.052623
m 61.000000 61.000000 63.000000 63.000000 69.000000 69.000000 56.000000 56.000000
... ... ... ... ... ... ... ... ... ... ... ...
500 Y 21350000 H NaN NaN NaN NaN 0.000000 0.000000 0.000000 0.000000
m NaN NaN NaN NaN 0.000000 0.000000 0.000000 0.000000
pi NaN NaN NaN NaN 0.000000 0.000000 0.000000 0.000000
w NaN NaN NaN NaN 0.000000 0.000000 0.000000 0.000000
21450000 D NaN NaN NaN NaN -0.241230 -0.445650 -0.575731 -0.158603
H NaN NaN NaN NaN 0.225000 0.050323 0.052500 0.369265
SFSelect NaN NaN NaN NaN 5.599240 5.599240 5.599240 1.054166
m NaN NaN NaN NaN 2.000000 2.000000 2.000000 2.000000
pi NaN NaN NaN NaN 0.241071 0.050968 0.053173 0.422868
w NaN NaN NaN NaN 0.482302 0.496617 0.628904 0.581471

3265383 rows × 8 columns