In [110]:
%matplotlib inline
import matplotlib as mpl
mpl.use('agg')
import os
# os.environ["DISPLAY"] = "localhost:11.0"
import numpy as np
import matplotlib.pyplot as plt
import sys,os
path='/'.join(os.getcwd().split('/')[:-4])
sys.path.insert(1,path)
import Utils.Util as utl
import Utils.Plots as pplt
import pandas as pd
pd.options.display.max_rows = 20;
pd.options.display.expand_frame_repr = True
from IPython.display import display
import pylab as plt
import seaborn as sns
import Scripts.KyrgysHAPH.Util as kutl
import Scripts.KyrgysHAPH.Plot as kplt
import Scripts.HLI.Kyrgyz.IBSScan.IBDScan as ibd
import Scripts.HLI.Kyrgyz.PBS as pbs
pd.options.display.max_colwidth = 2000;
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 100
sup=['AFR','EUR','EAS','SAS']
sub=['KGZ']+utl.VCF.loadPanel().set_index('super_pop').loc[['EAS','SAS']]['pop'].unique().tolist()[:-2]
eas=utl.VCF.loadPanel().set_index('super_pop').loc[['EAS']]['pop'].unique().tolist() +['KGZ'] +['YRI']
supk=['KGZ']+sup

In [108]:
A=pd.read_pickle('/home/arya/POP/KGZU+ALL/chr2.df').reset_index([2,3,4],drop=True)

In [111]:
sub


Out[111]:
['KGZ', 'CHS', 'CDX', 'KHV', 'CHB', 'JPT', 'PJL', 'BEB', 'STU']

In [109]:
alpha=0.
# a=A[supk][utl.polymorphix(A[sup].mean(1),1e-2/3,True)].T
# a=A[supk][A.KGZ>0.1].T
# a=A[supk][(A.AFR>0.1)&(A.EUR>0.1)&(A.EAS>0.1)&(A.SAS>0.1)&(A.KGZ>0.1)].T
a=A[eas][(A.EAS>alpha)&(A.SAS>alpha)&(A.KGZ>alpha)].T
c=pd.Series(sns.color_palette("hls",a.index.unique().size ), index= a.index.unique())
c['YRI']=(0,0,0)
plt.figure();ax=plt.gca()
for i in a.index:
    utl.pcaX(a).loc[[i]].plot.scatter(x=0,y=1,label=i,ax=ax,c=c[i],s=100)



In [ ]:
path='/home/arya/POP/{}/'.format(pop)
f='noSingleton/chr{}.vcf.gz.eigenvec'.format(CHROM)

panel=pd.read_csv(path+'panel',sep='\t').set_index('sample').iloc[:,:2]
# panel=pd.read_csv('/home/arya/Kyrgyz/info/kyrgyz.panel',sep='\t').set_index('sample')['pop']
# print panel
a=pd.read_csv(path+f,sep=' ',header=None).set_index(0).join(panel).set_index(['super_pop'])#.reorder_levels([1,0]).sort_index()
c=pd.Series(sns.color_palette("hls",a.index.unique().size ), index= a.index.unique())
plt.figure(figsize=(6,6),dpi=150);ax=plt.gca()
a.groupby(level=0).apply(lambda x: x.plot.scatter(x=2,y=3,label=x.name,ax=ax,c=c[x.name]));

In [23]:
pop,CHROM='HA',22
path='/home/arya/POP/{}/'.format(pop)
f='noSingleton/chr{}.vcf.gz.eigenvec'.format(CHROM)

panel=pd.read_csv(path+'panel',sep='\t').set_index('sample').iloc[:,:2]
# panel=pd.read_csv('/home/arya/Kyrgyz/info/kyrgyz.panel',sep='\t').set_index('sample')['pop']
# print panel
a=pd.read_csv(path+f,sep=' ',header=None).set_index(0).join(panel).set_index(['super_pop'])#.reorder_levels([1,0]).sort_index()
c=pd.Series(sns.color_palette("hls",a.index.unique().size ), index= a.index.unique())
plt.figure(figsize=(6,6),dpi=150);ax=plt.gca()
a.groupby(level=0).apply(lambda x: x.plot.scatter(x=2,y=3,label=x.name,ax=ax,c=c[x.name]));



In [58]:
pop,CHROM='KGZU',22
path='/home/arya/POP/{}/'.format(pop)
f='chr{}.vcf.gz.eigenvec'.format(CHROM)

panel=pd.read_csv(path+'panel',sep='\t').set_index('sample')
panel=pd.read_csv('/home/arya/Kyrgyz/info/kyrgyz.panel',sep='\t').set_index('sample')['pop']
a=pd.read_csv(path+f,sep=' ',header=None).set_index(0).join(panel).set_index(['pop'])#.reorder_levels([1,0]).sort_index()
# a=a.loc[['AFR','KGZ','EUR','EAS','SAS']]
c=pd.Series(sns.color_palette("hls",a.index.unique().size ), index= a.index.unique())

fig,ax=plt.subplots(1,1,figsize=(6,5),dpi=150)
x,y=2,3
# for ax ,(x,y) in zip(axes,[(2,3),(2,4),(3,4)][:1]):
a.groupby(level=0).apply(lambda X: X.plot.scatter(x=x,y=y,label=X.name,ax=ax,c=c[X.name],alpha=0.995));ax.set_xlabel('PC'+str(x-1));ax.set_ylabel('PC'+str(y-1))


Out[58]:
<matplotlib.text.Text at 0x7f9abf50fe50>

In [52]:
pop,CHROM='KGZU+ALL',2
path='/home/arya/POP/{}/'.format(pop)
f='chr{}.vcf.gz.eigenvec'.format(CHROM)

panel=pd.read_csv(path+'panel',sep='\t').set_index('sample').iloc[:,:2]
a=pd.read_csv(path+f,sep=' ',header=None).set_index(0).join(panel).replace({'super_pop':{'Healthy':'KGZ','Sick':'KGZ'}}).set_index(['super_pop'])#.reorder_levels([1,0]).sort_index()
a=a.loc[['AFR','KGZ','EUR','EAS','SAS']]
c=pd.Series(sns.color_palette("hls",a.index.unique().size ), index= a.index.unique())

fig,ax=plt.subplots(1,1,figsize=(6,5),dpi=150)
x,y=2,3
# for ax ,(x,y) in zip(axes,[(2,3),(2,4),(3,4)][:1]):
a.groupby(level=0).apply(lambda X: X.plot.scatter(x=x,y=y,label=X.name,ax=ax,c=c[X.name],alpha=0.995));ax.set_xlabel('PC'+str(x-1));ax.set_ylabel('PC'+str(y-1))


Out[52]:
<matplotlib.text.Text at 0x7f9abe802fd0>

In [79]:
pop,CHROM='HA',22
path='/home/arya/POP/{}/'.format(pop)
f='beagle/noSingleton/chr{}.vcf.gz.eigenvec'.format(CHROM)

panel=pd.read_csv(path+'panel',sep='\t').set_index('sample').iloc[:,:2]
# panel=pd.read_csv('/home/arya/Kyrgyz/info/kyrgyz.panel',sep='\t').set_index('sample')['pop']
# print panel
a=pd.read_csv(path+f,sep=' ',header=None).set_index(0).join(panel).set_index(['super_pop'])#.reorder_levels([1,0]).sort_index()
c=pd.Series(sns.color_palette("hls",a.index.unique().size ), index= a.index.unique())

fig,axes=plt.subplots(1,3,figsize=(18,5),dpi=150)
x,y=2,3
for ax ,(x,y) in zip(axes,[(2,3),(2,4),(3,4)]):
    a.groupby(level=0).apply(lambda X: X.plot.scatter(x=x,y=y,label=X.name,ax=ax,c=c[X.name],alpha=0.995));ax.set_xlabel('PC'+str(x-1));ax.set_ylabel('PC'+str(y-1))



In [68]:
pop,CHROM='HA',22
path='/home/arya/POP/{}/'.format(pop)
f='beagle/noSingleton/filtered/chr{}.vcf.gz.eigenvec'.format(CHROM)

panel=pd.read_csv(path+'panel',sep='\t').set_index('sample').iloc[:,:2]
# panel=pd.read_csv('/home/arya/Kyrgyz/info/kyrgyz.panel',sep='\t').set_index('sample')['pop']
# print panel
a=pd.read_csv(path+f,sep=' ',header=None).set_index(0).join(panel).set_index(['super_pop'])#.reorder_levels([1,0]).sort_index()
c=pd.Series(sns.color_palette("hls",a.index.unique().size ), index= a.index.unique())

fig,axes=plt.subplots(1,3,figsize=(18,5))
x,y=2,3
for ax ,(x,y) in zip(axes,[(2,3),(2,4),(3,4)]):
    a.groupby(level=0).apply(lambda X: X.plot.scatter(x=x,y=y,label=X.name,ax=ax,c=c[X.name],alpha=0.995));ax.set_xlabel('PC'+str(x-1));ax.set_ylabel('PC'+str(y-1))



In [80]:
pop,CHROM='HA',22
path='/home/arya/POP/{}/'.format(pop)
f='beagle/filtered/chr{}.vcf.gz.eigenvec'.format(CHROM)

panel=pd.read_csv(path+'panel',sep='\t').set_index('sample').iloc[:,:2]
# panel=pd.read_csv('/home/arya/Kyrgyz/info/kyrgyz.panel',sep='\t').set_index('sample')['pop']
# print panel
a=pd.read_csv(path+f,sep=' ',header=None).set_index(0).join(panel).set_index(['super_pop'])#.reorder_levels([1,0]).sort_index()
c=pd.Series(sns.color_palette("hls",a.index.unique().size ), index= a.index.unique())

fig,axes=plt.subplots(1,3,figsize=(18,5))
x,y=2,3
for ax ,(x,y) in zip(axes,[(2,3),(2,4),(3,4)]):
    a.groupby(level=0).apply(lambda X: X.plot.scatter(x=x,y=y,label=X.name,ax=ax,c=c[X.name],alpha=0.995));ax.set_xlabel('PC'+str(x-1));ax.set_ylabel('PC'+str(y-1))



In [120]:
path='/home/arya/POP/ASIA/merge/beagle/poly/'
f='chr22.vcf.gz.eigenvec'

panel=pd.read_csv(path+'panel',sep='\t').set_index('sample').iloc[:,:2]
a=pd.read_csv(path+f,sep=' ',header=None).set_index(0).join(panel).replace({'super_pop':{'Healthy':'KGZ','Sick':'KGZ'}})
.set_index(['super_pop'])#.reorder_levels([1,0]).sort_index()
a
# a.iloc[:,1:3].plot.scatter(x=x,y=y)
# a=a.loc[['AFR','KGZ','EUR','EAS','SAS']]
# c=pd.Series(sns.color_palette("hls",a.index.unique().size ), index= a.index.unique())
# a
# fig,ax=plt.subplots(1,1,figsize=(6,5),dpi=100)
# x,y=2,3
# # for ax ,(x,y) in zip(axes,[(2,3),(2,4),(3,4)][:1]):
# a.groupby(level=0).apply(lambda X: X.plot.scatter(x=x,y=y,label=X.name,ax=ax,c=c[X.name],alpha=0.995));ax.set_xlabel('PC'+str(x-1));ax.set_ylabel('PC'+str(y-1))


Out[120]:
1 2 3 4 5 ... 9 10 11 pop super_pop
0
HG01583 HG01583 0.034938 0.004044 0.013499 0.009774 ... 0.001382 0.002442 -0.055146 PJL SAS
HG01586 HG01586 0.034546 -0.003499 -0.006807 -0.009658 ... 0.000915 0.072007 0.001487 PJL SAS
HG01589 HG01589 0.036923 -0.000663 -0.012708 -0.006507 ... 0.013870 -0.024558 -0.055726 PJL SAS
HG01593 HG01593 0.040077 0.001941 0.012841 -0.003185 ... -0.029956 0.090447 0.004412 PJL SAS
HG02490 HG02490 0.037331 0.004338 0.006396 -0.000529 ... 0.070993 0.035487 0.001623 PJL SAS
HG02491 HG02491 0.037626 -0.005489 -0.012311 -0.008756 ... -0.055859 -0.013838 0.045566 PJL SAS
HG02493 HG02493 0.032607 -0.002934 -0.004812 -0.032185 ... 0.041729 0.071905 0.026529 PJL SAS
HG02494 HG02494 0.041190 -0.011722 0.006865 0.020071 ... -0.070616 0.063717 -0.065156 PJL SAS
HG02597 HG02597 0.041561 0.001634 0.014018 -0.001310 ... 0.002021 -0.000336 -0.034500 PJL SAS
HG02600 HG02600 0.031430 -0.000445 0.000535 -0.004597 ... -0.013178 0.010932 -0.009656 PJL SAS
... ... ... ... ... ... ... ... ... ... ... ...
201852639 201852639 0.003777 -0.172121 -0.015910 0.004485 ... -0.031327 0.018996 0.002497 Hyper Sick
201852657 201852657 0.012679 -0.164920 -0.012677 0.028407 ... -0.007508 0.003534 0.002443 Normo Healthy
201852655 201852655 0.014062 -0.214110 0.021282 0.062514 ... 0.290810 -0.089870 0.064078 Normo Healthy
201852656 201852656 0.010484 -0.172662 0.016141 0.028124 ... 0.004354 -0.022937 0.007470 Normo Healthy
201852659 201852659 0.022188 -0.183293 0.046131 0.025816 ... 0.070964 -0.047674 0.018805 Normo Healthy
201852645 201852645 0.002806 -0.162032 -0.013278 0.014561 ... -0.033883 0.005871 -0.033083 Normo Healthy
201852647 201852647 -0.002187 -0.163470 -0.003075 0.015539 ... -0.056372 -0.025414 -0.000933 Normo Healthy
201852644 201852644 0.001132 -0.168232 0.006505 0.002453 ... -0.045676 -0.019957 -0.022607 Normo Healthy
201852646 201852646 0.010547 -0.170871 0.016332 0.017375 ... -0.009529 0.006805 0.006444 Normo Healthy
201852658 201852658 0.004127 -0.165282 -0.001002 0.006015 ... -0.018945 0.024211 -0.013059 Normo Healthy

1026 rows × 13 columns