In [14]:
%matplotlib inline
# --- Notebook setup: imports, project path, pandas display options ---
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import sys,os
# Take the current working directory with its last four path components
# removed and put it at position 1 of sys.path. Presumably this is the
# project root that contains the `Utils` package imported just below --
# TODO confirm; it only works when the notebook sits exactly four levels deep.
path='/'.join(os.getcwd().split('/')[:-4])
sys.path.insert(1,path)
# Project-local helpers; these imports must come after the sys.path tweak above.
import Utils.Util as utl
import Utils.Plots as pplt
import pandas as pd
# Display options: row limit for printed frames and expand_frame_repr switch.
pd.options.display.max_rows = 20;
pd.options.display.expand_frame_repr = True
from IPython.display import display
# NOTE(review): this rebinds `plt`, which was bound to matplotlib.pyplot above;
# from here on `plt` refers to the pylab module.
import pylab as plt
import seaborn as sns

In [60]:
# Re-import the plotting helpers and reload them so edits made to the module
# since it was first imported are picked up.
import Utils.Plots as pplt
reload(pplt)

# Phenotype sheet indexed by CMS status (column renamed to 'y'), restricted to
# the columns at positions 3 and 4 once 'y' has become the index
# (presumably Hct and CMS score, going by the column order in Out[59] -- TODO confirm).
a = (
    pd.read_excel('~/storage/Data/Human/Andes/info/FROM_Haddad_original.xlsx')
    .rename(columns={'CMS/Non-CMS': 'y'})
    .set_index('y')
    .iloc[:, [3, 4]]
)
a  # mid-cell expression; only the cell's last expression becomes its Out[] value

# Pairwise scatter/histogram grid of the selected columns, coloured by status.
sns.pairplot(a.reset_index(), hue='y', size=5)

# Separate figure: number of samples in each status group.
plt.figure()
a.groupby('y').size().plot(kind='bar')


Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f71ab94a610>

In [ ]:


In [97]:
# Build two tab-separated files for the Andean cohort:
#   * panel.filtered -- one row per retained sample: sample, pop, super_pop, gender
#   * filter         -- the `sample` values of the rows rejected by the Hct rule
# A row is retained when its CMS label agrees with its haematocrit:
# CMS with Hct above the cutoff, or Non-CMS with Hct below it. Rows sitting
# exactly on the cutoff satisfy neither condition and are rejected.

ANDES_INFO_DIR = '~/storage/Data/Human/Andes/info/'
ANDES_VCF_DIR = '/home/arya/storage/Data/Human/Andes/Andean_HLI_BAM_VCF/hg19/snp/noINFO/byChr/norm/merge/PASS/noChr/'


def f(x, hct_cutoff=60):
    """Return a boolean mask of rows whose CMS label is consistent with Hct.

    Parameters
    ----------
    x : DataFrame with a 'CMS/Non-CMS' column and an 'Hct' column.
    hct_cutoff : haematocrit threshold; defaults to 60, the value that was
        previously hard-coded twice in the expression.

    Returns
    -------
    Boolean Series aligned with ``x``: True for CMS rows with Hct strictly
    above the cutoff and for Non-CMS rows with Hct strictly below it.
    """
    status = x['CMS/Non-CMS']
    consistent_cms = (status == 'CMS') & (x.Hct > hct_cutoff)
    consistent_non_cms = (status == 'Non-CMS') & (x.Hct < hct_cutoff)
    return consistent_cms | consistent_non_cms


# Mapping from 'Subject ID' to the first remaining column of the sample-info
# sheet, renamed to 'sample'. Only the sheet's columns at positions 1 and 2 are
# used; rows with a missing value in either are dropped and every cell goes
# through utl.INT (project helper, presumably an integer cast -- TODO confirm).
# NOTE(review): `id` shadows the builtin of the same name; the name is kept
# because other cells of this notebook may still refer to it.
id = (
    pd.read_excel(ANDES_INFO_DIR + 'Haddad_sample_info_20151030_emily.xlsx')
    .iloc[:, [1, 2]]
    .dropna()
    .applymap(utl.INT)
    .set_index('Subject ID')
    .iloc[:, 0]
    .rename('sample')
)

# Phenotype sheet indexed by SampleID with the 'sample' column joined on.
b = pd.read_excel(ANDES_INFO_DIR + 'FROM_Haddad_original.xlsx').set_index('SampleID').join(id)
i = f(b)

# Retained rows only. The explicit copy makes the column assignments below
# operate on an independent frame rather than on the result of a chained selection.
b = b[i].reset_index()[['SampleID', 'CMS/Non-CMS', 'Gender', 'sample']].copy()
b['super_pop'] = 'AND'
b['pop'] = b['CMS/Non-CMS'].replace({'CMS': 'SIK', 'Non-CMS': 'HLT'})
b['gender'] = b.Gender.replace({'M': 'male', 'F': 'female'})
b = b[['sample', 'pop', 'super_pop', 'gender']]
b.to_csv(ANDES_VCF_DIR + 'panel.filtered', index=False, sep='\t')

# `sample` values of the rejected rows, one per line. header=False is spelled
# out because the default for Series.to_csv differs between pandas versions;
# without it a newer pandas would emit a leading 'sample' line into the list.
id.loc[i[~i].index].to_csv(ANDES_VCF_DIR + 'filter', index=False, sep='\t', header=False)

In [86]:



Out[86]:
SampleID
1CP-A      187524478
48CP-A     187524432
110CP-A    187524404
15CP-G     187524388
Name: Sample ID, dtype: int64

In [59]:
# Mask of phenotype-consistent rows of `a` (which is indexed by CMS status):
# CMS with Hct above 60, or Non-CMS with Hct below 60.
I = (
    ((a.index == 'CMS') & (a.Hct > 60))
    | ((a.index == 'Non-CMS') & (a.Hct < 60))
)

# Pair plot restricted to the consistent rows, coloured by status.
sns.pairplot(a[I].reset_index(), hue='y', size=5)
# print I.sum()

# Number of rejected rows per status group. The value is not bound to a name
# and, not being the cell's last expression, is not shown either.
(~I).groupby(level=0).sum()
a[I]

# a.loc[['non-CMS']].Hct<60
# Cell output: the rows that pass the filter.
a[I]


Out[59]:
SampleID Gender Age Hct CMS score Notes
y
Non-CMS 3CP-A M 44 54.0 7 NaN
Non-CMS 4CP-A M 40 53.0 4 NaN
Non-CMS 5CP-A M 33 50.0 8 NaN
CMS 8CP-A M 58 69.0 21 NaN
CMS 9CP-A M 25 64.0 18 NaN
CMS 10CP-A M 55 67.0 16 NaN
CMS 11CP-A M 68 66.0 16 NaN
Non-CMS 12CP-A M 26 55.0 4 NaN
Non-CMS 13CP-A M 44 50.5 8 NaN
CMS 15CP-A M 46 75.0 18 NaN
... ... ... ... ... ... ...
Non-CMS 39CP-G M 30 52.0 2 NaN
CMS 41CP-G M 43 62.0 10 NaN
Non-CMS 50CP-G F 45 48.0 4 NaN
CMS 52CP-G M 64 82.5 14 NaN
CMS 69CP-G M 40 68.0 9 NaN
Non-CMS 80CP-G M 36 49.0 1 NaN
Non-CMS 84CP-G M 32 53.0 0 NaN
Non-CMS 89CP-G F 45 43.0 4 NaN
Non-CMS 94CP-G M 26 54.0 4 NaN
CMS 101CP-G M 68 66.0 16 NaN

96 rows × 6 columns