In [1]:
import NotebookImport
from Imports import *  # shared notebook setup used below: pandas as pd, os, OUT_PATH, ...
In [2]:
from bs4 import BeautifulSoup
from urllib2 import HTTPError
In [3]:
PATH_TO_CACERT = '/cellar/users/agross/cacert.pem'
In [4]:
out_path = OUT_PATH + '/MAFs_new_2/'
if not os.path.isdir(out_path):
    os.makedirs(out_path)
In [5]:
maf_dashboard = 'https://confluence.broadinstitute.org/display/GDAC/MAF+Dashboard'
In [6]:
!curl --cacert $PATH_TO_CACERT $maf_dashboard -o tmp.html
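The dashboard sits behind the Broad's SSL certificate, which is why curl needs the --cacert flag. A pure-Python alternative (a sketch, assuming the requests package is available in this environment) would be:

In [ ]:
# Hypothetical alternative to shelling out to curl; not part of the
# original run. verify= points requests at the same CA bundle.
import requests
html = requests.get(maf_dashboard, verify=PATH_TO_CACERT)
open('tmp.html', 'wb').write(html.content)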
In [7]:
# Parse the cached dashboard page (explicit parser keeps bs4 from
# guessing one).
f = open('tmp.html', 'rb').read()
soup = BeautifulSoup(f, 'html.parser')
In [8]:
# Collect every link on the dashboard that points at a .maf file.
r = [l.get('href') for l in soup.find_all('a')
     if l.get('href') is not None
     and '.maf' in l.get('href')]
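Before downloading anything, a quick look at what was scraped (a check that is not in the original run) catches an empty or duplicated link list early:

In [ ]:
# How many MAF links were found, and are any repeated on the page?
print len(r), len(set(r))
print r[:3]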
In [11]:
# Prototype the comment-sniffing trick on the first scraped MAF: a
# separator that never matches makes read_table return each raw line
# as a single field, so leading '#' comment lines are easy to spot.
t = pd.read_table(r[0], nrows=10, sep='not_real_term', header=None,
                  squeeze=True, engine='python')
In [54]:
# Minimal set of MAF columns needed downstream.
cols = ['Hugo_Symbol', 'NCBI_Build', 'Chromosome', 'Start_position',
        'End_position', 'Strand', 'Reference_Allele',
        'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2',
        'Tumor_Sample_Barcode', 'Protein_Change',
        'Variant_Classification', 'Variant_Type']
In [55]:
maf = {}
for f in r:
    try:
        # Read the first ten lines as a single column (the bogus
        # separator never matches) to locate leading '#' comment
        # lines that must be skipped.
        t = pd.read_table(f, nrows=10, sep='not_real_term', header=None,
                          squeeze=True, engine='python')
        skip = list(t[t.str.startswith('#')].index)
        # Read just the header row to see which of the target columns
        # this particular MAF actually carries.
        h = pd.read_table(f, header=0, index_col=None, skiprows=skip,
                          engine='python', nrows=0)
        cc = list(h.columns.intersection(cols))
        # Now read the full file with the fast C engine, restricted
        # to the columns of interest.
        maf[f] = pd.read_table(f, header=0, index_col=None,
                               skiprows=skip,
                               engine='c',
                               usecols=cc)
    except HTTPError:
        print f
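Any link that fails to download is printed rather than loaded. A quick tally (not in the original run) shows how many MAFs actually made it into the dictionary:

In [ ]:
print '%d of %d MAFs loaded' % (len(maf), len(r))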
In [56]:
m2 = pd.concat(maf)                # stack all MAFs, keyed by source URL
m3 = m2.dropna(axis=1, how='all')  # drop columns no MAF populated
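pd.concat on a dict keys each block by its dict key, so m2 carries a two-level index: the MAF's URL and the row number within that file. A peek at the first entry (a sanity check, not in the original run) confirms the structure:

In [ ]:
# Level 0 is the source URL, level 1 the row within that file.
m2.index[0]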
In [57]:
m4 = m3[cols]
m4 = m4.reset_index()
# The same call can appear in multiple MAFs; keep one record per
# (gene, sample, position).
m4 = m4.drop_duplicates(subset=['Hugo_Symbol', 'Tumor_Sample_Barcode',
                                'Start_position'])
m4 = m4.reset_index()
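The size of the reduction from deduplication (not checked in the original run) can be read off directly:

In [ ]:
print '%d rows before dedup, %d after' % (len(m3), len(m4))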
In [58]:
m4.to_csv(out_path + 'mega_maf.csv')
In [59]:
m5 = m4[m4.Variant_Classification != 'Silent']  # non-silent mutations only
cc = m5.groupby(['Hugo_Symbol', 'Tumor_Sample_Barcode']).size()
cc = cc.reset_index()
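cc is a long-format table of non-silent mutation counts per (gene, sample) pair; groupby(...).size() leaves the count in a column labeled 0 after reset_index. If a gene-by-sample matrix is more convenient downstream, a pivot (a sketch, not part of the original run) produces one:

In [ ]:
# Pivot the long-format counts into a gene x sample matrix; values=0
# refers to the unnamed count column produced by size().
mat = cc.pivot(index='Hugo_Symbol', columns='Tumor_Sample_Barcode',
               values=0).fillna(0)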
In [60]:
cc.to_csv(out_path + 'meta.csv')
In [62]:
cc.shape