In [1]:
import os
import numpy as np
import theano
import re


Using gpu device 0: GeForce GTX 1070 (CNMeM is enabled with initial size: 75.0% of memory, cuDNN 5005)

In [2]:
# Show the kernel's working directory: the relative '../data/...' paths used
# in the cells below are resolved against it.
os.getcwd()


Out[2]:
'/home/gluca/SoftwareProjects/Thesis/MDBN/notebooks'

In [3]:
# Tab-separated input table consumed by the parsing cell below
# (column 0: TCGA barcode, column 1: gene name or 'sample', column 2: numeric value).
# NOTE(review): `file` shadows the Python 2 builtin of the same name; the name is
# kept because the parsing cell reads it.
file='../data/AML/Patient_MutatedGenes_somatic.NoNorm.txt'

In [4]:
# Parse the long-format table in `file` into a nested mapping
#     patient[patient_id][label] = float(value)
# where patient_id is the leading 'TCGA-xx-nnnn' part of the barcode in column 0.
# Rows whose second column is 'sample' only register the patient (no value stored),
# so a patient can end up with an empty inner dict.
patient = dict()
p = re.compile('TCGA-[^-]+-[0-9]+')
with open(file) as f:
    f.readline()  # discard the header row
    # iter(f.readline, '') yields lines until readline() returns '' at end of file;
    # numbering starts at 2 because line 1 was the header.
    for line_no, line in enumerate(iter(f.readline, ''), 2):
        record = line.split('\t')
        if record[0] == '':
            # A row with an empty first column ends the table (unchanged behaviour).
            break
        if not line.strip():
            # Blank line (e.g. trailing newline at end of file): previously this
            # crashed with AttributeError on m.group(); now it is skipped.
            continue
        m = p.match(record[0])
        if m is None:
            # Fail loudly, with context, instead of an opaque AttributeError.
            raise ValueError('line %d: %r does not start with a TCGA patient barcode'
                             % (line_no, record[0]))
        patient_id = m.group()
        if patient_id not in patient:
            patient[patient_id] = dict()
        if len(record) < 2:
            raise ValueError('line %d: expected at least 2 tab-separated columns, got %d'
                             % (line_no, len(record)))
        if record[1] != 'sample':
            if len(record) < 3:
                raise ValueError('line %d: missing value column for %r'
                                 % (line_no, record[1]))
            # float() tolerates the trailing newline left on the last column.
            patient[patient_id][record[1]] = float(record[2])

Number of patients


In [5]:
# One top-level key per distinct patient id, so the dict's length is the patient count.
len(patient)


Out[5]:
197

Number of genes


In [6]:
# Collect every distinct inner key (gene) found across all patients, keeping the
# order in which they are first encountered while iterating `patient`.
# A companion set provides O(1) membership tests; checking `gene in genes` on the
# growing list made this cell quadratic in the number of genes.
genes = []
seen_genes = set()
# .values() works on both Python 2 and 3 (unlike .iteritems()), and the patient
# key was never used here.
for mutations in patient.values():
    for gene in mutations:
        if gene not in seen_genes:
            seen_genes.add(gene)
            genes.append(gene)
print(len(genes))


1890

In [7]:
# Load the ordered patient ids from pat_id.txt: one id per line after a header row.
# Reading stops at end of file or at the first line that is empty once stripped.
patient_list = []
with open('../data/AML/pat_id.txt','r') as f:
    f.readline()  # header row is not an id
    # iter(callable, sentinel) keeps calling f.readline() until it returns ''.
    for raw_line in iter(f.readline, ''):
        entry = raw_line.strip()
        if not entry:
            break
        patient_list.append(entry)

In [8]:
# Destination of the gene-by-patient table written by the next cell.
# Note: the content is tab-separated even though the extension is .csv.
file2 = '../data/AML/AML_somatic_mutations_table2.csv'

In [9]:
# Write the gene-by-patient matrix to `file2` as tab-separated text:
#   header row : 'ID' followed by every id in patient_list
#   data rows  : gene name followed by one value per patient, in patient_list order;
#                "0.0" is written when the patient has no entry for that gene.
# A patient id missing from `patient` still raises KeyError, so an id mismatch
# between pat_id.txt and the mutation file is not silently turned into zeros.
with open(file2,'w') as out:
    out.write('\t'.join(['ID'] + patient_list))
    out.write('\n')
    for gene in genes:
        row = [gene]
        for patient_id in patient_list:
            mutations = patient[patient_id]
            if gene in mutations:
                row.append(str(mutations[gene]))
            else:
                row.append("0.0")
        out.write('\t'.join(row))
        out.write('\n')
    # No explicit out.close(): the with-statement closes the file on exit.

In [ ]: