In [1]:
import os
import numpy as np
import theano
import re


Using gpu device 0: GeForce GTX 1070 (CNMeM is enabled with initial size: 75.0% of memory, cuDNN 5005)

In [2]:
# Display the current working directory: the relative '../data/...' paths used
# by this notebook are resolved against it.
os.getcwd()


Out[2]:
'/home/gluca/SoftwareProjects/Thesis/MDBN/notebooks'

In [3]:
# Path of the input CSV that is opened and parsed line by line further down.
# NOTE(review): 'espression' is presumably the actual on-disk spelling — confirm
# before "correcting" it.
# NOTE(review): the name `file` shadows the Python 2 built-in `file`; renaming it
# requires the matching change where the path is opened.
file='../data/AML/AML_gene_espression2.csv'

In [4]:
# Build `patient`: patient ID -> {name in column 2 -> float value in column 3}
# from the comma-separated input file.
#
# Each data row is expected to look like  <barcode>,<name>,<value>  where the
# barcode starts with a TCGA patient ID (e.g. 'TCGA-AB-2803'); anything after
# that prefix is ignored, so several barcodes can map to the same patient.
# Rows whose second field is the literal 'sample' register the patient but
# store no value.
patient = dict()
p = re.compile('TCGA-[^-]+-[0-9]+')
with open(file) as f:
    next(f, None)  # skip the header row (no-op on an empty file)
    # Data rows start on physical line 2; the counter is only used in errors.
    for line_number, line in enumerate(f, 2):
        if not line.strip():
            # Blank line (typically a trailing newline): nothing to parse.
            continue
        record = line.split(',')
        if record[0] == '':
            # Row without a barcode: ignore it instead of treating it as
            # end-of-file, so that any valid rows after it are still read.
            continue
        m = p.match(record[0])
        if m is None:
            # Fail with the offending location rather than an AttributeError
            # on `None.group()`.
            raise ValueError('%s, line %d: unrecognised patient barcode %r'
                             % (file, line_number, record[0]))
        patient_id = m.group()
        if patient_id not in patient:
            patient[patient_id] = dict()
        if record[1] != 'sample':
            # float() tolerates the trailing newline left on the last field.
            patient[patient_id][record[1]] = float(record[2])

Number of patients


In [5]:
# One dictionary entry per distinct patient ID, so the dict size is the count.
len(patient)


Out[5]:
173

Number of genes


In [6]:
# Take the gene names from a single patient's record and report how many
# there are. `genes` is reused later as the row order of the output table.
# NOTE(review): this assumes every patient carries the same set of genes —
# nothing here verifies it.
# `next(iter(...))` picks the first record without the Python-2-only
# `iteritems()`; the `{}` default keeps `genes` defined (as an empty list)
# when no patient was loaded.
genes = list(next(iter(patient.values()), {}))
print(len(genes))


20531

In [7]:
# Load the ordered list of patient IDs, one per line, from pat_id.txt.
patient_list = []
with open('../data/AML/pat_id.txt','r') as id_file:
    id_file.readline()  # the first line is a header, not an ID
    # readline() returns '' at end-of-file, which ends the iteration.
    for raw_line in iter(id_file.readline, ''):
        entry = raw_line.strip()
        if not entry:
            # The first blank (or whitespace-only) line ends the list.
            break
        patient_list.append(entry)

In [8]:
# Output path: the tab-separated table (one row per gene, one column per
# patient) is written to this file.
file2 = '../data/AML/AML_gene_expression_table2.csv'

In [9]:
# Write the expression values as a tab-separated table:
#   header row:  ID <tab> patient_1 <tab> patient_2 ...
#   one row per entry of `genes`:  gene <tab> value for each patient, in
#   `patient_list` order.

# Check the requested IDs before opening the output file: opening with 'w'
# truncates it, so a KeyError half-way through would leave a partial table.
missing_ids = [patient_id for patient_id in patient_list
               if patient_id not in patient]
if missing_ids:
    raise KeyError('no expression data for patient(s): %s'
                   % ', '.join(missing_ids))

with open(file2,'w') as out:
    out.write('\t'.join(['ID'] + patient_list) + '\n')
    for gene in genes:
        values = [str(patient[patient_id][gene]) for patient_id in patient_list]
        out.write('\t'.join([gene] + values) + '\n')
    # No explicit close(): the `with` block closes the file on exit.

In [ ]: