In [1]:
import os
import numpy as np
import theano
import re
In [2]:
# Display the current working directory; the relative '../data/...' paths
# used in this notebook are resolved against it.
os.getcwd()
Out[2]:
In [3]:
# Path of the input CSV opened by the parsing cell.
# NOTE(review): "espression" looks like a typo, but the string has to match
# the on-disk filename -- confirm before renaming either.
# NOTE(review): the name `file` shadows the Python 2 built-in of the same name.
file='../data/AML/AML_gene_espression2.csv'
In [4]:
# Parse the long-format CSV into a nested mapping:
#   patient[<TCGA id>][<second column>] = float(<third column>)
# Presumably the second column is the gene symbol and the third its
# expression value -- TODO confirm against the CSV header.
patient = dict()
p = re.compile('TCGA-[^-]+-[0-9]+')
with open(file) as f:
    f.readline()  # discard the header row
    # Iterate over the file instead of probing readline() for '': a blank
    # line mid-file used to slip past that check and crash on m.group()
    # with m being None.
    for line_no, line in enumerate(f, 2):
        record = line.split(',')
        if record[0].strip() == '':
            # Blank line or row without an ID: skip it instead of stopping,
            # so the rows that follow are not silently dropped.
            continue
        m = p.match(record[0])
        if m is None:
            raise ValueError('%s, line %d: unrecognised patient id %r'
                             % (file, line_no, record[0]))
        # Only the matched prefix of the first column is used as the key.
        patient_id = m.group()
        if patient_id not in patient:
            patient[patient_id] = dict()
        # Rows whose second column is the literal 'sample' carry no value.
        if record[1] != 'sample':
            patient[patient_id][record[1]] = float(record[2])
Number of patients
In [5]:
# Number of distinct patient IDs collected by the parsing cell.
len(patient)
Out[5]:
Number of genes
In [6]:
# Take the gene names from the first patient in the mapping and report how
# many there are. The loop exits after one iteration; `genes` is reused by
# the cell that writes the output table.
# .items() replaces .iteritems(), which does not exist on Python 3, and works
# on Python 2 as well.
for k, v in patient.items():
    genes = list(v.keys())  # snapshot as a list, not a live view of v
    print(len(genes))
    break
In [7]:
# Read the patient IDs, one per line after a header line, preserving file
# order. That order becomes the column order of the output table.
patient_list = []
with open('../data/AML/pat_id.txt','r') as f:
    f.readline() # skip the header
    # Iterate to the real end of file; the old readline()/'' sentinel treated
    # the first blank line as EOF and silently truncated the list.
    for line in f:
        record = line.strip()
        if record == '':
            continue
        patient_list.append(record)
In [8]:
# Destination path of the table produced by the writing cell.
# NOTE(review): that cell separates fields with tabs even though this name
# ends in .csv -- confirm what downstream readers expect.
file2 = '../data/AML/AML_gene_expression_table2.csv'
In [9]:
# Write the expression matrix as tab-separated text: a header row of 'ID'
# followed by the patient IDs, then one row per gene with that gene's value
# for each patient, in patient_list order.

# Check before opening the output: a patient without parsed data used to
# raise a bare KeyError only after file2 had been truncated and partly written.
missing = [patient_id for patient_id in patient_list if patient_id not in patient]
if missing:
    raise KeyError('no expression data for patient(s): %s' % ', '.join(missing))

with open(file2,'w') as out:
    out.write('\t'.join(['ID'] + patient_list) + '\n')
    for gene in genes:
        # Build the whole row first so a missing gene cannot leave a
        # half-written line behind.
        row = [gene] + [str(patient[patient_id][gene]) for patient_id in patient_list]
        out.write('\t'.join(row) + '\n')
# No explicit out.close(): the with-statement closes the file on exit.
In [ ]: