In [1]:
import pandas as pd
import os, os.path
from itertools import islice
from StringIO import StringIO
os.chdir('/home/will/Documents/MATLAB/')
In [2]:
def treat_sample_lines(lines):
out_data = []
titles = None
for line in lines:
line = line.replace('"', '').strip()
if not line.startswith('!Sample'):
break
elif line.startswith('!Sample_title'):
titles = line.split('\t')[1:]
elif line.startswith('!Sample_characteristics_ch1'):
kv_pairs = line.split('\t')
for num, (pair, sample) in enumerate(zip(kv_pairs[1:], titles)):
try:
key, val = pair.split(':',1)
except ValueError:
continue
out_data.append((sample, key, val))
elif line.startswith('!Sample_'):
parts = line.split('\t')
key = parts[0].replace('!Sample_', '')
for val, sample in zip(parts[1:], titles):
out_data.append((sample, key, val))
return out_data
In [3]:
def load_data(fname):
sample_char_lines = []
with open(fname) as handle:
for line in handle:
if len(line.strip()) == 0:
pass
elif line.startswith('!Series'):
pass
elif line.startswith('!Sample'):
sample_char_lines.append(line.strip())
elif line.startswith('!series_matrix_table_begin'):
micro_data = pd.read_csv(StringIO(''.join(handle)), index_col=0, delimiter='\t')
break
out_data = treat_sample_lines(sample_char_lines)
char_data = pd.DataFrame(out_data, columns = ['Sample', 'Measurement', 'Value'])
nchar_data = pd.pivot_table(char_data, rows = 'Sample', cols = 'Measurement', values = 'Value', aggfunc = 'first')
return micro_data, nchar_data
micro_a, char_a = load_data('GSE1133-GPL96_series_matrix.txt')
In [4]:
groups = [('brain', set(['globus_pallidus', 'ParietalLobe', 'brainThalamus', 'Medulla_Oblongata',
'CerebellumPeduncles', 'OlfactoryBulb', 'OccipitalLobe','Pons',
'brain, caudate nucleus', 'Pituitary', 'wholebrain', 'PrefrontalCortex',
'Hypothalamus', 'TemporalLobe', 'Brain Amygdala', 'AdrenalCortex', 'pituitary',
'CingulateCortex', 'cerebellum', 'Whole Brain'])),
('muscle', set(['SmoothMuscle', 'Skeletal_Muscle_Psoas', 'HEART', 'CardiacMyocytes',
'atrioventricular_node', 'Heart'])),
('blood', set(['peripheral blood-CD8TCells', 'WHOLEBLOOD', 'peripheral blood-CD19BCells',
'peripheral blood-CD14Monocytes', 'peripheral blood-BDCA4DentriticCells',
'peripheral blood-CD56NKCells', 'peripheral blood-BDCA4DentriticCells',
'peripheral blood-CD4TCells', '721_BLymphoblasts', ])),
('bone marrow', set(['bone marrow', 'bonemarrow', 'bone marrow-CD34',
'bone marrow-CD105Endothelial', 'bone marrow-CD33Myeloid',
'bone marrow-CD71EarlyErythroid']))
]
char_a['Group'] = ''
for name, s in groups:
mask = char_a['description'].map(lambda x: x in s)
char_a['Group'][mask] = name
char_a.set_index('geo_accession', inplace=True)
In [5]:
import csv
with open('GSE1133_family.soft') as handle:
for line in handle:
if line.startswith('!platform_table_begin'):
break
go_data = []
for row in csv.DictReader(handle, delimiter='\t'):
gid = row['ID']
if row['Gene Ontology Biological Process'] is not None:
for parts in row['Gene Ontology Biological Process'].split(' /// '):
try:
_, name, _ = parts.split(' // ')
go_data.append((gid, name))
except ValueError:
pass
len(go_data)
Out[5]:
In [14]:
gene_var = micro_a.var(axis = 1)
wanted_gene_rows = gene_var.rank(ascending=False) < 2000
wanted_data = micro_a[wanted_gene_rows]
wanted_genes = set(wanted_gene_rows[wanted_gene_rows].index)
In [23]:
wanted_char = char_a[['description']].ix[[x.strip() for x in wanted_data.columns]]
wanted_char.to_csv('micro_group_names.csv')
wanted_data.to_csv('micro_data.csv', index=False, header=False)
with open('gene_ids.csv', 'w') as handle:
handle.write('\n'.join(wanted_data.index))
with open('micro_headers.csv', 'w') as handle:
handle.write('\n'.join(wanted_data.columns))
with open('go_data.csv', 'w') as handle:
writer = csv.writer(handle, delimiter='\t')
wanted_genes = set(wanted_data.index)
output_already = set()
for gid, name in go_data:
if (gid in wanted_genes) and ((gid, name) not in output_already):
writer.writerow((gid, name))
output_already.add((gid, name))
In [ ]: