In [1]:
import pandas as pd
import os, os.path
from itertools import islice
from StringIO import StringIO
os.chdir('/home/will/Documents/MATLAB/')

In [2]:
def treat_sample_lines(lines):
    
    out_data = []
    titles = None
    for line in lines:
        line = line.replace('"', '').strip()
        if not line.startswith('!Sample'):
            break
        elif line.startswith('!Sample_title'):
            titles = line.split('\t')[1:]
        elif line.startswith('!Sample_characteristics_ch1'):
            kv_pairs = line.split('\t')
            for num, (pair, sample) in enumerate(zip(kv_pairs[1:], titles)):
                try:
                    key, val = pair.split(':',1)
                except ValueError:
                    continue
                out_data.append((sample, key, val))
        elif line.startswith('!Sample_'):
            parts = line.split('\t')
            key = parts[0].replace('!Sample_', '')
            for val, sample in zip(parts[1:], titles):
                out_data.append((sample, key, val))
    return out_data

In [3]:
def load_data(fname):
    
    sample_char_lines = []
    
    with open(fname) as handle:
        for line in handle:
            if len(line.strip()) == 0:
                pass
            elif line.startswith('!Series'):
                pass
            elif line.startswith('!Sample'):
                sample_char_lines.append(line.strip())
            elif line.startswith('!series_matrix_table_begin'):
                micro_data = pd.read_csv(StringIO(''.join(handle)), index_col=0, delimiter='\t')
                break
                
    
    out_data = treat_sample_lines(sample_char_lines)
    char_data = pd.DataFrame(out_data, columns = ['Sample', 'Measurement', 'Value'])
    nchar_data = pd.pivot_table(char_data, rows = 'Sample', cols = 'Measurement', values = 'Value', aggfunc = 'first')
    
    return micro_data, nchar_data
    
micro_a, char_a = load_data('GSE1133-GPL96_series_matrix.txt')

In [4]:
groups = [('brain', set(['globus_pallidus', 'ParietalLobe', 'brainThalamus', 'Medulla_Oblongata', 
                         'CerebellumPeduncles', 'OlfactoryBulb', 'OccipitalLobe','Pons', 
                         'brain, caudate nucleus', 'Pituitary', 'wholebrain', 'PrefrontalCortex',
                         'Hypothalamus', 'TemporalLobe', 'Brain Amygdala', 'AdrenalCortex', 'pituitary', 
                         'CingulateCortex', 'cerebellum', 'Whole Brain'])),
          ('muscle', set(['SmoothMuscle', 'Skeletal_Muscle_Psoas', 'HEART', 'CardiacMyocytes',
                          'atrioventricular_node', 'Heart'])),
          ('blood', set(['peripheral blood-CD8TCells', 'WHOLEBLOOD', 'peripheral blood-CD19BCells',
                         'peripheral blood-CD14Monocytes', 'peripheral blood-BDCA4DentriticCells',
                         'peripheral blood-CD56NKCells', 'peripheral blood-BDCA4DentriticCells', 
                         'peripheral blood-CD4TCells', '721_BLymphoblasts', ])),
          ('bone marrow', set(['bone marrow', 'bonemarrow', 'bone marrow-CD34', 
                               'bone marrow-CD105Endothelial', 'bone marrow-CD33Myeloid',
                               'bone marrow-CD71EarlyErythroid']))
          ]

char_a['Group'] = ''
for name, s in groups:
    mask = char_a['description'].map(lambda x: x in s)
    char_a['Group'][mask] = name
    
char_a.set_index('geo_accession', inplace=True)

In [5]:
import csv
with open('GSE1133_family.soft') as handle:
    for line in handle:
        if line.startswith('!platform_table_begin'):
            break
    go_data = []
    for row in csv.DictReader(handle, delimiter='\t'):
        gid = row['ID']
        if row['Gene Ontology Biological Process'] is not None:
            for parts in row['Gene Ontology Biological Process'].split(' /// '):
                try:
                    _, name, _ = parts.split(' // ')
                    go_data.append((gid, name))
                except ValueError:
                    pass
            
len(go_data)


Out[5]:
138257

In [14]:
gene_var = micro_a.var(axis = 1)
wanted_gene_rows = gene_var.rank(ascending=False) < 2000
wanted_data = micro_a[wanted_gene_rows]
wanted_genes = set(wanted_gene_rows[wanted_gene_rows].index)

In [23]:
wanted_char = char_a[['description']].ix[[x.strip() for x in wanted_data.columns]]
wanted_char.to_csv('micro_group_names.csv')
wanted_data.to_csv('micro_data.csv', index=False, header=False)
with open('gene_ids.csv', 'w') as handle:
    handle.write('\n'.join(wanted_data.index))
with open('micro_headers.csv', 'w') as handle:
    handle.write('\n'.join(wanted_data.columns))
    
with open('go_data.csv', 'w') as handle:
    writer = csv.writer(handle, delimiter='\t')
    wanted_genes = set(wanted_data.index)
    output_already = set()
    for gid, name in go_data:
        if (gid in wanted_genes) and ((gid, name) not in output_already):
            writer.writerow((gid, name))
            output_already.add((gid, name))

In [ ]: