In [62]:
import sys
import pandas as pd

In [63]:
# Patient identifiers to process.  Patient "H" is currently left out of all
# three lists.
_PATIENT_IDS = ["A", "B", "C", "D", "E", "F", "G"]

# Parallel lists: entry k of each list belongs to the same patient.
filenames_patients = ["raw/patient_%s.csv" % _pid for _pid in _PATIENT_IDS]
filenames_samples = ["raw/samples_%s.txt" % _pid for _pid in _PATIENT_IDS]
filenames_out = ["reads_%s.tsv" % _pid for _pid in _PATIENT_IDS]

In [64]:
def parse_samples(filename):
    """Read sample labels from a samples file.

    Each non-blank line contributes one label: the text preceding the first
    space on that line.  Blank (or whitespace-only) lines are skipped so that
    e.g. a trailing empty line does not produce an empty label and inflate
    the sample count.

    Parameters:
        filename: path of the samples text file.

    Returns:
        list of str with the labels in file order.
    """
    samples = []
    with open(filename) as f:
        for line in f:
            # Strip "\r" too, so Windows line endings never leak into a label.
            line = line.rstrip("\r\n")
            if not line.strip():
                continue
            samples.append(line.split(" ")[0])

    return samples

In [65]:
def process(patient_filename, samples_filename, output_filename):
    """Filter one patient's SNV table and write it as a tab-separated reads file.

    The sample labels are read with ``parse_samples``; the first label is
    treated as the primary (columns ``*_primary``) and the label at position
    ``k >= 1`` is matched to the columns ``cna_met<k>``, ``tot_met<k>`` and
    ``alt_met<k>``.

    Rows are kept only when ``cna_primary`` and every ``cna_met<k>`` equal the
    string ``'(1,1)'`` and ``class`` is not ``"N.A."``.

    The output file starts with three count lines and a column header,
    followed by one line per (mutation, sample) pair with
    ``ref = total - alt`` and ``var = alt``.

    Parameters:
        patient_filename: path of the patient CSV read with ``pd.read_csv``.
        samples_filename: path of the samples file passed to ``parse_samples``.
        output_filename: path of the TSV file to (over)write.

    Returns:
        The filtered ``pandas.DataFrame``.
    """
    samples = parse_samples(samples_filename)
    # A single pre-formatted argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print("%s %s" % (patient_filename, samples))
    df = pd.read_csv(patient_filename)

    # filter out SNVs affected by CNAs
    df = df[df['cna_primary'] == '(1,1)']
    for met in range(1, len(samples)):
        df = df[df['cna_met%d' % met] == '(1,1)']

    # filter out SNVs that were filtered out by Sanborn
    df = df[df['class'] != "N.A."]

    header = ["#sample_index", "sample_label", "anatomical_site_index",
              "anatomical_site_label", "character_index", "character_label",
              "ref", "var"]

    with open(output_filename, "w") as f:
        # Every sample is written as its own anatomical site, hence the
        # same count on the first two lines.
        f.write("%d #anatomical sites\n" % len(samples))
        f.write("%d #samples\n" % len(samples))
        f.write("%d #mutations\n" % len(df))
        f.write("\t".join(header) + "\n")

        # The character index counts the surviving rows from 0; it is not the
        # DataFrame index, which keeps gaps after filtering.
        for mutation_index, (_, row) in enumerate(df.iterrows()):
            label = row['gene'] + ":" + str(row['chromosome']) + ":" + str(row['start'])

            for p, sample in enumerate(samples):
                if p == 0:
                    total = row['tot_primary']
                    alt = row['alt_primary']
                else:
                    total = row['tot_met%d' % p]
                    alt = row['alt_met%d' % p]

                machina_row = [
                    str(p), sample,          # sample index / label
                    str(p), sample,          # anatomical site index / label
                    str(mutation_index), label,
                    str(total - alt),        # ref read count
                    str(alt),                # var read count
                ]
                f.write("\t".join(machina_row) + "\n")

    return df

In [66]:
# Convert every patient: entry k of the three filename lists belongs together.
for idx, patient_csv in enumerate(filenames_patients):
    process(patient_csv, filenames_samples[idx], filenames_out[idx])


raw/patient_A.csv ['primary', 'parotid_gland', 'locoregional_1', 'locoregional_2']
raw/patient_B.csv ['primary', 'lymph_node', 'locoregional_1', 'locoregional_2']
raw/patient_C.csv ['primary', 'locoregional_1', 'locoregional_2']
raw/patient_D.csv ['primary', 'lymph_node', 'locoregional_1', 'locoregional_2']
raw/patient_E.csv ['primary', 'locoregional_1', 'locoregional_2', 'lymph_node', 'locoregional_3']
raw/patient_F.csv ['primary', 'lymph_node', 'locoregional', 'distant']
raw/patient_G.csv ['primary', 'lung', 'locoregional']

In [ ]: