In [62]:
import sys
import pandas as pd
In [63]:
# Patient identifiers to process. Patient "H" is excluded; append "H" here to
# re-enable it in all three lists at once.
PATIENT_IDS = ["A", "B", "C", "D", "E", "F", "G"]

# Three parallel lists: position k in each list refers to the same patient.
filenames_patients = ["raw/patient_%s.csv" % pid for pid in PATIENT_IDS]
filenames_samples = ["raw/samples_%s.txt" % pid for pid in PATIENT_IDS]
filenames_out = ["reads_%s.tsv" % pid for pid in PATIENT_IDS]
In [64]:
def parse_samples(filename):
    """Read sample labels from a text file, one label per line.

    The label of a line is the text before its first space character; any
    further space-separated fields on the line are ignored. Blank and
    whitespace-only lines are skipped so that a trailing empty line does not
    produce an empty label (which would inflate the sample count).

    Args:
        filename: path of the samples file to read.

    Returns:
        list of str: the labels, in the order they appear in the file.
    """
    samples = []
    with open(filename) as f:
        for line in f:
            # strip() removes the newline, a stray "\r" from CRLF files, and
            # any leading/trailing blanks around the entry.
            entry = line.strip()
            if not entry:
                continue
            samples.append(entry.split(" ", 1)[0])
    return samples
In [65]:
def process(patient_filename, samples_filename, output_filename):
    """Convert one patient's SNV table into a tab-separated read-count file.

    The sample labels are read with ``parse_samples``. The first label is
    treated as the primary sample (columns ``*_primary``); the label at
    position ``p >= 1`` is matched to the columns ``*_met<p>``.

    Rows of the patient CSV are kept only if:
      * ``cna_primary`` and every ``cna_met<p>`` column equal ``'(1,1)'``, and
      * ``class`` is not ``"N.A."``.

    The output file starts with three count lines (anatomical sites, samples,
    mutations) and a header line, followed by one line per
    (mutation, sample) pair containing the reference read count
    (``tot - alt``) and the variant read count (``alt``). Each sample is
    written as its own anatomical site, so the site index/label repeat the
    sample index/label.

    Args:
        patient_filename: path of the patient CSV. Columns read: ``gene``,
            ``chromosome``, ``start``, ``class``, ``cna_primary``,
            ``tot_primary``, ``alt_primary`` and, per additional sample,
            ``cna_met<p>``, ``tot_met<p>``, ``alt_met<p>``.
        samples_filename: path of the samples file passed to ``parse_samples``.
        output_filename: path of the TSV file to write (overwritten).

    Returns:
        pandas.DataFrame: the filtered SNV table that was written out.
    """
    samples = parse_samples(samples_filename)
    # Formatted string + call syntax prints the same text on Python 2 and 3
    # (a bare ``print a, b`` statement is a SyntaxError on Python 3).
    print("%s %s" % (patient_filename, samples))

    df = pd.read_csv(patient_filename)

    # filter out SNVs affected by CNAs
    df = df[df['cna_primary'] == '(1,1)']
    for met in range(1, len(samples)):
        df = df[df['cna_met%d' % met] == '(1,1)']

    # filter out SNVs that were filtered out by Sanborn
    df = df[df['class'] != "N.A."]

    header = ["#sample_index", "sample_label", "anatomical_site_index",
              "anatomical_site_label", "character_index", "character_label",
              "ref", "var"]

    with open(output_filename, "w") as f:
        f.write("%d #anatomical sites\n" % len(samples))
        f.write("%d #samples\n" % len(samples))
        f.write("%d #mutations\n" % len(df))
        f.write("\t".join(header) + "\n")

        # character_index counts mutations (rows that survived filtering),
        # so it advances once per row, not once per written line.
        for char_index, (_, row) in enumerate(df.iterrows()):
            label = row['gene'] + ":" + str(row['chromosome']) + ":" + str(row['start'])
            for p, sample in enumerate(samples):
                if p == 0:
                    total = row['tot_primary']
                    alt = row['alt_primary']
                else:
                    total = row['tot_met%d' % p]
                    alt = row['alt_met%d' % p]
                machina_row = [
                    str(p), sample,          # sample index / label
                    str(p), sample,          # anatomical site index / label
                    str(char_index), label,  # mutation index / label
                    str(total - alt),        # reference reads
                    str(alt),                # variant reads
                ]
                f.write("\t".join(machina_row) + "\n")
    return df
In [66]:
# Convert every patient. The three filename lists are parallel, so the same
# position selects the patient CSV, its samples file, and its output path.
for idx, patient_csv in enumerate(filenames_patients):
    process(patient_csv, filenames_samples[idx], filenames_out[idx])
In [ ]: