This script will convert raw genotype data to genotype tables.
In [ ]:
import sys, os, time
import argparse
from collections import Counter
import imp
import numpy as np
In [ ]:
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
def import_anywhere(module_name, paths):
"""import methods from any folder"""
try:
f, filename, desc = imp.find_module(module_name, paths)
return imp.load_module(module_name, f, filename, desc)
finally:
# Since we may exit via an exception, close fp explicitly.
if f:
f.close()
Parse arguments.
In [ ]:
parser = argparse.ArgumentParser()
parser.add_argument("case_file", help="raw case genotype data file")
parser.add_argument("control_file", help="raw control genotype data file")
parser.add_argument("outfile", help="output file")
args = parser.parse_args()
Process the raw data.
In [ ]:
utility_functions = import_anywhere('utility_functions', [SCRIPT_DIR])
from utility_functions import check_table_valid
In [ ]:
## read case data
case_data = {}
with open(args.case_file, 'r') as infile:
for line in infile:
fields = line.split()
snp_name = fields[1]
case_data[snp_name] = Counter(fields[4:])
## read control data
control_data = {}
with open(args.control_file, 'r') as infile:
for line in infile:
fields = line.split()
snp_name = fields[1]
control_data[snp_name] = Counter(fields[4:])
## combine case and control data and write as table
with open(args.outfile, 'w') as outfile:
headers = "name, case_0, case_1, case_2, ctrl_0, ctrl_1, ctrl_2".split(', ')
line_template = '\t'.join(['{}'] * len(headers)) + '\n'
outfile.write(line_template.format(*headers))
for snp_name in case_data.keys():
## check whether the input table is a 2x3 contingency table with positive margins first
input_table = np.array([[int(case_data[snp_name]['0']),
int(case_data[snp_name]['1']),
int(case_data[snp_name]['2'])],
[int(control_data[snp_name]['0'],),
int(control_data[snp_name]['1'],),
int(control_data[snp_name]['2'],)],])
if not check_table_valid(input_table):
continue
## the input table is valid. proceed.
outfile.write(line_template.format(*[snp_name,
case_data[snp_name]['0'],
case_data[snp_name]['1'],
case_data[snp_name]['2'],
control_data[snp_name]['0'],
control_data[snp_name]['1'],
control_data[snp_name]['2'],]))