In [3]:
%matplotlib inline
from Bio import SeqIO
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
from subprocess import call
In [3]:
def run_igblast(filename):
'''Takes a file from antidengue/data/fasta/ and runs it through igblast
Parameters:
filename - the name of the fasta
'''
if filename.endswith('.fasta'):
outfile = filename[:-6] + '_out'
else:
outfile = filename + '_out'
fasta_path = '/Users/davidglass/antidengue/data/fasta/'
ncbi_path = '/Users/davidglass/Documents/resources/ncbi-igblast-1.4.0/'
blast = '/Users/davidglass/Documents/resources/ncbi-igblast-1.4.0/bin/igblastn'
igblast_dump = '/Users/davidglass/antidengue/data/ig_parse_dump/' + outfile[:-4]
call(['cp', fasta_path + filename, ncbi_path])
os.chdir(ncbi_path)
call([blast, '-out', outfile, '-query', ncbi_path+filename, '-num_alignments_V=1', '-num_alignments_D=1',
'-num_alignments_J=1', '-evalue=1e-20', '-germline_db_V', ncbi_path+'database/IGHV_imgt.fasta',
'-germline_db_D', ncbi_path+'database/IGHD_imgt.fasta', '-germline_db_J',
ncbi_path+'database/IGHJ_imgt.fasta', '-domain_system', 'imgt', '-auxiliary_data',
ncbi_path+'optional_file/human_gl.aux'])
call(['mkdir', igblast_dump])
call(['mv', filename, igblast_dump])
call(['mv', outfile, igblast_dump])
return outfile
In [4]:
def get_sample_info_from_csv(csv):
'''Puts the info from the sample_info csv into a pandas dataframe and returns it
Parameters:
csv - the csv file
'''
sample_info = pd.DataFrame().from_csv(csv)
return sample_info
In [5]:
def run_parse(fasta, blast_out, sample_info):
'''Runs parse_igblast.py in the igblast_dump folder and sends the parsedfile to the by_patient directory
Parameters:
fasta - the fasta file
blast_out - the output of igblast
dataframe - the pandas dataframe containing patient info
'''
accession_num = blast_out[:-4]
igblast_dump = '/Users/davidglass/antidengue/data/ig_parse_dump/' + accession_num
destination_directory = '/Users/davidglass/antidengue/data/by_sample'
patient_id = ""
time_point = ""
call(['cp', '/Users/davidglass/antidengue/src/parse_igblast.py', igblast_dump])
os.chdir(igblast_dump)
call(['python', 'parse_igblast.py', fasta, blast_out])
# find the ID from the sample_info DataFrame
patient_id = str(sample_info.loc[accession_num][0])
time_point = str(sample_info.loc[accession_num][4])
new_filename = patient_id + "_" + time_point + "_" + accession_num
call(['mv', 'parsed_igblast.txt', new_filename])
call(['mv', new_filename, destination_directory])
call(['rm', fasta])
call(['rm', 'parse_igblast.py'])
In [6]:
def pipeline(filename, sample_info):
'''Runs a fasta through igblast and then parses the output and properly sorts files
Parameters:
filename - the name of the fasta
sample_info - the pandas dataframe containing patient info
'''
blast_out = run_igblast(filename)
run_parse(filename, blast_out, sample_info)
In [7]:
sample_info = get_sample_info_from_csv('/Users/davidglass/antidengue/data/sample_info.csv')
In [12]:
# Runs the pipeline on all the files
dirs = os.listdir('/Users/davidglass/antidengue/data/fasta/')
for file in dirs:
if file.endswith('.fasta'):
pipeline(file, sample_info)
In [7]:
# Renames the files with proper labels
dirs = os.listdir('/Users/davidglass/antidengue/data/by_sample/')
os.chdir('/Users/davidglass/antidengue/data/by_sample/')
for file in dirs:
name = file
first = name.find('_')
sec = name.find('_', first + 1)
name = "patient_" + name[:first] + "_time" + name[first:sec] + "_sample" + name[sec:]
call(['mv', file, name])
In [11]:
# Replaces header with amended header.
header = "###sequence_id abundance V-gene D-gene J-gene V_E-value D_E-value " + \
"J_E-value FR3_seq CDR3_seq FR4_seq const_seq len_FR3 len_CDR3 len_FR4 " + \
"len_const sequence_boundary_indices:FR1_CDR1_FR2_CDR2_FR3_CDR3_FR4 len_boundary " + \
"stop_codon_present productive_sequence AA_seq_whole_read mutation_positions " + \
"germline_bases derived_bases mutation_density V_germline_identity leader_seq " + \
"reads_per_molecule primer_isotype sequence_reversed seq quality###\n"
dirs = os.listdir('/Users/davidglass/antidengue/data/by_sample/')
lines = []
for file in dirs:
if (file.startswith('patient')):
with open (file, 'r') as f:
lines = f.readlines()
lines[0] = header
with open (file, 'w') as f:
f.writelines(lines)
In [39]:
def write_seq_and_quality(filename):
'''Takes a filename from a parsed_ig file, finds the corresponding fastq, and appends the
input file with sequence and quality information for each sequence.
Parameters:
filename - the name of the parsed_igfile
'''
fastq_directory = '/Users/davidglass/antidengue/data/fastq/'
by_samples_directory = '/Users/davidglass/antidengue/data/by_sample/'
last_underscore = filename.rfind('_')
fastq_file = filename[last_underscore + 1:] + '.fastq'
fastq_lines = []
parsed_lines = []
with open (by_samples_directory + filename, 'r') as fi:
parsed_lines = fi.readlines()
with open (fastq_directory + fastq_file, 'r') as fi:
fastq_lines = fi.readlines()
for i in range(len(parsed_lines)):
if i != 0:
parsed_lines[i] = parsed_lines[i].strip('\n') + "\t" + \
fastq_lines[(i*4)-3].strip('\n') + "\t" + fastq_lines[(i*4)-1]
with open (by_samples_directory + filename, 'w') as fo:
for line in parsed_lines:
fo.write(line)
In [40]:
write_seq_and_quality('patient_148_time_ACUTE_sample_SRR2150126')
In [43]:
dirs = os.listdir('/Users/davidglass/antidengue/data/by_sample/')
for file in dirs:
if (file.startswith('patient')):
write_seq_and_quality(file)
In [ ]: