In [3]:
%matplotlib inline
from Bio import SeqIO    
import numpy as np
import pandas as pd    
import matplotlib.pyplot as plt
import os
import sys
from subprocess import call

In [3]:
def run_igblast(filename):
    '''Takes a file from antidengue/data/fasta/ and runs it through igblast
    
    Parameters:
        filename - the name of the fasta
    '''
    if filename.endswith('.fasta'):
        outfile = filename[:-6] + '_out'
    else:
        outfile = filename + '_out'
    
    fasta_path = '/Users/davidglass/antidengue/data/fasta/'
    ncbi_path = '/Users/davidglass/Documents/resources/ncbi-igblast-1.4.0/'
    blast = '/Users/davidglass/Documents/resources/ncbi-igblast-1.4.0/bin/igblastn'
    igblast_dump = '/Users/davidglass/antidengue/data/ig_parse_dump/' + outfile[:-4]

    call(['cp', fasta_path + filename, ncbi_path])
    os.chdir(ncbi_path)
    call([blast, '-out', outfile, '-query', ncbi_path+filename, '-num_alignments_V=1', '-num_alignments_D=1',
          '-num_alignments_J=1', '-evalue=1e-20', '-germline_db_V', ncbi_path+'database/IGHV_imgt.fasta', 
          '-germline_db_D', ncbi_path+'database/IGHD_imgt.fasta', '-germline_db_J',
          ncbi_path+'database/IGHJ_imgt.fasta', '-domain_system', 'imgt', '-auxiliary_data',
          ncbi_path+'optional_file/human_gl.aux'])
    call(['mkdir', igblast_dump])
    call(['mv', filename, igblast_dump])
    call(['mv', outfile, igblast_dump])
    
    return outfile

In [4]:
def get_sample_info_from_csv(csv):
    '''Puts the info from the sample_info csv into a pandas dataframe and returns it
     
    Parameters:
        csv - the csv file
    '''
    
    sample_info = pd.DataFrame().from_csv(csv)
    
    return sample_info

In [5]:
def run_parse(fasta, blast_out, sample_info):
    '''Runs parse_igblast.py in the igblast_dump folder and sends the parsedfile to the by_patient directory
    
    Parameters:
        fasta - the fasta file
        blast_out - the output of igblast
        dataframe - the pandas dataframe containing patient info
    '''
    accession_num = blast_out[:-4]
    igblast_dump = '/Users/davidglass/antidengue/data/ig_parse_dump/' + accession_num
    destination_directory = '/Users/davidglass/antidengue/data/by_sample'
    patient_id = ""
    time_point = ""
    
    call(['cp', '/Users/davidglass/antidengue/src/parse_igblast.py', igblast_dump])
    os.chdir(igblast_dump)
    call(['python', 'parse_igblast.py', fasta, blast_out])
    
    # find the ID from the sample_info DataFrame
    patient_id = str(sample_info.loc[accession_num][0])
    time_point = str(sample_info.loc[accession_num][4])
    new_filename = patient_id + "_" + time_point + "_" + accession_num
    
    call(['mv', 'parsed_igblast.txt', new_filename])
    call(['mv', new_filename, destination_directory])
    call(['rm', fasta])
    call(['rm', 'parse_igblast.py'])

In [6]:
def pipeline(filename, sample_info):
    '''Runs a fasta through igblast and then parses the output and properly sorts files
    
    Parameters:
        filename - the name of the fasta
        sample_info - the pandas dataframe containing patient info
    '''

    blast_out = run_igblast(filename)
    run_parse(filename, blast_out, sample_info)

In [7]:
sample_info = get_sample_info_from_csv('/Users/davidglass/antidengue/data/sample_info.csv')

In [12]:
# Runs the pipeline on all the files

dirs = os.listdir('/Users/davidglass/antidengue/data/fasta/')
    
for file in dirs:
    if file.endswith('.fasta'):
        pipeline(file, sample_info)

In [7]:
# Renames the files with proper labels

dirs = os.listdir('/Users/davidglass/antidengue/data/by_sample/')
os.chdir('/Users/davidglass/antidengue/data/by_sample/')
    
for file in dirs:
    name = file
    first = name.find('_')
    sec = name.find('_', first + 1)
    name = "patient_" + name[:first] + "_time" + name[first:sec] + "_sample" + name[sec:]
    call(['mv', file, name])

In [11]:
# Replaces header with amended header.

header = "###sequence_id	abundance	V-gene	D-gene	J-gene	V_E-value	D_E-value	" + \
        "J_E-value	FR3_seq	CDR3_seq	FR4_seq	const_seq	len_FR3	len_CDR3	len_FR4	" + \
        "len_const	sequence_boundary_indices:FR1_CDR1_FR2_CDR2_FR3_CDR3_FR4	len_boundary	" + \
        "stop_codon_present	productive_sequence	AA_seq_whole_read	mutation_positions	" + \
        "germline_bases	derived_bases	mutation_density	V_germline_identity	leader_seq	" + \
        "reads_per_molecule	primer_isotype	sequence_reversed	seq	quality###\n"

dirs = os.listdir('/Users/davidglass/antidengue/data/by_sample/')
lines = []
for file in dirs:
    if (file.startswith('patient')):
        with open (file, 'r') as f:
            lines = f.readlines()
        lines[0] = header
        with open (file, 'w') as f:
            f.writelines(lines)

In [39]:
def write_seq_and_quality(filename):
    '''Takes a filename from a parsed_ig file, finds the corresponding fastq, and appends the
    input file with sequence and quality information for each sequence.
    
    Parameters:
        filename - the name of the parsed_igfile
    '''
    fastq_directory = '/Users/davidglass/antidengue/data/fastq/'
    by_samples_directory = '/Users/davidglass/antidengue/data/by_sample/'
    
    last_underscore = filename.rfind('_')
    fastq_file = filename[last_underscore + 1:] + '.fastq'
    fastq_lines = []
    parsed_lines = []
    
    with open (by_samples_directory + filename, 'r') as fi:
        parsed_lines = fi.readlines()
    with open (fastq_directory + fastq_file, 'r') as fi:
        fastq_lines = fi.readlines()
    for i in range(len(parsed_lines)):
        if i != 0:
            parsed_lines[i] = parsed_lines[i].strip('\n') + "\t" + \
                                fastq_lines[(i*4)-3].strip('\n') + "\t" + fastq_lines[(i*4)-1]
    with open (by_samples_directory + filename, 'w') as fo:
        for line in parsed_lines:
            fo.write(line)

In [40]:
write_seq_and_quality('patient_148_time_ACUTE_sample_SRR2150126')

In [43]:
dirs = os.listdir('/Users/davidglass/antidengue/data/by_sample/')
for file in dirs:
    if (file.startswith('patient')):
        write_seq_and_quality(file)

In [ ]: