In [2]:
%matplotlib inline
from Bio import SeqIO    
import numpy as np
import pandas as pd    
import matplotlib.pyplot as plt
import os
import sys

In [3]:
def read_parsed_igblast_file(filename):
    '''Takes in a parsed IgBlastn file and returns a pandas DataFrame.
    
    Every non-blank line of the file is handed to parse_line; the returned label
    becomes the row index and the returned list becomes the row values. If the
    same label occurs more than once, the last occurrence wins.
    
    Parameters:
        filename - the name of the .txt file returned from the parse_igblast.py code.
    
    Returns:
        A DataFrame indexed by sequence label with the columns
        'V_gene', 'J_gene', 'CDR3_seq', 'CD3_AA' and 'CDR3_len'.
    '''
    
    # NOTE(review): 'CD3_AA' looks like a typo for 'CDR3_AA'; it is kept as-is
    # because other cells may already select the column by this name.
    columns = ['V_gene', 'J_gene', 'CDR3_seq', 'CD3_AA', 'CDR3_len']
    
    # Collect the rows first and build the frame once: enlarging a DataFrame
    # with .loc inside the loop re-allocates it for every line of the file.
    # A dict keeps first-seen order and overwrites duplicates, exactly like
    # repeated .loc[label] assignment does.
    rows = {}
    with open(filename, 'r') as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            label, row = parse_line(line)
            rows[label] = row
    
    if not rows:
        return pd.DataFrame(columns=columns)
    
    return pd.DataFrame.from_dict(rows, orient='index', columns=columns)

In [4]:
def parse_line(line):
    '''Splits one line of the parsed IgBlastn file into a row label and row values.
    
    The line is split on whitespace. Field 0 is the label, field 2 the V gene,
    field 4 the J gene and field 9 the CDR3 nucleotide sequence. A 'C' is
    appended to the CDR3 nucleotide sequence before it is translated.
    NOTE(review): presumably the 'C' restores a base missing from the upstream
    output - confirm against parse_igblast.py.
    
    Parameters:
        line - the line being read from the txt file containing parsed antibody information.
    
    Returns:
        (label, row) where row is the list
        [V gene, J gene, CDR3 nucleotide sequence, CDR3 AA sequence, CDR3 AA length].
    '''
    
    fields = line.split()
    
    seq_id = fields[0]
    v_gene = fields[2]
    j_gene = fields[4]
    cdr3_nt = fields[9] + 'C'
    cdr3_aa = translate(cdr3_nt)
    
    return seq_id, [v_gene, j_gene, cdr3_nt, cdr3_aa, len(cdr3_aa)]

In [2]:
# Nucleotide triplet -> one-letter amino acid; stop codons are encoded as '_'.
# Built once at definition time instead of on every translate() call.
_CODON_TABLE = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
    'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
}


def translate(seq):
    '''Translates a nucleotide string to an AA string.
    
    The sequence is read in frame 0, three bases at a time. Stop codons are
    written as '_'. Any triplet that is not in the codon table (ambiguous
    bases such as 'N', or an incomplete trailing codon) is skipped, so the
    result can be shorter than len(seq) // 3.
    
    Parameters:
        seq - the nucleotide string to be translated (upper or lower case).
    
    Returns:
        The amino-acid string.
    '''
    
    # The table is keyed on upper-case codons; without this, lower-case input
    # would match nothing and silently translate to ''.
    seq = seq.upper()
    
    codons = (seq[i:i+3] for i in range(0, len(seq), 3))
    return ''.join(_CODON_TABLE[codon] for codon in codons if codon in _CODON_TABLE)

In [10]:
def count_improper_CDR3s(dataframe, min_len=6, max_len=25):
    '''Takes in a dataframe from read_parsed_igblast_file and counts CDR3s that have a
    stop codon, are shorter than min_len AAs or are longer than max_len AAs.
    Prints the results.
    
    Parameters:
        dataframe - frame whose fourth column (position 3) holds the CDR3 AA strings.
        min_len - shortest CDR3 AA length still considered proper (default 6).
        max_len - longest CDR3 AA length still considered proper (default 25).
    
    Returns:
        None; the improper count and the total row count are printed.
    '''
    # Read the AA column once by position instead of indexing every row with
    # iloc[i][3], which relies on deprecated positional Series lookup.
    cdr3_aas = dataframe.iloc[:, 3]
    
    count = sum(
        1 for aa in cdr3_aas
        if '_' in aa or not min_len <= len(aa) <= max_len
    )

    print ("Improper CDR3s: ", count)
    # Report the size of the frame that was passed in, not of a notebook global.
    print ("Total CDR3s: ", len(dataframe.index))

In [11]:
def check_clonality(patient):
    '''Counts how often each CDR3 AA sequence occurs and prints the repeated ones.
    
    Parameters:
        patient - frame whose fourth column (position 3) holds the CDR3 AA strings,
                  as built by read_parsed_igblast_file.
    
    Returns:
        A dict mapping every CDR3 AA sequence to its number of occurrences
        (including those seen only once), in first-seen order. Sequences seen
        more than once are also printed together with their count.
    '''
    # Read the AA column once by position instead of indexing every row with
    # iloc[i][3], which relies on deprecated positional Series lookup.
    CDR3s = {}
    for key in patient.iloc[:, 3]:
        CDR3s[key] = CDR3s.get(key, 0) + 1
    
    for key, n_seen in CDR3s.items():
        if n_seen > 1:
            print (key, " ", n_seen)
    return CDR3s

In [ ]: