In [2]:
%matplotlib inline
from Bio import SeqIO
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
In [3]:
def read_parsed_igblast_file(filename):
    '''Reads a parsed IgBlastn file into a pandas DataFrame, one row per sequence ID.

    Every non-blank line is passed to parse_line; the ID it returns becomes the
    row label. If an ID occurs more than once, the values from its last
    occurrence are kept, in the position of its first occurrence.

    Parameters:
    filename - the name of the .txt file returned from the parse_igblast.py code.

    Returns:
    A DataFrame indexed by sequence ID with the columns V_gene, J_gene, CDR3_seq,
    CD3_AA and CDR3_len. An empty file gives an empty DataFrame with these columns.
    '''
    # NOTE(review): 'CD3_AA' looks like a typo for 'CDR3_AA'; it is kept unchanged
    # because other code may already select the column by this name.
    columns = ['V_gene', 'J_gene', 'CDR3_seq', 'CD3_AA', 'CDR3_len']

    # Collect the rows first and build the frame once: growing a DataFrame with
    # .loc[label] = row copies the whole frame on every insertion.
    rows = {}
    with open(filename, 'r') as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            label, row = parse_line(line)
            rows[label] = row
    return pd.DataFrame(list(rows.values()), index=list(rows.keys()), columns=columns)
In [4]:
def parse_line(line):
    '''Splits one line of the parsed IgBlastn file into a sequence ID and a row of values.

    Parameters:
    line - the line being read from the txt file containing parsed antibody information.

    Returns:
    (label, row), where label is the first whitespace-separated field and row is
    [V gene, J gene, CDR3 nucleotide sequence, CDR3 amino-acid sequence,
    CDR3 amino-acid length].
    '''
    fields = line.split()
    # Field positions used: 0 = ID, 2 = V gene, 4 = J gene, 9 = CDR3 nucleotides.
    # A 'C' is appended to the CDR3 nucleotides before translation.
    # NOTE(review): presumably this completes the final codon - confirm against
    # the parse_igblast.py output format.
    cdr3_nt = fields[9] + 'C'
    cdr3_aa = translate(cdr3_nt)
    return fields[0], [fields[2], fields[4], cdr3_nt, cdr3_aa, len(cdr3_aa)]
In [2]:
def translate(seq):
    '''Converts a nucleotide string into an amino-acid string, codon by codon.

    The sequence is read in consecutive 3-letter chunks starting at position 0.
    Stop codons are written as '_'. A chunk that is not in the codon table
    (unknown letters, lowercase letters, or a trailing chunk shorter than 3)
    is skipped and contributes nothing to the result.

    Parameters:
    seq - the nucleotide string to be translated.
    '''
    bases = 'TCAG'
    # Amino acids for all 64 codons in the order TTT, TTC, TTA, TTG, TCT, ... GGG.
    amino_acids = 'FFLLSSSSYY__CC_WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
    codons = [first + second + third
              for first in bases for second in bases for third in bases]
    codon_table = dict(zip(codons, amino_acids))

    chunks = (seq[start:start + 3] for start in range(0, len(seq), 3))
    return ''.join(codon_table[chunk] for chunk in chunks if chunk in codon_table)
In [10]:
def count_improper_CDR3s(dataframe, min_len=6, max_len=25):
    '''Counts the CDR3s in a dataframe from read_parsed_igblast_file that are improper
    and prints the result together with the total number of rows.

    A CDR3 is improper when its amino-acid string (the fourth column of the
    dataframe) contains a stop codon ('_'), is shorter than min_len AAs or is
    longer than max_len AAs.

    Parameters:
    dataframe - the DataFrame whose fourth column holds the CDR3 amino-acid strings.
    min_len - the shortest amino-acid length still counted as proper (default 6).
    max_len - the longest amino-acid length still counted as proper (default 25).
    '''
    # Read the column once by position; indexing each row Series with a bare
    # integer is deprecated in pandas when the row labels are column names.
    cdr3_aas = dataframe.iloc[:, 3]
    count = sum(
        1 for aa in cdr3_aas
        if '_' in aa or len(aa) < min_len or len(aa) > max_len
    )
    print ("Improper CDR3s: ", count)
    # Report the size of the frame that was passed in, not a notebook global.
    print ("Total CDR3s: ", len(dataframe.index))
In [11]:
def check_clonality(patient):
    '''Counts how often each CDR3 occurs and prints the ones found in at least two sequences.

    The CDR3 is taken from the fourth column of the dataframe (the amino-acid
    string in a frame built by read_parsed_igblast_file).

    Parameters:
    patient - the DataFrame whose fourth column holds the CDR3 strings.

    Returns:
    A dict mapping every CDR3 to the number of rows it appears in, in order of
    first appearance.
    '''
    CDR3s = {}
    # Read the column once by position; patient.iloc[i][3] builds a Series per row
    # and relies on integer-position Series indexing, which pandas has deprecated.
    for key in patient.iloc[:, 3]:
        CDR3s[key] = CDR3s.get(key, 0) + 1
    for key, occurrences in CDR3s.items():
        if occurrences > 1:
            print (key, " ", occurrences)
    return CDR3s
In [ ]: