The purpose of this notebook is to expand the sequence dataset.
In the past, I had ignored sequences that had multiple ambiguous possibilities, which were associated with a single number. Here, I will assume that the sequences that are ambiguous are contributing equally to the predicted resistance number.
In [1]:
import pandas as pd
import itertools as it
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import ProteinAlphabet
from collections import Counter
from copy import deepcopy
from tqdm import tqdm
%matplotlib inline
In [2]:
# Read in the genotype-phenotype data
data = pd.read_csv('data/hiv-protease-data.csv', index_col='SeqID')
seq_cols = ['P{0}'.format(i) for i in range(1,100)]
drug_cols = data.columns[0:8]
data.replace('.', '-', inplace=True)
data[seq_cols].head()
Out[2]:
In [3]:
# Read in the consensus sequence
consensus = SeqIO.read('data/hiv-protease-consensus.fasta', 'fasta')
str(consensus.seq)
Out[3]:
In [4]:
# Replace dashes with consensus letters.
for i, col in enumerate(seq_cols):
data[col] = data[col].replace('-', str(consensus.seq)[i])
data[seq_cols].head()
Out[4]:
In [5]:
def number_of_combinations(row):
nc = 1 # nc = "number of combinations"
for i in row:
nc = nc * len(i)
return nc
In [6]:
data['num_combinations'] = data[seq_cols].apply(lambda x: number_of_combinations(x), axis=1)
data['has_multiple_mutations'] = data['num_combinations'] > 1
counts = Counter(data['num_combinations'])
data['num_combinations'].head()
Out[6]:
In [12]:
counts
Out[12]:
In [22]:
# We will only consider sequences for which less than 49 possible combinations may occur.
# What percentage of the data will we cover?
total = 0
for count, num in counts.items():
if count < 49:
total += num
print('Coverage is {0:.2f}%.'.format(total/sum(counts.values())*100))
In [23]:
# We will store this in a new variable called "filtered"
filtered = data[data['num_combinations'] < 49]
len(filtered)
Out[23]:
In [26]:
# Expansion of columns will be done using the custom function below.
# Pass in the entire dataframe.
def iter_row(row):
"""
Iterates over every element in a row, and yields a list of that element.
"""
for i in row:
yield(list(i))
def expand_mutations(row):
"""
Expands each row to the total number of possible combinations of sequences.
Returns every combination of mutation.
"""
return list(it.product(*iter_row(row[seq_cols])))
# Collate list of dictionaries to be used as the input to a new dataframe that contains all of the expanded mutations.
expanded_data = []
for seqid, row in tqdm(filtered.iterrows()):
muts_to_consider = expand_mutations(row)
for i, sequence in enumerate(muts_to_consider):
new_data = dict()
new_data['SeqID'] = seqid
for drug in drug_cols:
new_data[drug] = row[drug]
# print(i)
new_seq = ''
for s in sequence:
new_seq += s
new_data['sequence'] = new_seq
new_data['seqid'] = str(seqid) + '-' + str(i)
new_data['weight'] = 1 / len(muts_to_consider)
new_data['sequence_object'] = SeqRecord(Seq(new_seq, alphabet=ProteinAlphabet()), id='{0}-{1}'.format(seqid, i))
expanded_data.append(new_data)
expanded_data = pd.DataFrame(expanded_data)
expanded_data.to_csv('data/hiv-protease-data-expanded.csv')
In [28]:
SeqIO.write(expanded_data['sequence_object'].values, 'data/hiv-protease-sequences-expanded.fasta', 'fasta')
Out[28]:
In [ ]: