Purpose

The purpose of this notebook is to expand the sequence dataset.

In the past, I had ignored sequences that had multiple ambiguous possibilities, which were associated with a single number. Here, I will assume that the sequences that are ambiguous are contributing equally to the predicted resistance number.



In [1]:

    
import pandas as pd
import itertools as it

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import ProteinAlphabet
from collections import Counter
from copy import deepcopy
from tqdm import tqdm

%matplotlib inline

Read in data



In [2]:

    
# Read in the genotype-phenotype data
data = pd.read_csv('data/hiv-protease-data.csv', index_col='SeqID')
seq_cols = ['P{0}'.format(i) for i in range(1,100)]
drug_cols = data.columns[0:8]
data.replace('.', '-', inplace=True)
data[seq_cols].head()









    Out[2]:






  
    
      
      P1
      P2
      P3
      P4
      P5
      P6
      P7
      P8
      P9
      P10
      ...
      P90
      P91
      P92
      P93
      P94
      P95
      P96
      P97
      P98
      P99
    
    
      SeqID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      2996
      -
      -
      -
      -
      -
      -
      -
      -
      -
      I
      ...
      M
      -
      -
      L
      -
      -
      -
      -
      -
      -
    
    
      4387
      -
      -
      -
      -
      -
      -
      -
      -
      -
      -
      ...
      -
      -
      -
      -
      -
      -
      -
      -
      -
      -
    
    
      4426
      -
      -
      -
      -
      -
      -
      -
      -
      -
      I
      ...
      -
      -
      -
      L
      -
      -
      -
      -
      -
      -
    
    
      4432
      -
      -
      -
      -
      -
      -
      -
      -
      -
      -
      ...
      -
      -
      -
      -
      -
      -
      -
      -
      -
      -
    
    
      4482
      -
      -
      -
      -
      -
      -
      -
      -
      -
      V
      ...
      M
      -
      -
      L
      -
      -
      -
      -
      -
      -
    
  

5 rows × 99 columns



In [3]:

    
# Read in the consensus sequence
consensus = SeqIO.read('data/hiv-protease-consensus.fasta', 'fasta')
str(consensus.seq)









    Out[3]:





'PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF'



In [4]:

    
# Replace dashes with consensus letters.
for i, col in enumerate(seq_cols):
    data[col] = data[col].replace('-', str(consensus.seq)[i])
data[seq_cols].head()









    Out[4]:






  
    
      
      P1
      P2
      P3
      P4
      P5
      P6
      P7
      P8
      P9
      P10
      ...
      P90
      P91
      P92
      P93
      P94
      P95
      P96
      P97
      P98
      P99
    
    
      SeqID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      2996
      P
      Q
      I
      T
      L
      W
      Q
      R
      P
      I
      ...
      M
      T
      Q
      L
      G
      C
      T
      L
      N
      F
    
    
      4387
      P
      Q
      I
      T
      L
      W
      Q
      R
      P
      L
      ...
      L
      T
      Q
      I
      G
      C
      T
      L
      N
      F
    
    
      4426
      P
      Q
      I
      T
      L
      W
      Q
      R
      P
      I
      ...
      L
      T
      Q
      L
      G
      C
      T
      L
      N
      F
    
    
      4432
      P
      Q
      I
      T
      L
      W
      Q
      R
      P
      L
      ...
      L
      T
      Q
      I
      G
      C
      T
      L
      N
      F
    
    
      4482
      P
      Q
      I
      T
      L
      W
      Q
      R
      P
      V
      ...
      M
      T
      Q
      L
      G
      C
      T
      L
      N
      F
    
  

5 rows × 99 columns



In [5]:

    
def number_of_combinations(row):
    nc = 1  # nc = "number of combinations"
    for i in row:
        nc = nc * len(i)
        
    return nc



In [6]:

    
data['num_combinations'] = data[seq_cols].apply(lambda x: number_of_combinations(x), axis=1)
data['has_multiple_mutations'] = data['num_combinations'] > 1
counts = Counter(data['num_combinations'])
data['num_combinations'].head()









    Out[6]:





SeqID
2996    2
4387    4
4426    1
4432    1
4482    2
Name: num_combinations, dtype: int64



In [12]:

    
counts









    Out[12]:





Counter({1: 871,
         2: 367,
         3: 4,
         4: 210,
         6: 11,
         8: 122,
         12: 14,
         16: 73,
         24: 10,
         32: 49,
         48: 7,
         64: 21,
         96: 8,
         128: 17,
         192: 2,
         256: 5,
         384: 2,
         512: 5,
         768: 2,
         1024: 2,
         2048: 2,
         2304: 1,
         3072: 1,
         9216: 1,
         16384: 1})



In [22]:

    
# We will only consider sequences for which less than 49 possible combinations may occur.
# What percentage of the data will we cover?

total = 0
for count, num in counts.items():
    if count < 49:
        total += num
print('Coverage is {0:.2f}%.'.format(total/sum(counts.values())*100))









    



Coverage is 96.13%.



In [23]:

    
# We will store this in a new variable called "filtered"
filtered = data[data['num_combinations'] < 49]
len(filtered)









    Out[23]:





1738



In [26]:

    
# Expansion of columns will be done using the custom function below.
# Pass in the entire dataframe.
def iter_row(row):
    """
    Iterates over every element in a row, and yields a list of that element.
    """
    for i in row:
        yield(list(i))

def expand_mutations(row):
    """
    Expands each row to the total number of possible combinations of sequences.
    
    Returns every combination of mutation.
    """
    return list(it.product(*iter_row(row[seq_cols])))

# Collate list of dictionaries to be used as the input to a new dataframe that contains all of the expanded mutations.
expanded_data = []
for seqid, row in tqdm(filtered.iterrows()):
    
    muts_to_consider = expand_mutations(row)
    
    for i, sequence in enumerate(muts_to_consider):
        new_data = dict()
        new_data['SeqID'] = seqid
        for drug in drug_cols:
            new_data[drug] = row[drug]

        # print(i)
        new_seq = ''
        for s in sequence:
            new_seq += s
        new_data['sequence'] = new_seq
        new_data['seqid'] = str(seqid) + '-' + str(i)
        new_data['weight'] = 1 / len(muts_to_consider)
        new_data['sequence_object'] = SeqRecord(Seq(new_seq, alphabet=ProteinAlphabet()), id='{0}-{1}'.format(seqid, i))
        expanded_data.append(new_data)
expanded_data = pd.DataFrame(expanded_data)
expanded_data.to_csv('data/hiv-protease-data-expanded.csv')



In [28]:

    
SeqIO.write(expanded_data['sequence_object'].values, 'data/hiv-protease-sequences-expanded.fasta', 'fasta')









    Out[28]:





6979



In [ ]:

	P1	P2	P3	P4	P5	P6	P7	P8	P9	P10	...	P90	P91	P92	P93	P94	P95	P96	P97	P98	P99
SeqID
2996	-	-	-	-	-	-	-	-	-	I	...	M	-	-	L	-	-	-	-	-	-
4387	-	-	-	-	-	-	-	-	-	-	...	-	-	-	-	-	-	-	-	-	-
4426	-	-	-	-	-	-	-	-	-	I	...	-	-	-	L	-	-	-	-	-	-
4432	-	-	-	-	-	-	-	-	-	-	...	-	-	-	-	-	-	-	-	-	-
4482	-	-	-	-	-	-	-	-	-	V	...	M	-	-	L	-	-	-	-	-	-