In [1]:
import pandas as pd
import numpy as np
from Bio import AlignIO, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from collections import Counter
from Levenshtein import distance
from copy import deepcopy
from joblib import Parallel, delayed

In [2]:
protease_consensus = SeqIO.read('sequences/hiv-protease-consensus.fasta', 'fasta')
protease_sequences = [s for s in SeqIO.parse('sequences/HIV-1_pol-protease.fasta', 'fasta') if len(s.seq) >= len(protease_consensus.seq)]
rt_consensus = SeqIO.read('sequences/hiv-rt-consensus.fasta', 'fasta')
rt_sequences = [s for s in SeqIO.parse('sequences/HIV-1_pol-RT.fasta', 'fasta') if len(s.seq) >= len(rt_consensus.seq)]

In [3]:
len(rt_consensus)


Out[3]:
560

In [4]:
Counter([len(s) for s in rt_sequences])


Out[4]:
Counter({560: 624,
         561: 26,
         562: 224,
         563: 1,
         564: 2,
         565: 1,
         566: 5,
         567: 1,
         568: 1,
         569: 5,
         570: 10,
         571: 16,
         573: 5,
         574: 1,
         575: 1,
         578: 4,
         579: 1,
         580: 2,
         581: 2,
         582: 2,
         583: 7,
         584: 20,
         585: 6,
         586: 5,
         587: 5,
         588: 8,
         589: 8,
         590: 8,
         591: 12,
         592: 20,
         593: 15,
         594: 47,
         595: 54,
         596: 24,
         597: 21,
         598: 26,
         599: 43,
         600: 9,
         601: 13,
         602: 8,
         603: 2,
         604: 9,
         605: 6,
         606: 8,
         607: 7,
         608: 3,
         609: 4,
         610: 1,
         611: 2,
         612: 3,
         613: 2,
         615: 5,
         616: 1,
         617: 2,
         618: 4,
         619: 4,
         620: 2,
         621: 1,
         622: 1,
         623: 3,
         624: 1,
         625: 1,
         627: 1,
         628: 3,
         629: 2,
         630: 1,
         631: 1,
         632: 8,
         633: 65,
         634: 7,
         635: 4,
         636: 5,
         637: 4,
         639: 3,
         640: 4,
         641: 2,
         642: 3,
         643: 2,
         644: 1,
         646: 2,
         647: 1,
         649: 1,
         650: 6,
         651: 9,
         652: 1,
         653: 4,
         654: 3,
         655: 6,
         656: 6,
         657: 159,
         658: 29,
         659: 922,
         660: 1,
         661: 1,
         663: 3,
         664: 3,
         665: 2,
         666: 3,
         667: 3,
         668: 10,
         669: 43,
         670: 97,
         671: 2,
         672: 5,
         673: 6,
         674: 2,
         675: 1,
         676: 2,
         677: 2,
         678: 1,
         679: 1,
         680: 2,
         683: 2,
         684: 1,
         685: 1,
         686: 2,
         687: 2,
         688: 1,
         689: 4,
         690: 11,
         691: 21,
         692: 6,
         693: 1,
         694: 3,
         695: 2,
         696: 1,
         697: 3,
         698: 4,
         699: 2,
         701: 4,
         702: 1,
         703: 2,
         704: 1,
         705: 1,
         707: 2,
         708: 2,
         710: 2,
         711: 2,
         713: 1,
         714: 1,
         716: 2,
         717: 3,
         719: 1,
         722: 1,
         725: 3,
         726: 1,
         727: 1,
         733: 1,
         740: 3,
         741: 22,
         746: 5,
         759: 1,
         760: 1,
         761: 1,
         763: 1,
         768: 2,
         770: 2,
         771: 1,
         775: 2,
         781: 2,
         785: 1,
         806: 1,
         811: 1,
         819: 1,
         822: 1,
         827: 1,
         831: 1,
         847: 4,
         848: 484,
         849: 1,
         857: 1,
         862: 1,
         864: 1,
         867: 2,
         868: 1,
         870: 1,
         871: 1,
         873: 1,
         877: 1,
         881: 3,
         882: 1,
         888: 1,
         896: 1,
         897: 2,
         900: 3,
         902: 2,
         905: 6,
         906: 2,
         907: 2,
         908: 1,
         909: 1,
         910: 1,
         912: 2,
         913: 1,
         914: 3,
         915: 1,
         917: 1,
         919: 1,
         920: 3,
         923: 1,
         927: 2,
         928: 1,
         929: 1,
         931: 3,
         933: 3,
         934: 2,
         936: 2,
         937: 4,
         938: 3,
         939: 5,
         940: 2,
         941: 1,
         942: 1,
         943: 6,
         944: 6,
         945: 21,
         946: 2,
         947: 118,
         949: 5,
         950: 70,
         951: 3,
         952: 3,
         954: 1,
         955: 1,
         957: 1,
         959: 1,
         960: 1,
         961: 2,
         962: 5,
         963: 7,
         964: 12,
         965: 16,
         966: 3,
         967: 6,
         968: 1,
         969: 2,
         971: 7,
         972: 59,
         973: 8,
         974: 5,
         975: 8,
         976: 70,
         977: 12,
         978: 45,
         979: 16,
         980: 38,
         981: 441,
         982: 8,
         983: 115,
         984: 171,
         985: 122,
         986: 40,
         987: 286,
         988: 163,
         989: 128,
         990: 86,
         991: 157,
         992: 96,
         993: 172,
         994: 82,
         995: 89,
         996: 200,
         997: 96,
         998: 245,
         999: 1248,
         1000: 752,
         1001: 433,
         1002: 1021,
         1003: 3570,
         1004: 332,
         1005: 352,
         1006: 546,
         1007: 217,
         1008: 209,
         1009: 104,
         1010: 90,
         1011: 55,
         1012: 31,
         1013: 34,
         1014: 10,
         1015: 57,
         1016: 4,
         1017: 9,
         1018: 4,
         1019: 9,
         1021: 3,
         1022: 1,
         1032: 1})

In [10]:
Counter([len(s) for s in protease_sequences])


Out[10]:
Counter({99: 23164,
         100: 1185,
         101: 18,
         102: 41,
         103: 148,
         104: 146,
         105: 38,
         106: 23,
         107: 47,
         108: 87,
         109: 138,
         110: 26,
         111: 145,
         112: 19,
         113: 90,
         114: 278,
         115: 229,
         116: 25,
         117: 26,
         118: 40,
         119: 123,
         120: 12,
         121: 29,
         122: 61,
         123: 10,
         124: 12,
         125: 13,
         126: 18,
         127: 27,
         128: 39,
         129: 134,
         130: 25,
         131: 118,
         132: 63,
         133: 41,
         134: 9,
         135: 12,
         136: 7,
         137: 13,
         138: 14,
         139: 12,
         140: 29,
         141: 37,
         142: 43,
         143: 19,
         144: 17,
         145: 29,
         146: 19,
         147: 16,
         148: 13,
         149: 30,
         150: 30,
         151: 288,
         152: 63,
         153: 50,
         154: 194,
         155: 361,
         156: 259,
         157: 65,
         158: 79,
         159: 66,
         160: 34,
         161: 59,
         162: 44,
         163: 56,
         164: 57,
         165: 93,
         166: 92,
         167: 34,
         168: 39,
         169: 62,
         170: 42,
         171: 64,
         172: 22,
         173: 46,
         174: 74,
         175: 53,
         176: 8,
         177: 6,
         178: 5,
         179: 12,
         180: 4,
         181: 5,
         182: 2,
         183: 4,
         184: 3,
         185: 5,
         186: 6,
         187: 4,
         188: 3,
         190: 1,
         191: 2,
         192: 5,
         193: 6,
         194: 18,
         195: 30,
         196: 3,
         197: 27,
         198: 13,
         199: 6,
         200: 9,
         201: 15,
         202: 4,
         203: 3,
         204: 5,
         205: 3,
         207: 3,
         208: 4,
         209: 1,
         210: 2,
         213: 2,
         214: 1,
         215: 4,
         217: 1,
         218: 2,
         219: 5,
         220: 4,
         221: 4,
         222: 1,
         223: 9,
         225: 3,
         226: 6,
         227: 2,
         229: 24,
         230: 14,
         231: 88,
         232: 10,
         233: 18,
         234: 7,
         235: 4,
         236: 3,
         237: 4,
         238: 39,
         239: 4,
         240: 18,
         241: 2,
         242: 2,
         243: 2,
         244: 4,
         245: 4,
         246: 3,
         247: 5,
         248: 3,
         249: 1,
         252: 2,
         253: 1,
         254: 3,
         255: 2,
         257: 1,
         258: 2,
         259: 2,
         260: 3,
         261: 2,
         262: 6,
         263: 2,
         265: 1,
         266: 4,
         267: 5,
         268: 11,
         269: 11,
         270: 2,
         271: 6,
         272: 71,
         273: 3,
         274: 4,
         275: 1,
         276: 4,
         277: 1,
         278: 8,
         279: 3,
         280: 10,
         281: 2,
         282: 2,
         283: 3,
         284: 3,
         285: 6,
         286: 3,
         287: 7,
         288: 48,
         289: 10,
         290: 4,
         291: 2,
         292: 11,
         293: 4,
         294: 10,
         295: 291,
         296: 15,
         297: 16,
         298: 82,
         299: 60,
         300: 11,
         301: 20,
         302: 11,
         303: 3,
         304: 53,
         305: 6,
         306: 15,
         307: 7,
         308: 14,
         309: 21,
         310: 14,
         311: 20,
         312: 87,
         313: 313,
         314: 134,
         315: 59,
         316: 51,
         317: 280,
         318: 93,
         319: 450,
         320: 135,
         321: 298,
         322: 264,
         323: 1264,
         324: 94,
         325: 158,
         326: 139,
         327: 434,
         328: 4403,
         329: 1059,
         330: 79,
         331: 153,
         332: 1036,
         333: 767,
         334: 670,
         335: 874,
         336: 408,
         337: 891,
         338: 360,
         339: 3377,
         340: 383,
         341: 1658,
         342: 794,
         343: 768,
         344: 307,
         345: 553,
         346: 1926,
         347: 503,
         348: 496,
         349: 494,
         350: 833,
         351: 1737,
         352: 1037,
         353: 846,
         354: 702,
         355: 165,
         356: 679,
         357: 578,
         358: 575,
         359: 443,
         360: 426,
         361: 520,
         362: 758,
         363: 155,
         364: 221,
         365: 492,
         366: 764,
         367: 295,
         368: 1245,
         369: 263,
         370: 269,
         371: 196,
         372: 127,
         373: 293,
         374: 166,
         375: 162,
         376: 306,
         377: 247,
         378: 226,
         379: 549,
         380: 220,
         381: 370,
         382: 204,
         383: 172,
         384: 210,
         385: 221,
         386: 119,
         387: 442,
         388: 259,
         389: 159,
         390: 192,
         391: 140,
         392: 348,
         393: 158,
         394: 152,
         395: 438,
         396: 221,
         397: 864,
         398: 1070,
         399: 1105,
         400: 572,
         401: 415,
         402: 944,
         403: 1811,
         404: 9582,
         405: 187,
         406: 157,
         407: 137,
         408: 117,
         409: 426,
         410: 79,
         411: 142,
         412: 156,
         413: 1877,
         414: 123,
         415: 107,
         416: 159,
         417: 124,
         418: 353,
         419: 434,
         420: 589,
         421: 244,
         422: 496,
         423: 1546,
         424: 356,
         425: 273,
         426: 210,
         427: 162,
         428: 150,
         429: 328,
         430: 133,
         431: 442,
         432: 1142,
         433: 3266,
         434: 14764,
         435: 348,
         436: 122,
         437: 281,
         438: 164,
         439: 413,
         440: 365,
         441: 161,
         442: 139,
         443: 101,
         444: 174,
         445: 109,
         446: 102,
         447: 107,
         448: 98,
         449: 104,
         450: 146,
         451: 168,
         452: 215,
         453: 135,
         454: 110,
         455: 162,
         456: 157,
         457: 182,
         458: 164,
         459: 108,
         460: 41,
         461: 42,
         462: 50,
         463: 69,
         464: 22,
         465: 25,
         466: 80,
         467: 25,
         468: 27,
         469: 40,
         470: 253,
         471: 19,
         472: 27,
         473: 34,
         474: 26,
         475: 50,
         476: 24,
         477: 29,
         478: 17,
         479: 56,
         480: 25,
         481: 30,
         482: 44,
         483: 36,
         484: 36,
         485: 58,
         486: 59,
         487: 65,
         488: 243,
         489: 21,
         490: 20,
         491: 21,
         492: 41,
         493: 63,
         494: 99,
         495: 76,
         496: 138,
         497: 339,
         498: 838,
         499: 8129,
         500: 263,
         501: 118,
         502: 39,
         503: 28,
         504: 40,
         505: 151,
         506: 35,
         507: 17,
         508: 20,
         509: 27,
         510: 61,
         511: 22,
         512: 11,
         513: 15,
         514: 11,
         515: 12,
         516: 45,
         517: 12,
         518: 11,
         519: 19,
         520: 28,
         521: 17,
         522: 30,
         523: 125,
         524: 223,
         525: 120,
         526: 50,
         527: 11,
         528: 9,
         529: 13,
         530: 130,
         531: 14,
         532: 27,
         533: 27,
         534: 46,
         535: 46,
         536: 37,
         537: 147,
         538: 54,
         539: 84,
         540: 175,
         541: 17,
         542: 44,
         543: 82,
         544: 132,
         545: 67,
         546: 42,
         547: 90,
         548: 39,
         549: 15,
         550: 147,
         551: 123,
         552: 42,
         553: 27,
         554: 11,
         555: 30,
         556: 29,
         557: 10,
         558: 17,
         559: 17,
         560: 30,
         561: 20,
         562: 21,
         563: 11,
         564: 11,
         565: 12,
         566: 9,
         567: 4,
         568: 7,
         569: 10,
         570: 5,
         571: 8,
         572: 10,
         573: 14,
         574: 23,
         575: 16,
         576: 14,
         577: 17,
         578: 24,
         579: 32,
         580: 89,
         581: 47,
         582: 74,
         583: 82,
         584: 127,
         585: 113,
         586: 140,
         587: 128,
         588: 120,
         589: 99,
         590: 98,
         591: 73,
         592: 79,
         593: 50,
         594: 80,
         595: 69,
         596: 40,
         597: 32,
         598: 38,
         599: 43,
         600: 12,
         601: 17,
         602: 8,
         603: 2,
         604: 9,
         605: 7,
         606: 8,
         607: 7,
         608: 3,
         609: 4,
         610: 1,
         611: 2,
         612: 3,
         613: 2,
         615: 5,
         616: 1,
         617: 2,
         618: 4,
         619: 4,
         620: 2,
         621: 1,
         622: 1,
         623: 2,
         624: 1,
         625: 1,
         627: 1,
         628: 3,
         629: 2,
         630: 1,
         631: 1,
         632: 8,
         633: 65,
         634: 7,
         635: 4,
         636: 5,
         637: 4,
         639: 3,
         640: 4,
         641: 2,
         642: 3,
         643: 2,
         644: 1,
         646: 2,
         647: 1,
         649: 1,
         650: 6,
         651: 9,
         652: 1,
         653: 3,
         654: 3,
         655: 6,
         656: 6,
         657: 158,
         658: 26,
         659: 922,
         660: 1,
         661: 1,
         663: 3,
         664: 3,
         665: 2,
         666: 3,
         667: 3,
         668: 10,
         669: 41,
         670: 97,
         671: 2,
         672: 5,
         673: 6,
         674: 1,
         675: 1,
         676: 2,
         677: 2,
         678: 1,
         679: 1,
         680: 2,
         683: 2,
         684: 1,
         685: 1,
         686: 2,
         687: 2,
         688: 1,
         689: 4,
         690: 11,
         691: 21,
         692: 6,
         693: 1,
         694: 3,
         695: 2,
         696: 1,
         697: 3,
         698: 4,
         699: 2,
         701: 4,
         702: 1,
         703: 2,
         705: 1,
         707: 2,
         708: 2,
         710: 2,
         711: 2,
         713: 1,
         714: 1,
         716: 1,
         717: 3,
         719: 1,
         725: 1,
         726: 1,
         727: 1,
         733: 1,
         740: 1,
         746: 5,
         759: 1,
         763: 1,
         768: 2,
         770: 2,
         771: 1,
         775: 2,
         785: 1,
         806: 1,
         811: 1,
         819: 1,
         822: 1,
         831: 1,
         847: 1,
         857: 1,
         862: 1,
         864: 1,
         867: 2,
         868: 1,
         870: 1,
         871: 1,
         873: 1,
         881: 1,
         882: 1,
         888: 1,
         896: 1,
         897: 2,
         900: 3,
         902: 1,
         905: 5,
         906: 2,
         907: 2,
         908: 1,
         910: 1,
         912: 2,
         913: 1,
         914: 2,
         915: 1,
         917: 1,
         919: 1,
         920: 1,
         923: 1,
         927: 1,
         929: 1,
         931: 2,
         933: 1,
         934: 1,
         937: 1,
         938: 3,
         939: 2,
         940: 1,
         943: 2,
         944: 1,
         945: 4,
         946: 2,
         947: 118,
         949: 5,
         950: 70,
         951: 3,
         952: 3,
         954: 1,
         955: 1,
         957: 1,
         959: 1,
         960: 1,
         961: 2,
         962: 5,
         963: 7,
         964: 12,
         965: 16,
         966: 3,
         967: 6,
         968: 1,
         969: 2,
         971: 7,
         972: 59,
         973: 8,
         974: 5,
         975: 8,
         976: 70,
         977: 12,
         978: 45,
         979: 16,
         980: 38,
         981: 441,
         982: 8,
         983: 115,
         984: 171,
         985: 122,
         986: 40,
         987: 286,
         988: 163,
         989: 128,
         990: 86,
         991: 157,
         992: 96,
         993: 172,
         994: 82,
         995: 89,
         996: 200,
         997: 96,
         998: 245,
         999: 1248,
         1000: 752,
         1001: 433,
         1002: 1021,
         1003: 3570,
         1004: 332,
         1005: 352,
         1006: 546,
         1007: 217,
         1008: 209,
         1009: 104,
         1010: 90,
         1011: 55,
         1012: 31,
         1013: 34,
         1014: 10,
         1015: 57,
         1016: 4,
         1017: 9,
         1018: 4,
         1019: 9,
         1021: 3,
         1022: 1,
         1032: 1})

In [5]:
# Define a function to extract out the protease sequence, even if it's not equal to consensus.

def find_best_match(consensus, query):
    """
    Pass in two SeqRecords. The conversion to strings will be done inside the function.
    """
    query_str = str(query.seq) # <- this is greater than 99 a.a. long.

    min_dist = np.inf
    best_pos = 0
    for pos in range(len(query_str)):
        d_current = distance(str(consensus.seq), str(query_str[pos:pos + len(consensus.seq)]))
        if d_current < min_dist:
            min_dist = d_current
            best_pos = pos
          
    matching_query = SeqRecord(seq=query.seq[best_pos:best_pos+len(consensus.seq)], id=query.id, description=query.description, name=query.name)
    return matching_query

In [6]:
cleaned_protease_sequences = Parallel(n_jobs=-1)(delayed(find_best_match)(protease_consensus, s) for s in protease_sequences)
cleaned_protease_sequences = [s for s in cleaned_protease_sequences if len(s.seq) == len(protease_consensus)]

In [7]:
cleaned_rt_sequences = Parallel(n_jobs=-1)(delayed(find_best_match)(rt_consensus, s) for s in rt_sequences)
cleaned_rt_sequences = [s for s in cleaned_rt_sequences if len(s.seq) == len(rt_consensus)]

In [11]:
SeqIO.write(cleaned_protease_sequences, 'sequences/HIV1-protease.fasta', 'fasta')


Out[11]:
156223

In [8]:
SeqIO.write(cleaned_rt_sequences, 'sequences/HIV1-RT.fasta', 'fasta')


Out[8]:
15018

In [9]:
protease_sequences[101236].seq[26:26+99]


Out[9]:
Seq('PQITLWQRPLVTIKIGGQIKEALLDTGADDTVLEDMDLPGRWKPKMIGGIGGFI...LNF', SingleLetterAlphabet())

In [12]:
cleaned_protease_sequences[0:5]


Out[12]:
[SeqRecord(seq=Seq('PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKMIGGIGGFI...LNF', SingleLetterAlphabet()), id='B.FR.1983.IIIB_LAI.A04321', name='B.FR.1983.IIIB_LAI.A04321', description='B.FR.1983.IIIB_LAI.A04321', dbxrefs=[]),
 SeqRecord(seq=Seq('PQITLWQRPLVAIKIGGQLKEALLDTGADDTVLEEMNLPGKWKPKMIGGIGGFI...LNF', SingleLetterAlphabet()), id='D.CD.1983.ELI_patent.A07108', name='D.CD.1983.ELI_patent.A07108', description='D.CD.1983.ELI_patent.A07108', dbxrefs=[]),
 SeqRecord(seq=Seq('PQITLWQRPVVTVRVGGQLKEALLDTGADDTVLEEINLPGKWKPKMIGGIGGFI...LNF', SingleLetterAlphabet()), id='A1DK.CD.1985.MAL_patent.A07116', name='A1DK.CD.1985.MAL_patent.A07116', description='A1DK.CD.1985.MAL_patent.A07116', dbxrefs=[]),
 SeqRecord(seq=Seq('PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKMIGGIGGFI...LNF', SingleLetterAlphabet()), id='B.FR.1983.LAI-J19.A07867', name='B.FR.1983.LAI-J19.A07867', description='B.FR.1983.LAI-J19.A07867', dbxrefs=[]),
 SeqRecord(seq=Seq('PQITLWQRPLVAIKIGGQLKEALLDTGADDTVLEEMNLPGKWKPKMIGGIGGFI...LNF', SingleLetterAlphabet()), id='D.CD.1983.ELI_patent.A14116', name='D.CD.1983.ELI_patent.A14116', description='D.CD.1983.ELI_patent.A14116', dbxrefs=[])]

In [ ]: