In [1]:
import pandas as pd
import numpy as np
from Bio import AlignIO, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from collections import Counter
from Levenshtein import distance
from copy import deepcopy
from joblib import Parallel, delayed
In [2]:
protease_consensus = SeqIO.read('sequences/hiv-protease-consensus.fasta', 'fasta')
protease_sequences = [s for s in SeqIO.parse('sequences/HIV-1_pol-protease.fasta', 'fasta') if len(s.seq) >= len(protease_consensus.seq)]
rt_consensus = SeqIO.read('sequences/hiv-rt-consensus.fasta', 'fasta')
rt_sequences = [s for s in SeqIO.parse('sequences/HIV-1_pol-RT.fasta', 'fasta') if len(s.seq) >= len(rt_consensus.seq)]
In [3]:
len(rt_consensus)
Out[3]:
560
In [4]:
Counter([len(s) for s in rt_sequences])
Out[4]:
Counter({560: 624,
561: 26,
562: 224,
563: 1,
564: 2,
565: 1,
566: 5,
567: 1,
568: 1,
569: 5,
570: 10,
571: 16,
573: 5,
574: 1,
575: 1,
578: 4,
579: 1,
580: 2,
581: 2,
582: 2,
583: 7,
584: 20,
585: 6,
586: 5,
587: 5,
588: 8,
589: 8,
590: 8,
591: 12,
592: 20,
593: 15,
594: 47,
595: 54,
596: 24,
597: 21,
598: 26,
599: 43,
600: 9,
601: 13,
602: 8,
603: 2,
604: 9,
605: 6,
606: 8,
607: 7,
608: 3,
609: 4,
610: 1,
611: 2,
612: 3,
613: 2,
615: 5,
616: 1,
617: 2,
618: 4,
619: 4,
620: 2,
621: 1,
622: 1,
623: 3,
624: 1,
625: 1,
627: 1,
628: 3,
629: 2,
630: 1,
631: 1,
632: 8,
633: 65,
634: 7,
635: 4,
636: 5,
637: 4,
639: 3,
640: 4,
641: 2,
642: 3,
643: 2,
644: 1,
646: 2,
647: 1,
649: 1,
650: 6,
651: 9,
652: 1,
653: 4,
654: 3,
655: 6,
656: 6,
657: 159,
658: 29,
659: 922,
660: 1,
661: 1,
663: 3,
664: 3,
665: 2,
666: 3,
667: 3,
668: 10,
669: 43,
670: 97,
671: 2,
672: 5,
673: 6,
674: 2,
675: 1,
676: 2,
677: 2,
678: 1,
679: 1,
680: 2,
683: 2,
684: 1,
685: 1,
686: 2,
687: 2,
688: 1,
689: 4,
690: 11,
691: 21,
692: 6,
693: 1,
694: 3,
695: 2,
696: 1,
697: 3,
698: 4,
699: 2,
701: 4,
702: 1,
703: 2,
704: 1,
705: 1,
707: 2,
708: 2,
710: 2,
711: 2,
713: 1,
714: 1,
716: 2,
717: 3,
719: 1,
722: 1,
725: 3,
726: 1,
727: 1,
733: 1,
740: 3,
741: 22,
746: 5,
759: 1,
760: 1,
761: 1,
763: 1,
768: 2,
770: 2,
771: 1,
775: 2,
781: 2,
785: 1,
806: 1,
811: 1,
819: 1,
822: 1,
827: 1,
831: 1,
847: 4,
848: 484,
849: 1,
857: 1,
862: 1,
864: 1,
867: 2,
868: 1,
870: 1,
871: 1,
873: 1,
877: 1,
881: 3,
882: 1,
888: 1,
896: 1,
897: 2,
900: 3,
902: 2,
905: 6,
906: 2,
907: 2,
908: 1,
909: 1,
910: 1,
912: 2,
913: 1,
914: 3,
915: 1,
917: 1,
919: 1,
920: 3,
923: 1,
927: 2,
928: 1,
929: 1,
931: 3,
933: 3,
934: 2,
936: 2,
937: 4,
938: 3,
939: 5,
940: 2,
941: 1,
942: 1,
943: 6,
944: 6,
945: 21,
946: 2,
947: 118,
949: 5,
950: 70,
951: 3,
952: 3,
954: 1,
955: 1,
957: 1,
959: 1,
960: 1,
961: 2,
962: 5,
963: 7,
964: 12,
965: 16,
966: 3,
967: 6,
968: 1,
969: 2,
971: 7,
972: 59,
973: 8,
974: 5,
975: 8,
976: 70,
977: 12,
978: 45,
979: 16,
980: 38,
981: 441,
982: 8,
983: 115,
984: 171,
985: 122,
986: 40,
987: 286,
988: 163,
989: 128,
990: 86,
991: 157,
992: 96,
993: 172,
994: 82,
995: 89,
996: 200,
997: 96,
998: 245,
999: 1248,
1000: 752,
1001: 433,
1002: 1021,
1003: 3570,
1004: 332,
1005: 352,
1006: 546,
1007: 217,
1008: 209,
1009: 104,
1010: 90,
1011: 55,
1012: 31,
1013: 34,
1014: 10,
1015: 57,
1016: 4,
1017: 9,
1018: 4,
1019: 9,
1021: 3,
1022: 1,
1032: 1})
In [10]:
Counter([len(s) for s in protease_sequences])
Out[10]:
Counter({99: 23164,
100: 1185,
101: 18,
102: 41,
103: 148,
104: 146,
105: 38,
106: 23,
107: 47,
108: 87,
109: 138,
110: 26,
111: 145,
112: 19,
113: 90,
114: 278,
115: 229,
116: 25,
117: 26,
118: 40,
119: 123,
120: 12,
121: 29,
122: 61,
123: 10,
124: 12,
125: 13,
126: 18,
127: 27,
128: 39,
129: 134,
130: 25,
131: 118,
132: 63,
133: 41,
134: 9,
135: 12,
136: 7,
137: 13,
138: 14,
139: 12,
140: 29,
141: 37,
142: 43,
143: 19,
144: 17,
145: 29,
146: 19,
147: 16,
148: 13,
149: 30,
150: 30,
151: 288,
152: 63,
153: 50,
154: 194,
155: 361,
156: 259,
157: 65,
158: 79,
159: 66,
160: 34,
161: 59,
162: 44,
163: 56,
164: 57,
165: 93,
166: 92,
167: 34,
168: 39,
169: 62,
170: 42,
171: 64,
172: 22,
173: 46,
174: 74,
175: 53,
176: 8,
177: 6,
178: 5,
179: 12,
180: 4,
181: 5,
182: 2,
183: 4,
184: 3,
185: 5,
186: 6,
187: 4,
188: 3,
190: 1,
191: 2,
192: 5,
193: 6,
194: 18,
195: 30,
196: 3,
197: 27,
198: 13,
199: 6,
200: 9,
201: 15,
202: 4,
203: 3,
204: 5,
205: 3,
207: 3,
208: 4,
209: 1,
210: 2,
213: 2,
214: 1,
215: 4,
217: 1,
218: 2,
219: 5,
220: 4,
221: 4,
222: 1,
223: 9,
225: 3,
226: 6,
227: 2,
229: 24,
230: 14,
231: 88,
232: 10,
233: 18,
234: 7,
235: 4,
236: 3,
237: 4,
238: 39,
239: 4,
240: 18,
241: 2,
242: 2,
243: 2,
244: 4,
245: 4,
246: 3,
247: 5,
248: 3,
249: 1,
252: 2,
253: 1,
254: 3,
255: 2,
257: 1,
258: 2,
259: 2,
260: 3,
261: 2,
262: 6,
263: 2,
265: 1,
266: 4,
267: 5,
268: 11,
269: 11,
270: 2,
271: 6,
272: 71,
273: 3,
274: 4,
275: 1,
276: 4,
277: 1,
278: 8,
279: 3,
280: 10,
281: 2,
282: 2,
283: 3,
284: 3,
285: 6,
286: 3,
287: 7,
288: 48,
289: 10,
290: 4,
291: 2,
292: 11,
293: 4,
294: 10,
295: 291,
296: 15,
297: 16,
298: 82,
299: 60,
300: 11,
301: 20,
302: 11,
303: 3,
304: 53,
305: 6,
306: 15,
307: 7,
308: 14,
309: 21,
310: 14,
311: 20,
312: 87,
313: 313,
314: 134,
315: 59,
316: 51,
317: 280,
318: 93,
319: 450,
320: 135,
321: 298,
322: 264,
323: 1264,
324: 94,
325: 158,
326: 139,
327: 434,
328: 4403,
329: 1059,
330: 79,
331: 153,
332: 1036,
333: 767,
334: 670,
335: 874,
336: 408,
337: 891,
338: 360,
339: 3377,
340: 383,
341: 1658,
342: 794,
343: 768,
344: 307,
345: 553,
346: 1926,
347: 503,
348: 496,
349: 494,
350: 833,
351: 1737,
352: 1037,
353: 846,
354: 702,
355: 165,
356: 679,
357: 578,
358: 575,
359: 443,
360: 426,
361: 520,
362: 758,
363: 155,
364: 221,
365: 492,
366: 764,
367: 295,
368: 1245,
369: 263,
370: 269,
371: 196,
372: 127,
373: 293,
374: 166,
375: 162,
376: 306,
377: 247,
378: 226,
379: 549,
380: 220,
381: 370,
382: 204,
383: 172,
384: 210,
385: 221,
386: 119,
387: 442,
388: 259,
389: 159,
390: 192,
391: 140,
392: 348,
393: 158,
394: 152,
395: 438,
396: 221,
397: 864,
398: 1070,
399: 1105,
400: 572,
401: 415,
402: 944,
403: 1811,
404: 9582,
405: 187,
406: 157,
407: 137,
408: 117,
409: 426,
410: 79,
411: 142,
412: 156,
413: 1877,
414: 123,
415: 107,
416: 159,
417: 124,
418: 353,
419: 434,
420: 589,
421: 244,
422: 496,
423: 1546,
424: 356,
425: 273,
426: 210,
427: 162,
428: 150,
429: 328,
430: 133,
431: 442,
432: 1142,
433: 3266,
434: 14764,
435: 348,
436: 122,
437: 281,
438: 164,
439: 413,
440: 365,
441: 161,
442: 139,
443: 101,
444: 174,
445: 109,
446: 102,
447: 107,
448: 98,
449: 104,
450: 146,
451: 168,
452: 215,
453: 135,
454: 110,
455: 162,
456: 157,
457: 182,
458: 164,
459: 108,
460: 41,
461: 42,
462: 50,
463: 69,
464: 22,
465: 25,
466: 80,
467: 25,
468: 27,
469: 40,
470: 253,
471: 19,
472: 27,
473: 34,
474: 26,
475: 50,
476: 24,
477: 29,
478: 17,
479: 56,
480: 25,
481: 30,
482: 44,
483: 36,
484: 36,
485: 58,
486: 59,
487: 65,
488: 243,
489: 21,
490: 20,
491: 21,
492: 41,
493: 63,
494: 99,
495: 76,
496: 138,
497: 339,
498: 838,
499: 8129,
500: 263,
501: 118,
502: 39,
503: 28,
504: 40,
505: 151,
506: 35,
507: 17,
508: 20,
509: 27,
510: 61,
511: 22,
512: 11,
513: 15,
514: 11,
515: 12,
516: 45,
517: 12,
518: 11,
519: 19,
520: 28,
521: 17,
522: 30,
523: 125,
524: 223,
525: 120,
526: 50,
527: 11,
528: 9,
529: 13,
530: 130,
531: 14,
532: 27,
533: 27,
534: 46,
535: 46,
536: 37,
537: 147,
538: 54,
539: 84,
540: 175,
541: 17,
542: 44,
543: 82,
544: 132,
545: 67,
546: 42,
547: 90,
548: 39,
549: 15,
550: 147,
551: 123,
552: 42,
553: 27,
554: 11,
555: 30,
556: 29,
557: 10,
558: 17,
559: 17,
560: 30,
561: 20,
562: 21,
563: 11,
564: 11,
565: 12,
566: 9,
567: 4,
568: 7,
569: 10,
570: 5,
571: 8,
572: 10,
573: 14,
574: 23,
575: 16,
576: 14,
577: 17,
578: 24,
579: 32,
580: 89,
581: 47,
582: 74,
583: 82,
584: 127,
585: 113,
586: 140,
587: 128,
588: 120,
589: 99,
590: 98,
591: 73,
592: 79,
593: 50,
594: 80,
595: 69,
596: 40,
597: 32,
598: 38,
599: 43,
600: 12,
601: 17,
602: 8,
603: 2,
604: 9,
605: 7,
606: 8,
607: 7,
608: 3,
609: 4,
610: 1,
611: 2,
612: 3,
613: 2,
615: 5,
616: 1,
617: 2,
618: 4,
619: 4,
620: 2,
621: 1,
622: 1,
623: 2,
624: 1,
625: 1,
627: 1,
628: 3,
629: 2,
630: 1,
631: 1,
632: 8,
633: 65,
634: 7,
635: 4,
636: 5,
637: 4,
639: 3,
640: 4,
641: 2,
642: 3,
643: 2,
644: 1,
646: 2,
647: 1,
649: 1,
650: 6,
651: 9,
652: 1,
653: 3,
654: 3,
655: 6,
656: 6,
657: 158,
658: 26,
659: 922,
660: 1,
661: 1,
663: 3,
664: 3,
665: 2,
666: 3,
667: 3,
668: 10,
669: 41,
670: 97,
671: 2,
672: 5,
673: 6,
674: 1,
675: 1,
676: 2,
677: 2,
678: 1,
679: 1,
680: 2,
683: 2,
684: 1,
685: 1,
686: 2,
687: 2,
688: 1,
689: 4,
690: 11,
691: 21,
692: 6,
693: 1,
694: 3,
695: 2,
696: 1,
697: 3,
698: 4,
699: 2,
701: 4,
702: 1,
703: 2,
705: 1,
707: 2,
708: 2,
710: 2,
711: 2,
713: 1,
714: 1,
716: 1,
717: 3,
719: 1,
725: 1,
726: 1,
727: 1,
733: 1,
740: 1,
746: 5,
759: 1,
763: 1,
768: 2,
770: 2,
771: 1,
775: 2,
785: 1,
806: 1,
811: 1,
819: 1,
822: 1,
831: 1,
847: 1,
857: 1,
862: 1,
864: 1,
867: 2,
868: 1,
870: 1,
871: 1,
873: 1,
881: 1,
882: 1,
888: 1,
896: 1,
897: 2,
900: 3,
902: 1,
905: 5,
906: 2,
907: 2,
908: 1,
910: 1,
912: 2,
913: 1,
914: 2,
915: 1,
917: 1,
919: 1,
920: 1,
923: 1,
927: 1,
929: 1,
931: 2,
933: 1,
934: 1,
937: 1,
938: 3,
939: 2,
940: 1,
943: 2,
944: 1,
945: 4,
946: 2,
947: 118,
949: 5,
950: 70,
951: 3,
952: 3,
954: 1,
955: 1,
957: 1,
959: 1,
960: 1,
961: 2,
962: 5,
963: 7,
964: 12,
965: 16,
966: 3,
967: 6,
968: 1,
969: 2,
971: 7,
972: 59,
973: 8,
974: 5,
975: 8,
976: 70,
977: 12,
978: 45,
979: 16,
980: 38,
981: 441,
982: 8,
983: 115,
984: 171,
985: 122,
986: 40,
987: 286,
988: 163,
989: 128,
990: 86,
991: 157,
992: 96,
993: 172,
994: 82,
995: 89,
996: 200,
997: 96,
998: 245,
999: 1248,
1000: 752,
1001: 433,
1002: 1021,
1003: 3570,
1004: 332,
1005: 352,
1006: 546,
1007: 217,
1008: 209,
1009: 104,
1010: 90,
1011: 55,
1012: 31,
1013: 34,
1014: 10,
1015: 57,
1016: 4,
1017: 9,
1018: 4,
1019: 9,
1021: 3,
1022: 1,
1032: 1})
In [5]:
# Define a function to extract out the protease sequence, even if it's not equal to consensus.
def find_best_match(consensus, query):
"""
Pass in two SeqRecords. The conversion to strings will be done inside the function.
"""
query_str = str(query.seq) # <- this is greater than 99 a.a. long.
min_dist = np.inf
best_pos = 0
for pos in range(len(query_str)):
d_current = distance(str(consensus.seq), str(query_str[pos:pos + len(consensus.seq)]))
if d_current < min_dist:
min_dist = d_current
best_pos = pos
matching_query = SeqRecord(seq=query.seq[best_pos:best_pos+len(consensus.seq)], id=query.id, description=query.description, name=query.name)
return matching_query
In [6]:
cleaned_protease_sequences = Parallel(n_jobs=-1)(delayed(find_best_match)(protease_consensus, s) for s in protease_sequences)
cleaned_protease_sequences = [s for s in cleaned_protease_sequences if len(s.seq) == len(protease_consensus)]
In [7]:
cleaned_rt_sequences = Parallel(n_jobs=-1)(delayed(find_best_match)(rt_consensus, s) for s in rt_sequences)
cleaned_rt_sequences = [s for s in cleaned_rt_sequences if len(s.seq) == len(rt_consensus)]
In [11]:
SeqIO.write(cleaned_protease_sequences, 'sequences/HIV1-protease.fasta', 'fasta')
Out[11]:
156223
In [8]:
SeqIO.write(cleaned_rt_sequences, 'sequences/HIV1-RT.fasta', 'fasta')
Out[8]:
15018
In [9]:
protease_sequences[101236].seq[26:26+99]
Out[9]:
Seq('PQITLWQRPLVTIKIGGQIKEALLDTGADDTVLEDMDLPGRWKPKMIGGIGGFI...LNF', SingleLetterAlphabet())
In [12]:
cleaned_protease_sequences[0:5]
Out[12]:
[SeqRecord(seq=Seq('PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKMIGGIGGFI...LNF', SingleLetterAlphabet()), id='B.FR.1983.IIIB_LAI.A04321', name='B.FR.1983.IIIB_LAI.A04321', description='B.FR.1983.IIIB_LAI.A04321', dbxrefs=[]),
SeqRecord(seq=Seq('PQITLWQRPLVAIKIGGQLKEALLDTGADDTVLEEMNLPGKWKPKMIGGIGGFI...LNF', SingleLetterAlphabet()), id='D.CD.1983.ELI_patent.A07108', name='D.CD.1983.ELI_patent.A07108', description='D.CD.1983.ELI_patent.A07108', dbxrefs=[]),
SeqRecord(seq=Seq('PQITLWQRPVVTVRVGGQLKEALLDTGADDTVLEEINLPGKWKPKMIGGIGGFI...LNF', SingleLetterAlphabet()), id='A1DK.CD.1985.MAL_patent.A07116', name='A1DK.CD.1985.MAL_patent.A07116', description='A1DK.CD.1985.MAL_patent.A07116', dbxrefs=[]),
SeqRecord(seq=Seq('PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKMIGGIGGFI...LNF', SingleLetterAlphabet()), id='B.FR.1983.LAI-J19.A07867', name='B.FR.1983.LAI-J19.A07867', description='B.FR.1983.LAI-J19.A07867', dbxrefs=[]),
SeqRecord(seq=Seq('PQITLWQRPLVAIKIGGQLKEALLDTGADDTVLEEMNLPGKWKPKMIGGIGGFI...LNF', SingleLetterAlphabet()), id='D.CD.1983.ELI_patent.A14116', name='D.CD.1983.ELI_patent.A14116', description='D.CD.1983.ELI_patent.A14116', dbxrefs=[])]
In [ ]:
Content source: ericmjl/systems-microbiology-hiv
Similar notebooks: