notebook.community

Edit and run



In [5]:

    
# In this notebook, we are going to look through the HIV-1 protease and reverse transcriptase sequence data. 
# The goal is to determine a strategy for downsampling sequences for phylogenetic tree construction



In [6]:

    
from Bio import SeqIO
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline



In [7]:

    
proteases = [s for s in SeqIO.parse('sequences/HIV1-protease.fasta', 'fasta')]
len(proteases)









    Out[7]:





156303



In [8]:

    
rts = [s for s in SeqIO.parse('sequences/HIV1-RT.fasta', 'fasta')]
len(rts)









    Out[8]:





15887



In [9]:

    
def extract_metadata(sequences):
    """
    The metadata structure is as such:
    [Subtype].[Country].[Year].[Name].[Accession]
    """
    prot_metadata = []
    for s in sequences:
        metadata = s.id.split('.')
        data = dict()
        data['subtype'] = metadata[0]
        data['country'] = metadata[1]
        data['year'] = metadata[2]
        data['name'] = metadata[3]
        data['accession'] = metadata[4]

        prot_metadata.append(data)

    return pd.DataFrame(prot_metadata).replace('-', np.nan)

rt_metadf = extract_metadata(rts)
protease_metadf = extract_metadata(proteases)



In [10]:

    
rt_metadf.to_csv('csv/RT-all_metadata.csv')
rt_metadf









    Out[10]:






  
    
      
      accession
      country
      name
      subtype
      year
    
  
  
    
      0
      K03455
      FR
      HXB2-LAI-IIIB-BRU
      B
      1983
    
    
      1
      A04321
      FR
      IIIB_LAI
      B
      1983
    
    
      2
      A07108
      CD
      ELI_patent
      D
      1983
    
    
      3
      A07116
      CD
      MAL_patent
      A1DK
      1985
    
    
      4
      A07867
      FR
      LAI-J19
      B
      1983
    
    
      5
      A14116
      CD
      ELI_patent
      D
      1983
    
    
      6
      A34828
      CD
      NDK_patent
      D
      1983
    
    
      7
      AB023804
      IN
      93IN101
      C
      1993
    
    
      8
      AB032740
      TH
      95TNIH022
      01_AE
      1995
    
    
      9
      AB032741
      TH
      95TNIH047
      01_AE
      1995
    
    
      10
      AB049811
      GH
      97GH-AG1
      02_AG
      1997
    
    
      11
      AB052867
      GH
      97GH-AG2
      02A1
      1997
    
    
      12
      AB052995
      JP
      93JP_NH1
      01_AE
      1993
    
    
      13
      AB070352
      JP
      NH25_93JPNH25T_93JP_NH2_5T
      01_AE
      1993
    
    
      14
      AB078005
      US
      ARES2
      B
      1997
    
    
      15
      AB097865
      MM
      mIDU502
      01B
      2000
    
    
      16
      AB097866
      MM
      mCSW503
      01BC
      2000
    
    
      17
      AB097867
      MM
      mCSW104
      01B
      1999
    
    
      18
      AB097868
      MM
      mIDU107
      01BC
      1999
    
    
      19
      AB097869
      MM
      mIDU106
      BC
      1999
    
    
      20
      AB097870
      MM
      mSTD101
      B
      1999
    
    
      21
      AB097871
      MM
      mIDU101_3
      C
      1999
    
    
      22
      AB097872
      MM
      mCSW105
      01A1
      1999
    
    
      23
      AB097873
      MM
      mIDU103
      BC
      1999
    
    
      24
      AB098330
      UG
      UG031
      A1
      NaN
    
    
      25
      AB098331
      UG
      UG031
      A1
      NaN
    
    
      26
      AB098332
      UG
      UG029
      A1
      1992
    
    
      27
      AB098333
      UG
      UG029
      A1
      1992
    
    
      28
      AB220944
      TH
      93TH051
      01_AE
      1993
    
    
      29
      AB220945
      TH
      93TH054
      01_AE
      1993
    
    
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      15857
      U52953
      BR
      BR025-d
      C
      1992
    
    
      15858
      U53870
      FR
      PFA660p12_LAI
      B
      1983
    
    
      15859
      U53871
      FR
      PFA660AZT02P29_IIIB
      B
      1983
    
    
      15860
      U54771
      TH
      CM240
      01_AE
      1990
    
    
      15861
      U63632
      US
      JRFL_JR_FL
      B
      1986
    
    
      15862
      U69584
      US
      WCIPR
      B
      1985
    
    
      15863
      U69585
      US
      WCIPR
      B
      1985
    
    
      15864
      U69586
      US
      WCIPR
      B
      1985
    
    
      15865
      U69587
      US
      WCIPR
      B
      1985
    
    
      15866
      U69588
      US
      WCIPR
      B
      1985
    
    
      15867
      U69589
      US
      WCIPR
      B
      1990
    
    
      15868
      U69590
      US
      WCIPR
      B
      1990
    
    
      15869
      U69591
      US
      WCIPR
      B
      1990
    
    
      15870
      U69592
      US
      WCIPR
      B
      1990
    
    
      15871
      U69593
      US
      WCIPR
      B
      1990
    
    
      15872
      U71182
      CN
      RL42
      B
      NaN
    
    
      15873
      U76035
      CD
      Z321_Z321B
      A1GU
      1976
    
    
      15874
      U86780
      ZM
      ZAM184
      A2C
      1990
    
    
      15875
      U88822
      CD
      84ZR085
      D
      1984
    
    
      15876
      U88823
      RW
      92RW009_06
      A1C
      1992
    
    
      15877
      U88824
      UG
      94UG114
      D
      1994
    
    
      15878
      U88825
      NG
      92NG003
      A1G
      1992
    
    
      15879
      U88826
      NG
      92NG083_JV10832
      G
      1992
    
    
      15880
      U92049
      ET
      E3099G
      A1C
      1991
    
    
      15881
      U97171
      ES
      ESP1
      O
      NaN
    
    
      15882
      X01762
      FR
      REHTLV3_LAI_IIIB
      B
      1983
    
    
      15883
      X04415
      CD
      MAL_MALCG
      A1DK
      1985
    
    
      15884
      Z11530
      FR
      F12CG
      B
      NaN
    
    
      15885
      Z37142
      AU
      RTPX
      B
      NaN
    
    
      15886
      Z37143
      AU
      RTPY
      B
      NaN
    
  

15887 rows × 5 columns



In [11]:

    
protease_metadf.to_csv('csv/Protease-all_metadata.csv')
protease_metadf









    Out[11]:






  
    
      
      accession
      country
      name
      subtype
      year
    
  
  
    
      0
      A04321
      FR
      IIIB_LAI
      B
      1983
    
    
      1
      A07108
      CD
      ELI_patent
      D
      1983
    
    
      2
      A07116
      CD
      MAL_patent
      A1DK
      1985
    
    
      3
      A07867
      FR
      LAI-J19
      B
      1983
    
    
      4
      A14116
      CD
      ELI_patent
      D
      1983
    
    
      5
      A34828
      CD
      NDK_patent
      D
      1983
    
    
      6
      AB020911
      JP
      R1
      NaN
      NaN
    
    
      7
      AB020912
      JP
      R2
      NaN
      NaN
    
    
      8
      AB020913
      JP
      R3
      NaN
      NaN
    
    
      9
      AB020914
      JP
      R4
      NaN
      NaN
    
    
      10
      AB020915
      JP
      R5
      NaN
      NaN
    
    
      11
      AB020916
      JP
      NR1
      NaN
      NaN
    
    
      12
      AB020917
      JP
      NR2
      NaN
      NaN
    
    
      13
      AB020918
      JP
      NR3
      NaN
      NaN
    
    
      14
      AB020919
      JP
      NR4
      NaN
      NaN
    
    
      15
      AB020920
      JP
      NR5
      NaN
      NaN
    
    
      16
      AB020921
      JP
      NR6
      NaN
      NaN
    
    
      17
      AB020922
      JP
      NR7_BEFORE
      NaN
      NaN
    
    
      18
      AB020923
      JP
      NR7_6_MONTHS_AFTER
      NaN
      NaN
    
    
      19
      AB020924
      JP
      NR8_BEFORE
      NaN
      NaN
    
    
      20
      AB020925
      JP
      NR8_8_MONTHS_AFTER
      NaN
      NaN
    
    
      21
      AB023804
      IN
      93IN101
      C
      1993
    
    
      22
      AB025113
      JP
      1_970121
      B
      1997
    
    
      23
      AB025114
      JP
      1_970616
      B
      1997
    
    
      24
      AB025115
      JP
      1_970616
      B
      1997
    
    
      25
      AB025116
      JP
      1_970828
      B
      1997
    
    
      26
      AB025117
      JP
      1_970828
      B
      1997
    
    
      27
      AB025118
      JP
      1_971009
      B
      1997
    
    
      28
      AB025119
      JP
      1_971113
      B
      1997
    
    
      29
      AB025120
      JP
      1_971113
      B
      1997
    
    
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      156273
      U72022
      US
      PTUW52
      B
      NaN
    
    
      156274
      U72023
      US
      PTUW52
      B
      NaN
    
    
      156275
      U72024
      US
      PTUW52
      B
      NaN
    
    
      156276
      U72025
      US
      PTUW52
      B
      NaN
    
    
      156277
      U72026
      US
      PTUW52
      B
      NaN
    
    
      156278
      U76035
      CD
      Z321_Z321B
      A1GU
      1976
    
    
      156279
      U78097
      BR
      BHPROT_10_1
      B
      NaN
    
    
      156280
      U83698
      BR
      BR19-pol
      B
      NaN
    
    
      156281
      U83699
      BR
      BR20-pol
      B
      NaN
    
    
      156282
      U83700
      BR
      BR30-pol
      B
      NaN
    
    
      156283
      U86780
      ZM
      ZAM184
      A2C
      1990
    
    
      156284
      U88822
      CD
      84ZR085
      D
      1984
    
    
      156285
      U88823
      RW
      92RW009_06
      A1C
      1992
    
    
      156286
      U88824
      UG
      94UG114
      D
      1994
    
    
      156287
      U88825
      NG
      92NG003
      A1G
      1992
    
    
      156288
      U88826
      NG
      92NG083_JV10832
      G
      1992
    
    
      156289
      U92049
      ET
      E3099G
      A1C
      1991
    
    
      156290
      X01762
      FR
      REHTLV3_LAI_IIIB
      B
      1983
    
    
      156291
      X04415
      CD
      MAL_MALCG
      A1DK
      1985
    
    
      156292
      Y14496
      FR
      BCF01_pol
      O
      NaN
    
    
      156293
      Y14497
      FR
      BCF02_pol
      O
      NaN
    
    
      156294
      Y14498
      FR
      BCF03_pol
      O
      1990
    
    
      156295
      Y14499
      FR
      BCF06_pol
      O
      1994
    
    
      156296
      Y14500
      FR
      BCF07_prot
      O
      NaN
    
    
      156297
      Y14501
      FR
      BCF08
      O
      NaN
    
    
      156298
      Y14502
      FR
      BCF11_pol
      O
      NaN
    
    
      156299
      Y14503
      FR
      BCF13
      O
      1995
    
    
      156300
      Y14504
      FR
      RUD
      O
      NaN
    
    
      156301
      Y14505
      FR
      VAU_HAM
      O
      1992
    
    
      156302
      Z11530
      FR
      F12CG
      B
      NaN
    
  

156303 rows × 5 columns



In [12]:

    
rt_metadf['year'].value_counts().plot(kind='bar')
rt_metadf['year'].value_counts().to_csv('csv/RT-num_per_year.csv')



In [13]:

    
fig = plt.figure(figsize=(15,3))
rt_metadf['country'].value_counts().plot(kind='bar')
rt_metadf['country'].value_counts().to_csv('csv/RT-num_per_country.csv')



In [14]:

    
protease_metadf['year'].value_counts().plot(kind='bar')
protease_metadf['year'].value_counts().to_csv('csv/Protease-num_per_year.csv')



In [15]:

    
fig = plt.figure(figsize=(15,3))
protease_metadf['country'].value_counts().plot(kind='bar')
protease_metadf['country'].value_counts().to_csv('csv/Protease-num_per_country.csv')

Downsampling Notes

After discussion with Nichola about the downsampling strategy, we have settled on the following:

Use sequences from the years 2003-2007 inclusive.
Use sequences from the following countries: US, BR, JP, ZA, ES



In [23]:

    
# Code for downsampling.
# Recall that the metadata structure is as such:
# 
#     [Subtype].[Country].[Year].[Name].[Accession]
# 
# We will use a dictionary to store the downsampled sequences.

import numpy as np
from collections import defaultdict
from itertools import product

years = np.arange(2003, 2008, 1)
countries = ['US', 'BR', 'JP', 'ZA', 'ES']

proteases_grouped = defaultdict()
for year, country in product(years, countries):
    proteases_grouped[(year, country)] = []

# Group the sequences first.
for s in proteases:
    country = s.id.split('.')[1]
    try:
        year = int(s.id.split('.')[2])
    except ValueError:
        year = 0
    if country in countries and year in years:
        proteases_grouped[(year, country)].append(s)



In [38]:

    
import random

random.seed(1) # for reproducibility

# Perform the downsampling
proteases_downsampled = defaultdict(list)
for k, v in proteases_grouped.items():
    proteases_downsampled[k] = random.sample(v, 10)
    
# Write the downsampled sequences to disk.
protease_sequences = []
for k, v in proteases_downsampled.items():
    protease_sequences.extend(v)
    
SeqIO.write(protease_sequences, 'sequences/proteases_downsampled.fasta', 'fasta')









    Out[38]:





250



In [ ]:



In [ ]:

	accession	country	name	subtype	year
0	K03455	FR	HXB2-LAI-IIIB-BRU	B	1983
1	A04321	FR	IIIB_LAI	B	1983
2	A07108	CD	ELI_patent	D	1983
3	A07116	CD	MAL_patent	A1DK	1985
4	A07867	FR	LAI-J19	B	1983
5	A14116	CD	ELI_patent	D	1983
6	A34828	CD	NDK_patent	D	1983
7	AB023804	IN	93IN101	C	1993
8	AB032740	TH	95TNIH022	01_AE	1995
9	AB032741	TH	95TNIH047	01_AE	1995
10	AB049811	GH	97GH-AG1	02_AG	1997
11	AB052867	GH	97GH-AG2	02A1	1997
12	AB052995	JP	93JP_NH1	01_AE	1993
13	AB070352	JP	NH25_93JPNH25T_93JP_NH2_5T	01_AE	1993
14	AB078005	US	ARES2	B	1997
15	AB097865	MM	mIDU502	01B	2000
16	AB097866	MM	mCSW503	01BC	2000
17	AB097867	MM	mCSW104	01B	1999
18	AB097868	MM	mIDU107	01BC	1999
19	AB097869	MM	mIDU106	BC	1999
20	AB097870	MM	mSTD101	B	1999
21	AB097871	MM	mIDU101_3	C	1999
22	AB097872	MM	mCSW105	01A1	1999
23	AB097873	MM	mIDU103	BC	1999
24	AB098330	UG	UG031	A1	NaN
25	AB098331	UG	UG031	A1	NaN
26	AB098332	UG	UG029	A1	1992
27	AB098333	UG	UG029	A1	1992
28	AB220944	TH	93TH051	01_AE	1993
29	AB220945	TH	93TH054	01_AE	1993
...	...	...	...	...	...
15857	U52953	BR	BR025-d	C	1992
15858	U53870	FR	PFA660p12_LAI	B	1983
15859	U53871	FR	PFA660AZT02P29_IIIB	B	1983
15860	U54771	TH	CM240	01_AE	1990
15861	U63632	US	JRFL_JR_FL	B	1986
15862	U69584	US	WCIPR	B	1985
15863	U69585	US	WCIPR	B	1985
15864	U69586	US	WCIPR	B	1985
15865	U69587	US	WCIPR	B	1985
15866	U69588	US	WCIPR	B	1985
15867	U69589	US	WCIPR	B	1990
15868	U69590	US	WCIPR	B	1990
15869	U69591	US	WCIPR	B	1990
15870	U69592	US	WCIPR	B	1990
15871	U69593	US	WCIPR	B	1990
15872	U71182	CN	RL42	B	NaN
15873	U76035	CD	Z321_Z321B	A1GU	1976
15874	U86780	ZM	ZAM184	A2C	1990
15875	U88822	CD	84ZR085	D	1984
15876	U88823	RW	92RW009_06	A1C	1992
15877	U88824	UG	94UG114	D	1994
15878	U88825	NG	92NG003	A1G	1992
15879	U88826	NG	92NG083_JV10832	G	1992
15880	U92049	ET	E3099G	A1C	1991
15881	U97171	ES	ESP1	O	NaN
15882	X01762	FR	REHTLV3_LAI_IIIB	B	1983
15883	X04415	CD	MAL_MALCG	A1DK	1985
15884	Z11530	FR	F12CG	B	NaN
15885	Z37142	AU	RTPX	B	NaN
15886	Z37143	AU	RTPY	B	NaN