In [5]:
# In this notebook, we are going to look through the HIV-1 protease and reverse transcriptase sequence data. 
# The goal is to determine a strategy for downsampling sequences for phylogenetic tree construction

In [6]:
from Bio import SeqIO
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [7]:
proteases = [s for s in SeqIO.parse('sequences/HIV1-protease.fasta', 'fasta')]
len(proteases)


Out[7]:
156303

In [8]:
rts = [s for s in SeqIO.parse('sequences/HIV1-RT.fasta', 'fasta')]
len(rts)


Out[8]:
15887

In [9]:
def extract_metadata(sequences):
    """
    The metadata structure is as such:
    [Subtype].[Country].[Year].[Name].[Accession]
    """
    prot_metadata = []
    for s in sequences:
        metadata = s.id.split('.')
        data = dict()
        data['subtype'] = metadata[0]
        data['country'] = metadata[1]
        data['year'] = metadata[2]
        data['name'] = metadata[3]
        data['accession'] = metadata[4]

        prot_metadata.append(data)

    return pd.DataFrame(prot_metadata).replace('-', np.nan)

rt_metadf = extract_metadata(rts)
protease_metadf = extract_metadata(proteases)

In [10]:
rt_metadf.to_csv('csv/RT-all_metadata.csv')
rt_metadf


Out[10]:
accession country name subtype year
0 K03455 FR HXB2-LAI-IIIB-BRU B 1983
1 A04321 FR IIIB_LAI B 1983
2 A07108 CD ELI_patent D 1983
3 A07116 CD MAL_patent A1DK 1985
4 A07867 FR LAI-J19 B 1983
5 A14116 CD ELI_patent D 1983
6 A34828 CD NDK_patent D 1983
7 AB023804 IN 93IN101 C 1993
8 AB032740 TH 95TNIH022 01_AE 1995
9 AB032741 TH 95TNIH047 01_AE 1995
10 AB049811 GH 97GH-AG1 02_AG 1997
11 AB052867 GH 97GH-AG2 02A1 1997
12 AB052995 JP 93JP_NH1 01_AE 1993
13 AB070352 JP NH25_93JPNH25T_93JP_NH2_5T 01_AE 1993
14 AB078005 US ARES2 B 1997
15 AB097865 MM mIDU502 01B 2000
16 AB097866 MM mCSW503 01BC 2000
17 AB097867 MM mCSW104 01B 1999
18 AB097868 MM mIDU107 01BC 1999
19 AB097869 MM mIDU106 BC 1999
20 AB097870 MM mSTD101 B 1999
21 AB097871 MM mIDU101_3 C 1999
22 AB097872 MM mCSW105 01A1 1999
23 AB097873 MM mIDU103 BC 1999
24 AB098330 UG UG031 A1 NaN
25 AB098331 UG UG031 A1 NaN
26 AB098332 UG UG029 A1 1992
27 AB098333 UG UG029 A1 1992
28 AB220944 TH 93TH051 01_AE 1993
29 AB220945 TH 93TH054 01_AE 1993
... ... ... ... ... ...
15857 U52953 BR BR025-d C 1992
15858 U53870 FR PFA660p12_LAI B 1983
15859 U53871 FR PFA660AZT02P29_IIIB B 1983
15860 U54771 TH CM240 01_AE 1990
15861 U63632 US JRFL_JR_FL B 1986
15862 U69584 US WCIPR B 1985
15863 U69585 US WCIPR B 1985
15864 U69586 US WCIPR B 1985
15865 U69587 US WCIPR B 1985
15866 U69588 US WCIPR B 1985
15867 U69589 US WCIPR B 1990
15868 U69590 US WCIPR B 1990
15869 U69591 US WCIPR B 1990
15870 U69592 US WCIPR B 1990
15871 U69593 US WCIPR B 1990
15872 U71182 CN RL42 B NaN
15873 U76035 CD Z321_Z321B A1GU 1976
15874 U86780 ZM ZAM184 A2C 1990
15875 U88822 CD 84ZR085 D 1984
15876 U88823 RW 92RW009_06 A1C 1992
15877 U88824 UG 94UG114 D 1994
15878 U88825 NG 92NG003 A1G 1992
15879 U88826 NG 92NG083_JV10832 G 1992
15880 U92049 ET E3099G A1C 1991
15881 U97171 ES ESP1 O NaN
15882 X01762 FR REHTLV3_LAI_IIIB B 1983
15883 X04415 CD MAL_MALCG A1DK 1985
15884 Z11530 FR F12CG B NaN
15885 Z37142 AU RTPX B NaN
15886 Z37143 AU RTPY B NaN

15887 rows × 5 columns


In [11]:
protease_metadf.to_csv('csv/Protease-all_metadata.csv')
protease_metadf


Out[11]:
accession country name subtype year
0 A04321 FR IIIB_LAI B 1983
1 A07108 CD ELI_patent D 1983
2 A07116 CD MAL_patent A1DK 1985
3 A07867 FR LAI-J19 B 1983
4 A14116 CD ELI_patent D 1983
5 A34828 CD NDK_patent D 1983
6 AB020911 JP R1 NaN NaN
7 AB020912 JP R2 NaN NaN
8 AB020913 JP R3 NaN NaN
9 AB020914 JP R4 NaN NaN
10 AB020915 JP R5 NaN NaN
11 AB020916 JP NR1 NaN NaN
12 AB020917 JP NR2 NaN NaN
13 AB020918 JP NR3 NaN NaN
14 AB020919 JP NR4 NaN NaN
15 AB020920 JP NR5 NaN NaN
16 AB020921 JP NR6 NaN NaN
17 AB020922 JP NR7_BEFORE NaN NaN
18 AB020923 JP NR7_6_MONTHS_AFTER NaN NaN
19 AB020924 JP NR8_BEFORE NaN NaN
20 AB020925 JP NR8_8_MONTHS_AFTER NaN NaN
21 AB023804 IN 93IN101 C 1993
22 AB025113 JP 1_970121 B 1997
23 AB025114 JP 1_970616 B 1997
24 AB025115 JP 1_970616 B 1997
25 AB025116 JP 1_970828 B 1997
26 AB025117 JP 1_970828 B 1997
27 AB025118 JP 1_971009 B 1997
28 AB025119 JP 1_971113 B 1997
29 AB025120 JP 1_971113 B 1997
... ... ... ... ... ...
156273 U72022 US PTUW52 B NaN
156274 U72023 US PTUW52 B NaN
156275 U72024 US PTUW52 B NaN
156276 U72025 US PTUW52 B NaN
156277 U72026 US PTUW52 B NaN
156278 U76035 CD Z321_Z321B A1GU 1976
156279 U78097 BR BHPROT_10_1 B NaN
156280 U83698 BR BR19-pol B NaN
156281 U83699 BR BR20-pol B NaN
156282 U83700 BR BR30-pol B NaN
156283 U86780 ZM ZAM184 A2C 1990
156284 U88822 CD 84ZR085 D 1984
156285 U88823 RW 92RW009_06 A1C 1992
156286 U88824 UG 94UG114 D 1994
156287 U88825 NG 92NG003 A1G 1992
156288 U88826 NG 92NG083_JV10832 G 1992
156289 U92049 ET E3099G A1C 1991
156290 X01762 FR REHTLV3_LAI_IIIB B 1983
156291 X04415 CD MAL_MALCG A1DK 1985
156292 Y14496 FR BCF01_pol O NaN
156293 Y14497 FR BCF02_pol O NaN
156294 Y14498 FR BCF03_pol O 1990
156295 Y14499 FR BCF06_pol O 1994
156296 Y14500 FR BCF07_prot O NaN
156297 Y14501 FR BCF08 O NaN
156298 Y14502 FR BCF11_pol O NaN
156299 Y14503 FR BCF13 O 1995
156300 Y14504 FR RUD O NaN
156301 Y14505 FR VAU_HAM O 1992
156302 Z11530 FR F12CG B NaN

156303 rows × 5 columns


In [12]:
rt_metadf['year'].value_counts().plot(kind='bar')
rt_metadf['year'].value_counts().to_csv('csv/RT-num_per_year.csv')



In [13]:
fig = plt.figure(figsize=(15,3))
rt_metadf['country'].value_counts().plot(kind='bar')
rt_metadf['country'].value_counts().to_csv('csv/RT-num_per_country.csv')



In [14]:
protease_metadf['year'].value_counts().plot(kind='bar')
protease_metadf['year'].value_counts().to_csv('csv/Protease-num_per_year.csv')



In [15]:
fig = plt.figure(figsize=(15,3))
protease_metadf['country'].value_counts().plot(kind='bar')
protease_metadf['country'].value_counts().to_csv('csv/Protease-num_per_country.csv')


Downsampling Notes

After discussion with Nichola about the downsampling strategy, we have settled on the following:

  • Use sequences from the years 2003-2007 inclusive.
  • Use sequences from the following countries: US, BR, JP, ZA, ES

In [23]:
# Code for downsampling.
# Recall that the metadata structure is as such:
# 
#     [Subtype].[Country].[Year].[Name].[Accession]
# 
# We will use a dictionary to store the downsampled sequences.

import numpy as np
from collections import defaultdict
from itertools import product

years = np.arange(2003, 2008, 1)
countries = ['US', 'BR', 'JP', 'ZA', 'ES']

proteases_grouped = defaultdict()
for year, country in product(years, countries):
    proteases_grouped[(year, country)] = []

# Group the sequences first.
for s in proteases:
    country = s.id.split('.')[1]
    try:
        year = int(s.id.split('.')[2])
    except ValueError:
        year = 0
    if country in countries and year in years:
        proteases_grouped[(year, country)].append(s)

In [38]:
import random

random.seed(1) # for reproducibility

# Perform the downsampling
proteases_downsampled = defaultdict(list)
for k, v in proteases_grouped.items():
    proteases_downsampled[k] = random.sample(v, 10)
    
# Write the downsampled sequences to disk.
protease_sequences = []
for k, v in proteases_downsampled.items():
    protease_sequences.extend(v)
    
SeqIO.write(protease_sequences, 'sequences/proteases_downsampled.fasta', 'fasta')


Out[38]:
250

In [ ]:


In [ ]: