ipyrad-analysis toolkit: distance

Key features:

Calculate pairwise genetic distances between samples.
Filter SNPs to reduce missing data.
Impute missing data using population allele frequencies.

Required software



In [1]:

    
# Install commands (run in a terminal, not in this notebook):
# conda install ipyrad -c bioconda
# conda install toyplot -c eaton-lab   (toyplot is imported below and used by the plotting cells)



In [2]:

    
import ipyrad.analysis as ipa
import toyplot

Short tutorial

Set up input files and parameters



In [3]:

    
# the path to your VCF or HDF5 formatted snps file
# NOTE: this is an absolute path on one particular machine; replace it with
# the location of your own file before running the notebook.
data = "/home/deren/Downloads/ref_pop2.snps.hdf5"



In [4]:

    
# population label -> list of sample names assigned to that population
imap = dict(
    virg=["TXWV2", "LALC2", "SCCU3", "FLSF33", "FLBA140"],
    mini=["FLSF47", "FLMO62", "FLSA185", "FLCK216"],
    gemi=["FLCK18", "FLSF54", "FLWO6", "FLAB109"],
    bran=["BJSL25", "BJSB3", "BJVL19"],
    fusi=["MXED8", "MXGT4", "TXGR3", "TXMD3"],
    sagr=["CUVN10", "CUCA4", "CUSV6", "CUMM5"],
    oleo=["CRL0030", "CRL0001", "HNDA09", "BZBB1", "MXSA3017"],
)

# per-population coverage requirement for a SNP to be kept; the same value
# (0.5) is used for every population in imap.
# NOTE(review): 0.5 is presumably read as a proportion of each group's
# samples rather than a sample count -- confirm against the Distance docs.
minmap = dict.fromkeys(imap, 0.5)

Calculate distances



In [5]:

    
from ipyrad.analysis.distance import Distance

# settings for the distance tool, gathered in one place: the SNP file, the
# population grouping with its per-population threshold, the overall
# coverage threshold, the imputation method and the subsampling switch
distance_settings = dict(
    data=data,
    imap=imap,
    minmap=minmap,
    mincov=0.5,
    impute_method="sample",
    subsample_snps=False,
)

# build the tool from those settings, then run the calculation
dist = Distance(**distance_settings)
dist.run()









    



Samples: 29
Sites before filtering: 349914
Filtered (indels): 0
Filtered (bi-allel): 13379
Filtered (mincov): 30459
Filtered (minmap): 111825
Filtered (combined): 120177
Sites after filtering: 229737
Sites containing missing values: 219551 (95.57%)
Missing values in SNP matrix: 814369 (12.22%)
Imputation: 'sampled'; (0, 1, 2) = 77.3%, 10.7%, 12.0%

Save results



In [6]:

    
# save the pairwise distance matrix to a CSV file named "distances.csv"
# (a relative path, so it is written to the current working directory)
dist.dists.to_csv("distances.csv")



In [7]:

    
# show the first rows of the distance matrix (head() returns the top five)
dist.dists.head()









    Out[7]:







  
    
      
      BJSB3
      BJSL25
      BJVL19
      BZBB1
      CRL0001
      CRL0030
      CUCA4
      CUMM5
      CUSV6
      CUVN10
      ...
      FLWO6
      HNDA09
      LALC2
      MXED8
      MXGT4
      MXSA3017
      SCCU3
      TXGR3
      TXMD3
      TXWV2
    
  
  
    
      BJSB3
      0.000000
      0.250447
      0.253472
      0.592255
      0.530145
      0.572576
      0.601853
      0.597044
      0.591990
      0.579937
      ...
      0.594005
      0.582000
      0.568137
      0.464618
      0.443942
      0.579789
      0.603638
      0.487945
      0.487936
      0.590440
    
    
      BJSL25
      0.250447
      0.000000
      0.235900
      0.558630
      0.494291
      0.537193
      0.566156
      0.559675
      0.554665
      0.542768
      ...
      0.558769
      0.548897
      0.532239
      0.435050
      0.412694
      0.547182
      0.567323
      0.453606
      0.457105
      0.554882
    
    
      BJVL19
      0.253472
      0.235900
      0.000000
      0.567897
      0.502775
      0.547391
      0.576355
      0.569360
      0.563000
      0.554621
      ...
      0.564728
      0.555927
      0.539913
      0.441844
      0.417060
      0.556449
      0.575336
      0.464118
      0.465476
      0.562278
    
    
      BZBB1
      0.592255
      0.558630
      0.567897
      0.000000
      0.280691
      0.280569
      0.422670
      0.422962
      0.426266
      0.394242
      ...
      0.559152
      0.285883
      0.532918
      0.525701
      0.542381
      0.317450
      0.576455
      0.552554
      0.551579
      0.563571
    
    
      CRL0001
      0.530145
      0.494291
      0.502775
      0.280691
      0.000000
      0.239596
      0.347859
      0.322277
      0.342836
      0.213266
      ...
      0.470064
      0.262217
      0.451717
      0.466429
      0.477764
      0.299538
      0.492224
      0.487327
      0.484123
      0.482726
    
  

5 rows × 29 columns

Draw the matrix



In [8]:

    
# Take the row tick labels from the matrix's own index instead of sorting the
# sample names separately: each label is then guaranteed to sit next to the
# row it names, whatever order the rows of dist.dists are stored in.
row_labels = list(dist.dists.index)

# draw the distance matrix with the bottom and top labels hidden and one
# tick label per row
toyplot.matrix(
    dist.dists,
    bshow=False,
    tshow=False,
    rlocator=toyplot.locator.Explicit(
        range(len(row_labels)),
        row_labels,
    ),
);

Draw matrix reordered to match groups in imap



In [9]:

    
# get list of concatenated names from each group
ordered_names = []
for group in dist.imap.values():
    ordered_names += group

# reorder matrix to match name order    
ordered_matrix = dist.dists[ordered_names].T[ordered_names]



In [10]:

    
# one tick per row, labelled with the sample names in imap group order
group_locator = toyplot.locator.Explicit(
    range(len(ordered_names)),
    ordered_names,
)

# draw the reordered matrix with the bottom and top labels hidden
toyplot.matrix(ordered_matrix, bshow=False, tshow=False, rlocator=group_locator);

	BJSB3	BJSL25	BJVL19	BZBB1	CRL0001	CRL0030	CUCA4	CUMM5	CUSV6	CUVN10	...	FLWO6	HNDA09	LALC2	MXED8	MXGT4	MXSA3017	SCCU3	TXGR3	TXMD3	TXWV2
BJSB3	0.000000	0.250447	0.253472	0.592255	0.530145	0.572576	0.601853	0.597044	0.591990	0.579937	...	0.594005	0.582000	0.568137	0.464618	0.443942	0.579789	0.603638	0.487945	0.487936	0.590440
BJSL25	0.250447	0.000000	0.235900	0.558630	0.494291	0.537193	0.566156	0.559675	0.554665	0.542768	...	0.558769	0.548897	0.532239	0.435050	0.412694	0.547182	0.567323	0.453606	0.457105	0.554882
BJVL19	0.253472	0.235900	0.000000	0.567897	0.502775	0.547391	0.576355	0.569360	0.563000	0.554621	...	0.564728	0.555927	0.539913	0.441844	0.417060	0.556449	0.575336	0.464118	0.465476	0.562278
BZBB1	0.592255	0.558630	0.567897	0.000000	0.280691	0.280569	0.422670	0.422962	0.426266	0.394242	...	0.559152	0.285883	0.532918	0.525701	0.542381	0.317450	0.576455	0.552554	0.551579	0.563571
CRL0001	0.530145	0.494291	0.502775	0.280691	0.000000	0.239596	0.347859	0.322277	0.342836	0.213266	...	0.470064	0.262217	0.451717	0.466429	0.477764	0.299538	0.492224	0.487327	0.484123	0.482726