Data from "Single-nucleotide polymorphism discovery and panel characterization in the African forest elephant"
Ptychadena
Peromyscus (Munshi-South)
Chinese sea bass (SRP094869)
In [29]:
import ipyrad as ip
import ipyrad.analysis as ipa
import ipyparallel as ipp
import pandas as pd
import toyplot
import glob
import gzip
## Ptychadena (Stephane)
## * http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0190440#sec022
## Peromyscus (Munshi-South)
## Chinese sea bass (SRP094869)
## Population Genomics Reveals Genetic Divergence and Adaptive Differentiation of Chinese Sea Bass (Lateolabrax maculatus)
## * https://link.springer.com/article/10.1007/s10126-017-9786-0#Sec2
In [ ]:
%%bash
conda install -c eaton-lab toytree
conda install -c bioconda sra-tools entrez-direct
In [ ]:
%%bash
## Fetch the elephant reference genome
mkdir ref
wget ftp://ftp.broadinstitute.org/pub/assemblies/mammals/elephant/loxAfr3/assembly_supers.fasta.gz -o ref/assembly_supers.fasta.gz
In [ ]:
%%bash
## Data from "Single-nucleotide polymorphism discovery and panel characterization in the African forest elephant"
## http://onlinelibrary.wiley.com/doi/10.1002/ece3.3854/full?wol1URL=/doi/10.1002/ece3.3854/full®ionCode=US-NY&identityKey=34b23ec9-4666-4c37-8470-8448e64d6167
ipyrad --download SRP126637 raws/
In [39]:
## R1 and R2 were concatenated in the SRA data files so we have to pull them apart.
## This is probably not the most efficient way to do this, but it works.
raws = glob.glob("raws/*")
print(raws)
for r in raws:
name = r.split("/")[1].split("_")[0]
lines = gzip.open(r).readlines()
nlines = len(lines)/2
print("{} {}".format(name, nlines))
with gzip.open("raws/{}_R1_.fastq.gz".format(name), 'wb') as r1:
r1.write("".join(lines[:nlines]))
with gzip.open("raws/{}_R2_.fastq.gz".format(name), 'wb') as r2:
r2.write("".join(lines[nlines:]))
Fetching project data...
Run spots mates ScientificName SampleName
0 SRR6371502 241196 0 Loxodonta cyclotis LOC0127
1 SRR6371503 1133408 0 Loxodonta cyclotis LOC0122
2 SRR6371505 1002779 0 Loxodonta cyclotis LOC0151
3 SRR6371506 1582988 0 Loxodonta cyclotis LOC0274
4 SRR6371507 2519228 0 Loxodonta cyclotis LOC0263
5 SRR6371508 1002140 0 Loxodonta cyclotis LOC0309
6 SRR6371509 2894868 0 Loxodonta cyclotis LOC0279
7 SRR6371510 1190860 0 Loxodonta cyclotis LOC0311
8 SRR6371511 267664 0 Loxodonta cyclotis LOC0310
9 SRR6371512 758060 0 Loxodonta cyclotis LOC0040
10 SRR6371513 2176494 0 Loxodonta cyclotis LOC0038
11 SRR6371514 2319874 0 Loxodonta cyclotis LOC0037
12 SRR6371515 906060 0 Loxodonta cyclotis LOC0035
13 SRR6371516 1133648 0 Loxodonta cyclotis LOC0051
14 SRR6371517 1816948 0 Loxodonta cyclotis LOC0050
15 SRR6371518 1471242 0 Loxodonta cyclotis LOC0049
16 SRR6371519 1366528 0 Loxodonta cyclotis LOC0041
17 SRR6371520 225068 0 Loxodonta cyclotis LOC0121
18 SRR6371521 1266382 0 Loxodonta cyclotis LOC0088
In [37]:
df = pd.DataFrame([x.split("\t") for x in """SRR6371521 SAMN08167496 LOC0088 WG011 178 95 SRX3466825 LOC0088_a Lope
SRR6371520 SAMN08167497 LOC0121 WG012 31 16 SRX3466826 LOC0121_a Minkebe
SRR6371519 SAMN08167492 LOC0041 WG005 192 102 SRX3466827 LOC0041_a Waka
SRR6371518 SAMN08167493 LOC0049 WG008 207 110 SRX3466828 LOC0049_a Ivindo
SRR6371517 SAMN08167494 LOC0050 WG009 256 135 SRX3466829 LOC0050_b Ivindo
SRR6371516 SAMN08167495 LOC0051 WG010 160 85 SRX3466830 LOC0051_a Ivindo
SRR6371515 SAMN08167488 LOC0035 WG001 127 69 SRX3466831 LOC0035_a Minkebe
SRR6371514 SAMN08167489 LOC0037 WG002 327 175 SRX3466832 LOC0037_a Lope
SRR6371513 SAMN08167490 LOC0038 WG003 307 164 SRX3466833 LOC0038_a Lope
SRR6371512 SAMN08167491 LOC0040 WG004 106 56 SRX3466834 LOC0040_a Wonga Wongue
SRR6371511 SAMN08167506 LOC0310 WG023 37 20 SRX3466835 LOC0310_a Moukalaba Doudou
SRR6371510 SAMN08167507 LOC0311 WG024 168 89 SRX3466836 LOC0311_a Monts de Cristal
SRR6371509 SAMN08167504 LOC0279 WG020 408 215 SRX3466837 LOC0279_b South Mulundu
SRR6371508 SAMN08167505 LOC0309 WG022 141 75 SRX3466838 LOC0309_a Mayumba
SRR6371507 SAMN08167502 LOC0263 WG018 355 188 SRX3466839 LOC0263_a Wonga Wongue
SRR6371506 SAMN08167503 LOC0274 WG019 223 117 SRX3466840 LOC0274_a Loango
SRR6371505 SAMN08167500 LOC0151 WG015 141 71 SRX3466841 LOC0151_a Moukalaba Doudou
SRR6371503 SAMN08167498 LOC0122 WG013 159 85 SRX3466843 LOC0122_a Minkebe
SRR6371502 SAMN08167499 LOC0127 WG014 34 18 SRX3466844 LOC0127_a Moukalaba Doudou""".split("\n")])
## Just get the sequence identifier, the sample name, and the sample location
df = df.iloc[:, [0,7,8]]
Launch the cluster by running this command on the command line:
ipcluster start --n=19 --daemonize
In [5]:
## connect to cluster and check if all the engines are running
ipyclient = ipp.Client()
ipyclient.ids
Out[5]:
In [19]:
data = ip.Assembly("Loxodonta")
In [37]:
## set parameters
data.set_params("project_dir", "ddrad-denovo")
data.set_params("sorted_fastq_path", "raws/*_.fastq")
data.set_params("datatype", "pairddrad")
data.set_params("restriction_overhang", ("TGCAG", "CATGC"))
data.set_params("clust_threshold", "0.90")
data.set_params("filter_adapters", "2")
data.set_params("max_Hs_consens", (5, 5))
data.set_params("trim_loci", (0, 0, 0, 0))
data.set_params("output_formats", "*")
## see/print all parameters
data.get_params()
data.write_params(force=True)
In [40]:
data.run("12")
In [41]:
## access the stats of the assembly (so far) from the .stats attribute
data.stats
Out[41]:
In [42]:
## run steps 3-6 of the assembly
data.run("34567")
In [7]:
kvalues = [2, 3, 4, 5, 6]
s = ipa.structure(
name="quick",
workdir="./analysis-structure",
data="./ddrad-denovo/Loxodonta_outfiles/Loxodonta.ustr",
)
## set main params (use much larger values in a real analysis)
s.mainparams.burnin = 1000
s.mainparams.numreps = 5000
## submit N replicates of each test to run on parallel client
for kpop in kvalues:
s.run(kpop=kpop, nreps=4, ipyclient=ipyclient)
## wait for parallel jobs to finish
ipyclient.wait()
Out[7]:
In [8]:
## return the evanno table (deltaK) for best K
etable = s.get_evanno_table(kvalues)
etable
Out[8]:
In [17]:
## set some clumpp params
s.clumppparams.m = 3 ## use largegreedy algorithm
s.clumppparams.greedy_option = 2 ## test nrepeat possible orders
s.clumppparams.repeats = 10000 ## number of repeats
s.clumppparams
## run clumpp for each value of K
#tables = s.get_clumpp_table(kvalues, quiet=True)
print(tables)
#table = tables[4].sort_values(by=[0, 1, 2, 3])
table = tables[2].sort_values(by=[0, 1])
toyplot.bars(
table,
width=500,
height=200,
title=[[i] for i in table.index.tolist()],
xshow=False,
);
print(table)