In [1]:
    
### Notebook 1
### Data set 1 (Viburnum)
### Language: Bash
### Data Location: NCBI SRA PRJNA299402 & PRJNA299407
    
    
In [3]:
    
%%bash
## make a new directory for this analysis
mkdir -p empirical_1/
mkdir -p empirical_1/halfrun
mkdir -p empirical_1/fullrun
    
    
In [87]:
    
## import Python libraries
import pandas as pd
import numpy as np
import ipyparallel
import urllib2
import glob
import os
    
Sequence data for this study are archived on the NCBI Sequence Read Archive (SRA). The data were generated in two separate Illumina runs but are combined under a single project ID.
The library contains 95 samples. We uploaded the two demultiplexed files for each individual separately, so each sample has two files. Below we examine just the first library (the "half" data set) and then both libraries combined (the "full" data set). We analyze 64 samples, since the remaining samples are replicate individuals within species that are part of a separate project.
You can download the data set using the script below; a quick file-count check follows the fastq-dump step.
In [ ]:
    
%%bash
## get the data from the NCBI SRA
for run in $(seq 24 28);
  do
wget -q -r -nH --cut-dirs=9 \
ftp://ftp-trace.ncbi.nlm.nih.gov/\
sra/sra-instant/reads/ByRun/sra/SRR/\
SRR191/SRR19155$run/SRR19155$run".sra";
done
    
In [ ]:
    
%%bash
## convert sra files to fastq using fastq-dump tool
fastq-dump *.sra
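    
As a quick sanity check we can confirm that one fastq file was produced per downloaded run. This is a minimal sketch; it assumes the dumped .fastq files sit in the current working directory:
In [ ]:
    
## sketch: count the fastq files produced by fastq-dump
## (assumes they were written to the current working directory)
import glob
fastqs = glob.glob("*.fastq")
print len(fastqs), "fastq files found"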
    
In [ ]:
    
## IPython code
## Read in a table (hosted on github) that maps SRA run accessions
## to sample (library) names
url = "https://raw.githubusercontent.com/"+\
      "dereneaton/virentes/master/SraRunTable.txt"
intable = urllib2.urlopen(url)

## make a name xfer dictionary {run accession: library name}
DF = pd.read_table(intable, sep="\t")
D = {DF.Run_s[i]:DF.Library_Name_s[i] for i in DF.index}

## rename fastq files to their sample names and move to the fastq dir/
for fname in glob.glob("*.fastq"):
    os.rename(fname, "analysis_pyrad/fastq/"+\
                     D[fname.replace(".fastq", "")]+".fq")
    
In [1]:
    
%%bash
mkdir -p fastq_combined
    
In [26]:
    
## IPython code that makes a bash call w/ (!)
## get all the data from the two libraries and concatenate it
lib1tax = glob.glob("/home/deren/Documents/Vib_Lib1/fastq_Lib1/*.gz")
lib2tax = glob.glob("/home/deren/Documents/Vib_Lib1/fastq_Lib2/*.gz")
## names had to be modified to match
taxa = [i.split("/")[-1].split("_", 1)[1] for i in lib1tax]
for tax in taxa:
    ! cat /home/deren/Documents/Vib_Lib1/fastq_Lib1/Lib1_$tax \
          /home/deren/Documents/Vib_Lib1/fastq_Lib2/Lib2_$tax \
          > /home/deren/Documents/Vib_Lib1/fastq_combined/$tax
    
    
In [4]:
    
%%bash
pyrad --version
    
    
In [5]:
    
%%bash
## create a new default params file
rm -f params.txt
pyrad -n
    
    
In [156]:
    
%%bash
## substitute new parameters into file
sed -i '/## 1. /c\empirical_1/halfrun    ## 1. working directory ' params.txt
sed -i '/## 6. /c\TGCAG                  ## 6. cutters ' params.txt
sed -i '/## 7. /c\30                     ## 7. N processors      ' params.txt
sed -i '/## 9. /c\6                      ## 9. NQual             ' params.txt
sed -i '/## 10./c\.85                    ## 10. clust threshold  ' params.txt
sed -i '/## 12./c\4                      ## 12. MinCov           ' params.txt
sed -i '/## 13./c\10                     ## 13. maxSH            ' params.txt
sed -i '/## 14./c\empirical_1_half_m4    ## 14. output name      ' params.txt
sed -i '/## 18./c\/home/deren/Documents/Vib_Lib1/fastq_Lib1/*.fastq ## 18. data location    ' params.txt
sed -i '/## 29./c\2,2                    ## 29. trim overhang    ' params.txt
sed -i '/## 30./c\p,n,s,a                ## 30. output formats   ' params.txt
    
In [157]:
    
cat params.txt
    
    
In [ ]:
    
%%bash
pyrad -p params.txt -s 234567 >> log.txt 2>&1
    
In [53]:
    
%%bash
sed -i '/## 12./c\2                         ## 12. MinCov           ' params.txt
sed -i '/## 14./c\empirical_1_half_m2       ## 14. output name      ' params.txt
    
In [35]:
    
%%bash
pyrad -p params.txt -s 7 >> log.txt 2>&1
    
In [141]:
    
%%bash
## substitute new parameters into file
sed -i '/## 1. /c\empirical_1/fullrun    ## 1. working directory ' params.txt
sed -i '/## 6. /c\TGCAG                  ## 6. cutters ' params.txt
sed -i '/## 7. /c\30                     ## 7. N processors      ' params.txt
sed -i '/## 9. /c\6                      ## 9. NQual             ' params.txt
sed -i '/## 10./c\.85                    ## 10. clust threshold  ' params.txt
sed -i '/## 12./c\4                      ## 12. MinCov           ' params.txt
sed -i '/## 13./c\10                     ## 13. maxSH            ' params.txt
sed -i '/## 14./c\empirical_1_full_m4    ## 14. output name      ' params.txt
sed -i '/## 18./c\/home/deren/Documents/Vib_Lib1/fastq_combined/*.fastq ## 18. data location    ' params.txt
sed -i '/## 29./c\2,2                    ## 29. trim overhang    ' params.txt
sed -i '/## 30./c\p,n,s,a                ## 30. output formats   ' params.txt
    
In [ ]:
    
%%bash
pyrad -p params.txt -s 234567 >> log.txt 2>&1
    
In [49]:
    
%%bash
sed -i '/## 12./c\2                         ## 12. MinCov           ' params.txt
sed -i '/## 14./c\empirical_1_full_m2       ## 14. output name      ' params.txt
    
In [142]:
    
%%bash
pyrad -p params.txt -s 7 >> log.txt 2>&1
    
In [3]:
    
## read in the data
s2dat = pd.read_table("empirical_1/halfrun/stats/s2.rawedit.txt", header=0, nrows=66)
## print summary stats
print s2dat["passed.total"].describe()
## find which sample has the most raw data
maxraw = s2dat["passed.total"].max()
print "\nmost raw data in sample:"
print s2dat['sample '][s2dat['passed.total']==maxraw]
    
    
In [4]:
    
## read in the data
s2dat = pd.read_table("empirical_1/fullrun/stats/s2.rawedit.txt", header=0, nrows=66)
## print summary stats
print s2dat["passed.total"].describe()
## find which sample has the most raw data
maxraw = s2dat["passed.total"].max()
print "\nmost raw data in sample:"
print s2dat['sample '][s2dat['passed.total']==maxraw]
    
    
In [33]:
    
## read in the s3 results
s3dat = pd.read_table("empirical_1/halfrun/stats/s3.clusters.txt", header=0, nrows=66)
## print summary stats
print "summary of means\n=================="
print s3dat['dpt.me'].describe()
## print summary stats
print "\nsummary of std\n=================="
print s3dat['dpt.sd'].describe()
## print summary stats
print "\nsummary of proportion lowdepth\n=================="
print pd.Series(1-s3dat['d>5.tot']/s3dat["total"]).describe()
## find which sample retained the most high-depth (d>5) clusters
maxdepth = s3dat["d>5.tot"].max()
print "\nhighest coverage in sample:"
print s3dat['taxa'][s3dat['d>5.tot']==maxdepth]
    
    
In [10]:
    
## read in the s3 results
s3dat = pd.read_table("empirical_1/fullrun/stats/s3.clusters.txt", header=0, nrows=66)
## print summary stats
print "summary of means\n=================="
print s3dat['dpt.me'].describe()
## print summary stats
print "\nsummary of std\n=================="
print s3dat['dpt.sd'].describe()
## print summary stats
print "\nsummary of proportion hidepth\n=================="
print pd.Series(1-s3dat['d>5.tot']/s3dat["total"]).describe()
## find which sample has the greatest depth of retained loci
max_hiprop = (s3dat["d>5.tot"]/s3dat["total"]).max()
print "\nhighest coverage in sample:"
print s3dat['taxa'][s3dat['d>5.tot']/s3dat["total"]==max_hiprop]
    
    
In [13]:
    
## print mean and std of coverage for the highest coverage sample
with open("empirical_1/fullrun/clust.85/lantanoides_D15_Beartown_2.depths", 'rb') as indat:
    depths = np.array(indat.read().strip().split(","), dtype=int)
    
print depths.mean(), depths.std()
    
    
In [16]:
    
import toyplot
import toyplot.svg
import numpy as np
## read in the depth information for this sample
with open("empirical_1/fullrun/clust.85/lantanoides_D15_Beartown_2.depths", 'rb') as indat:
    depths = np.array(indat.read().strip().split(","), dtype=int)
    
## make a barplot in Toyplot
canvas = toyplot.Canvas(width=350, height=300)
axes = canvas.axes(xlabel="Depth of coverage (N reads)", 
                   ylabel="N loci",
                   label="dataset1/sample=sulcatum_D9_MEX_003")
## select the loci with depth > 5 (kept)
keeps = depths[depths>5]
## plot kept and discarded loci
edat = np.histogram(depths, range(30)) # density=True)
kdat = np.histogram(keeps, range(30)) #, density=True)
axes.bars(edat)
axes.bars(kdat)
#toyplot.svg.render(canvas, "empirical_1_full_depthplot.svg")
    
    Out[16]:
    
In [83]:
    
cat empirical_1/halfrun/stats/empirical_1_half_m4.stats
    
    
In [51]:
    
import numpy as np
indat = open("empirical_1/halfrun/stats/empirical_1_half_m4.stats").readlines()
counts = [int(i.strip().split("\t")[1]) for i in indat[8:73]]
print np.mean(counts)
print np.std(counts)
    
    
In [47]:
    
import numpy as np
import itertools
indat = open("empirical_1/halfrun/stats/empirical_1_half_m4.stats").readlines()
counts = [i.strip().split("\t") for i in indat[81:142]]
#print counts
ntax = [int(i[0]) for i in counts]
ncounts = [int(i[1]) for i in counts]
tots = list(itertools.chain(*[[i]*n for i,n in zip(ntax, ncounts)]))
print np.mean(tots)
print np.std(tots)
    
    
In [85]:
    
cat empirical_1/fullrun/stats/empirical_1_full_m4.stats
    
    
In [49]:
    
import numpy as np
indat = open("empirical_1/fullrun/stats/empirical_1_full_m4.stats").readlines()
counts = [int(i.strip().split("\t")[1]) for i in indat[8:73]]
print np.mean(counts)
print np.std(counts)
    
    
In [50]:
    
import numpy as np
import itertools
indat = open("empirical_1/fullrun/stats/empirical_1_full_m4.stats").readlines()
counts = [i.strip().split("\t") for i in indat[81:140]]
#print counts
ntax = [int(i[0]) for i in counts]
ncounts = [int(i[1]) for i in counts]
tots = list(itertools.chain(*[[i]*n for i,n in zip(ntax, ncounts)]))
print np.mean(tots)
print np.std(tots)
    
    
In [ ]:
    
%%bash
## raxml command: rapid bootstrapping + ML search (-f a), 100 replicates,
## rooted on clemensiae
raxmlHPC-PTHREADS-AVX -f a -m GTRGAMMA -N 100 -x 12345 -p 12345 -T 35 \
                      -w /home/deren/Documents/RADmissing/empirical_1/halfrun \
                      -n empirical_1_halfrun -s empirical_1/halfrun/outfiles/empirical_1_half_m4.phy \
                      -o "Lib1_clemensiae_DRY6_PWS_2135"
    
In [ ]:
    
%%bash
raxmlHPC-PTHREADS-AVX -f a -m GTRGAMMA -N 100 -x 12345 -p 12345 -T 35 \
                      -w /home/deren/Documents/RADmissing/empirical_1/fullrun \
                      -n empirical_1_fullrun -s empirical_1/fullrun/outfiles/empirical_1_full_m4.phy \
                      -o "clemensiae_DRY6_PWS_2135"
    
In [86]:
    
%%bash 
head -n 20 empirical_1/halfrun/RAxML_info.empirical_1_half_m4
    
    
In [88]:
    
%%bash 
head -n 20 empirical_1/fullrun/RAxML_info.empirical_1_full_m4
    
    
In [1]:
    
%load_ext rpy2.ipython
    
In [132]:
    
%%R -w 600 -h 1000
library(ape)
tre_half <- read.tree("empirical_1/halfrun/RAxML_bipartitions.empirical_1_halfrun")
#rtre <- root(tre, "Lib1_clemensiae_DRY6_PWS_2135", resolve.root=T)
#rtre <- root(rtre, "Lib1_clemensiae_DRY6_PWS_2135", resolve.root=T)
ltre_half <- ladderize(tre_half)
plot(ltre_half, cex=0.8, edge.width=2)
nodelabels(ltre_half$node.label)
    
    
In [6]:
    
%%R -w 600 -h 1000
library(ape)
svg("outtree.svg", height=11, width=8)
tre_full <- read.tree("empirical_1/fullrun/RAxML_bipartitions.empirical_1_fullrun")
#rtre <- root(tre, "Lib1_clemensiae_DRY6_PWS_2135", resolve.root=T)
#rtre <- root(rtre, "Lib1_clemensiae_DRY6_PWS_2135", resolve.root=T)
ltre_full <- ladderize(tre_full)
plot(ltre_full, cex=0.8, edge.width=3)
#nodelabels(ltre_full$node.label)
dev.off()
    
    
The functions nexmake and subsample are used to split the .loci file into individual nexus files, one per locus, within a new directory. Each nexus file is given a mrbayes command block to run. We then run the BUCKy tool mbsum to summarize the mrbayes output, and finally run bucky to infer concordance trees from the posterior distributions of trees across all loci.
Loci are selected on the basis that they have coverage across all tips of the selected subtree and that they contain at least one SNP.
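For orientation, the cell below gives a minimal sketch of that order of operations. It is a sketch only: it assumes the getloci, parseloci, nexmake, and mrbayes functions defined in the following cells have already been run, that an ipyparallel cluster is available, that the mb, mbsum, and bucky binaries are on the PATH, and it uses a hypothetical output directory (nexdir_example) and a hypothetical taxadict (the actual dictionaries used later, e.g. deep_dict_f, are not shown in this notebook).
In [ ]:
    
## orientation sketch only -- the directory name "nexdir_example" and the
## taxadict below are hypothetical placeholders
import os
import glob
import subprocess
import ipyparallel

nexdir = "nexdir_example"
if not os.path.exists(nexdir):
    os.mkdir(nexdir)

## hypothetical dict mapping nexus tip labels to sample names
## as they appear in the .loci file
taxadict = {
    "dentatum_ELS4": ">dentatum_ELS4",
    "recognitum_AA_1471_83B": ">recognitum_AA_1471_83B",
    "sulcatum_D9_MEX_003": ">sulcatum_D9_MEX_003",
    "acutifolium_DRY3_MEX_006": ">acutifolium_DRY3_MEX_006",
}

## 1. split the .loci file into per-locus nexus files, each with a
##    mrbayes command block (getloci/parseloci/nexmake defined below)
loci = getloci("empirical_1/fullrun/outfiles/empirical_1_full_m4.loci")
parseloci(loci, taxadict, nexdir, trim=0)

## 2. run mrbayes on every nexus file in parallel
ipclient = ipyparallel.Client()
lview = ipclient.load_balanced_view()
res = lview.map_async(mrbayes, glob.glob(nexdir + "/*.nex"))
_ = res.get()

## 3. summarize the two mrbayes tree files for each locus with mbsum,
##    discarding the first 101 sampled trees as burnin
for nex in glob.glob(nexdir + "/*.nex"):
    base = nex[:-len(".nex")]
    subprocess.check_call(
        "mbsum -n 101 -o {}.in {}.run1.t {}.run2.t".format(base, nex, nex),
        shell=True)

## 4. infer concordance trees across loci with bucky
subprocess.check_call(
    "bucky -a 1 -k 4 -n 500000 -c 4 -o {}/BUCKY.1 {}/*.in".format(nexdir, nexdir),
    shell=True)
    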
In [93]:
    
def nexmake(taxadict, loc, nexdir, trim):
    outloc = open(nexdir+"/"+str(loc)+".nex", 'w')
    header = """
#NEXUS 
begin data;
dimensions ntax={} nchar={};
format datatype=dna interleave=yes missing=N gap=-;
matrix
""".format(len(taxadict), len(taxadict.values()[0]))
    outloc.write(header)
    
    for tax, seq in taxadict.items():
        outloc.write("{}{}{}\n"\
                     .format(tax[trim:trim+9],
                             " "*(10-len(tax[0:9])),
                             "".join(seq)))
    
    mbstring = """
    ;
end;
begin mrbayes;
set autoclose=yes nowarn=yes;
lset nst=6 rates=gamma;
mcmc ngen=2200000 samplefreq=2000;
sump burnin=200000;
sumt burnin=200000;
end;    
"""
    outloc.write(mbstring)
    outloc.close()
    
In [102]:
    
from collections import Counter   ## used by newPIS below
import numpy as np

def unstruct(amb):
    """ returns bases from ambiguity code """
    D = {"R":["G","A"],
         "K":["G","T"],
         "S":["G","C"],
         "Y":["T","C"],
         "W":["T","A"],
         "M":["C","A"]}
    if amb in D:
        return D.get(amb)
    else:
        return [amb,amb]
            
def resolveambig(subseq):
    N = []
    for col in subseq:
        N.append([unstruct(i)[np.random.binomial(1, 0.5)] for i in col])
    return np.array(N)
    
def newPIS(seqsamp):
    counts = [Counter(col) for col in seqsamp.T if not ("-" in col or "N" in col)]
    pis = [i.most_common(2)[1][1] > 1 for i in counts if len(i.most_common(2))>1]
    if sum(pis) >= 2:
        return sum(pis)
    else:
        return 0
    
In [103]:
    
def parseloci(iloci, taxadict, nexdir, trim=0):
    nloc = 0
    ## create subsampled data set
    for loc in iloci:
        ## if all tip samples have data in this locus
        names = [line.split()[0] for line in loc.split("\n")[:-1]]
        ## check that locus has required samples for each subtree
        if all([i in names for i in taxadict.values()]):
            seqs = np.array([list(line.split()[1]) for line in loc.split("\n")[:-1]])
            seqsamp = seqs[[names.index(tax) for tax in taxadict.values()]]
            seqsamp = resolveambig(seqsamp)
            pis = newPIS(seqsamp)
            if pis:
                nloc += 1
                ## remove invariable columns given this subsampling
                keep = []
                for n, col in enumerate(seqsamp.T):
                    if all([i not in ["N","-"] for i in col]):
                        keep.append(n)
                subseq = seqsamp.T[keep].T
                ## write to a nexus file
                nexdict = dict(zip(taxadict.keys(), [i.tostring() for i in subseq]))
                nexmake(nexdict, nloc, nexdir, trim)
    print nloc, 'loci kept'
    
In [117]:
    
def getloci(locifile):
    ## parse the loci file by new line characters
    locifile = open(locifile)
    lines = locifile.readlines()
    ## add "|" to end of lines that contain "|"
    for idx in range(len(lines)):
        if "|" in lines[idx]:
            lines[idx] = lines[idx].strip()+"|\n"
        
    ## join lines back together into one large string
    locistr = "".join(lines)
    ## break string into loci at the "|\n" character
    loci = locistr.split("|\n")[:-1]
    ## how many loci?
    print len(loci), "loci"
    return loci
    
## run on both files
loci_full = getloci("empirical_1/fullrun/outfiles/empirical_1_full_m4.loci")
loci_half = getloci("empirical_1/halfrun/outfiles/empirical_1_half_m4.loci")
    
    
In [118]:
    
parseloci(loci_full[:], deep_dict_f, "deep_dict_full", 0)
parseloci(loci_half[:], deep_dict_h, "deep_dict_half", 0)
#parseloci(loci[:], shallow_dict, "shallow_dict", 0)
    
    
In [119]:
    
## create a parallel client
ipclient = ipyparallel.Client()
lview = ipclient.load_balanced_view()
## call function across all engines
def mrbayes(infile):
    import subprocess
    cmd = "mb %s" % infile
    subprocess.check_call(cmd, shell=True)
    
In [120]:
    
## submit all nexus files to run mb
allnex = glob.glob("deep_dict_full/*.nex")
for nex in allnex:
    lview.apply(mrbayes, nex)
ipclient.wait_interactive()
    
    
In [126]:
    
def mbsum(nexdir, nloci):     
    import subprocess
    ## combine trees from the two replicate runs
    for n in range(1, nloci+1):
        cmd = "mbsum -n 101 -o {}{}.in {}{}.nex.run1.t {}{}.nex.run2.t".\
              format(nexdir, n, nexdir, n, nexdir, n)
        subprocess.check_call(cmd, shell=True)
    
In [ ]:
    
    
In [3]:
    
import os
import numpy as np
from collections import Counter
def subsample(infile, requires, outgroup, nexdir, trim):
    """ sample n taxa from infile to create nex file"""
    ## counter
    loc = 0
    
    ## create output directory
    if not os.path.exists(nexdir):
        os.mkdir(nexdir)
    
    ## input .alleles file
    loci = open(infile, 'r').read().strip().split("//")
    
    ## create a dictionary of {names:seqs}
    for locus in xrange(len(loci)):
        locnex = [""]*len(requires)
        for line in loci[locus].strip().split("\n"):
            tax = line.split()[0]
            seq = line.split()[-1]
            if ">" in tax:
                if tax in requires:
                    locnex[requires.index(tax)] = seq
        ## if all tips
        if len([i for i in locnex if i]) == len(requires):
            ## if locus is variable
            ## count how many times each char occurs in each site
            ccs = [Counter(i) for i in np.array([list(i) for i in locnex]).T]
            ## remove N and - characters and the first most occurring base
            for i in ccs:
                del i['-']
                del i['N']
                if i:
                    del i[i.most_common()[0][0]]
                
            ## is anything left occurring more than once (minor allele=ma)?
            ma = max([max(i.values()) if i else 0 for i in ccs])
            if ma > 1:
                nexmake(requires, locnex, loc, outgroup, nexdir, trim)
                loc += 1
    
    return loc
    
In [162]:
    
## inputs
requires = [">triphyllum_D13_PWS_1783_0", 
            ">jamesonii_D12_PWS_1636_0",
            ">sulcatum_D9_MEX_003_0",
            ">acutifolium_DRY3_MEX_006_0",
            ">dentatum_ELS4_0",
            ">recognitum_AA_1471_83B_0"]
outgroup = ""
infile = "empirical_1/fullrun/outfiles/empirical_1_full_m4.alleles"
nexdir = "nex_files1"
## run function
nloci = subsample(infile, requires, outgroup, nexdir, trim=1)
print nloci
    
    
In [161]:
    
## inputs
requires = [">Lib1_triphyllum_D13_PWS_1783_0", 
            ">Lib1_jamesonii_D12_PWS_1636_0",
            ">Lib1_sulcatum_D9_MEX_003_0",
            ">Lib1_acutifolium_DRY3_MEX_006_0",
            ">Lib1_dentatum_ELS4_0",
            ">Lib1_recognitum_AA_1471_83B_0"]
outgroup = ""
infile = "empirical_1/halfrun/outfiles/empirical_1_half_m4.alleles"
nexdir = "nex_files2"
## run function
nloci = subsample(infile, requires, outgroup, nexdir, trim=6)
print nloci
    
    
In [16]:
    
## inputs
requires = [">clemensiae_DRY6_PWS_2135_0", 
            ">tinus_D33_WC_277_0",
            ">taiwanianum_TW1_KFC_1952_0",
            ">lantanoides_D15_Beartown_2_0",
            ">amplificatum_D3_SAN_156003_0",
            ">lutescens_D35_PWS_2077_0",
            ">lentago_ELS85_0",
            ">dentatum_ELS4_0"]
outgroup = ""
infile = "empirical_1/fullrun/outfiles/empirical_1_full_m4.alleles"
nexdir = "nex_files5"
## run function
nloci = subsample(infile, requires, outgroup, nexdir, trim=1)
print nloci
    
    
In [17]:
    
## inputs
requires = [">Lib1_clemensiae_DRY6_PWS_2135_0", 
            ">Lib1_tinus_D33_WC_277_0",
            ">Lib1_taiwanianum_TW1_KFC_1952_0",
            ">Lib1_lantanoides_D15_Beartown_2_0",
            ">Lib1_amplificatum_D3_SAN_156003_0",
            ">Lib1_lutescens_D35_PWS_2077_0",
            ">Lib1_lentago_ELS85_0",
            ">Lib1_dentatum_ELS4_0"]
outgroup = ""
infile = "empirical_1/halfrun/outfiles/empirical_1_half_m4.alleles"
nexdir = "nex_files6"
## run function
nloci = subsample(infile, requires, outgroup, nexdir, trim=6)
print nloci
    
    
In [19]:
    
import ipyparallel
import subprocess
import glob
## create a parallel client
ipclient = ipyparallel.Client()
lview = ipclient.load_balanced_view()
## call function across all engines
def mrbayes(infile):
    import subprocess
    cmd = "mb %s" % infile
    subprocess.check_call(cmd, shell=True)
    
In [164]:
    
## run on the full data set
res = lview.map_async(mrbayes, glob.glob("nex_files1/*"))
_ = res.get()
## run on the half data set
res = lview.map_async(mrbayes, glob.glob("nex_files2/*"))
_ = res.get()
    
In [72]:
    
## run on the half data set
res = lview.map_async(mrbayes, glob.glob("nex_files3/*"))
_ = res.get()
## run on the half data set
res = lview.map_async(mrbayes, glob.glob("nex_files4/*"))
_ = res.get()
    
In [20]:
    
## run on the full data set
res = lview.map_async(mrbayes, glob.glob("nex_files5/*"))
_ = res.get()
## run on the half data set
res = lview.map_async(mrbayes, glob.glob("nex_files6/*"))
_ = res.get()
    
In [21]:
    
import os
import subprocess
def mbsum(nexdir, nloci):
    ## create dir for bucky input files
    insdir = os.path.join(nexdir, "ins")
    if not os.path.exists(insdir):  
        os.mkdir(insdir)
                  
    ## combine trees from the two replicate runs
    for n in range(nloci):
        cmd = "mbsum -n 101 -o {}/{}.in {}{}.nex.run1.t {}{}.nex.run2.t".\
              format(insdir, n, nexdir, n, nexdir, n)
        subprocess.check_call(cmd, shell=True)
                          
#mbsum("nex_files1/", 3300)
#mbsum("nex_files2/", 364)
#mbsum("nex_files3/", 1692)
#mbsum("nex_files4/", 169)
mbsum("nex_files5/", 1203)
mbsum("nex_files6/", 106)
    
In [22]:
    
args = []
for insdir in ["nex_files5/ins", "nex_files6/ins"]:
    ## independence test
    args.append("bucky --use-independence-prior -k 4 -n 500000 \
                -o {}/BUCKY.ind {}/*.in".format(insdir, insdir))   
    ## alpha at three levels
    for alpha in [0.1, 1, 10]:
        args.append("bucky -a {} -k 4 -n 500000 -c 4 -o {}/BUCKY.{} {}/*.in".\
                    format(alpha, insdir, alpha, insdir))
        
    
def bucky(arg):
    import subprocess
    subprocess.check_call(arg, shell=True)
    return arg
    
res = lview.map_async(bucky, args)
res.get()
    
    Out[22]:
In [ ]:
    
del lview
ipclient.close()
    
In [1]:
    
%%bash
head -n 40 nex_files1/ins/BUCKY.0.1.concordance
    
    
In [186]:
    
%%bash
head -n 40 nex_files1/ins/BUCKY.1.concordance
    
    
In [185]:
    
%%bash
head -n 40 nex_files2/ins/BUCKY.1.concordance
    
    
In [2]:
    
! head -n 45 nex_files3/ins/BUCKY.0.1.concordance
    
    
In [3]:
    
! head -n 45 nex_files4/ins/BUCKY.0.1.concordance
    
    
In [23]:
    
! head -n 45 nex_files5/ins/BUCKY.0.1.concordance
    
    
In [24]:
    
! head -n 45 nex_files6/ins/BUCKY.0.1.concordance
    
    
In [ ]:
    
%%bash
## raxml command: rapid bootstrapping + ML search (-f a), 100 replicates,
## on the min2 (m2) full data set
raxmlHPC-PTHREADS-AVX -f a -m GTRGAMMA -N 100 -x 12345 -p 12345 -T 20 \
                      -w /home/deren/Documents/RADmissing/empirical_1/fullrun \
                      -n empirical_1_full_m2 -s empirical_1/fullrun/outfiles/empirical_1_full_m2.phy
    
In [27]:
    
%%bash 
head -n 20 empirical_1/fullrun/RAxML_info.empirical_1_full_m2
    
    
In [37]:
    
%%R
mean(cophenetic.phylo(ltre_full))