Import the functions (assumes that QTLight_functions.py is in your current working directory or in your python path)
In [ ]:
import QTLight_functions as QTL
Fetch relevant files from stacks populations run
In [ ]:
%%bash
# Link the stacks outputs (VCF + population map) from test-data into the
# working directory and create the output directory for the covariance
# matrices built below.
ln -s test-data/batch_1.vcf.gz .
ln -s test-data/populationmap .
mkdir matrix
create 10 Bayenv input files with 5000 randomly selected loci in each
In [ ]:
%%bash
#pip install pyvcf
# Create 10 Bayenv input files, each with 5000 randomly selected loci
# (-r 5000); only sites genotyped in at least 6 individuals per population
# are kept (--min_number 6).
for a in {1..10}
do
	# fix: "repitition" -> "repetition" in the progress message
	echo -e "\nrepetition $a:\n"
	python /home/chrishah/Dropbox/Github/genomisc/popogeno/vcf_2_bayenv.py batch_1.vcf.gz --min_number 6 -r 5000 -o matrix/random_5000_rep_$a -m populationmap
done
create 10 covariance matrices with 100,000 iterations each
In [ ]:
%%bash
# Estimate a covariance matrix (100000 MCMC iterations, -k) from each of
# the 10 random SNP sets; -p 4 = number of populations.
cd matrix/
for a in {1..10}
do
	rand=$RANDOM
	echo -e "repetition $a (random seed: -$rand)\n"
	# Bug fix: the original passed '-r -$RANDOM', which draws a *new*
	# random number, so the seed reported above was not the seed actually
	# used by bayenv2. Reuse the stored $rand for both.
	# NOTE(review): the leading '0' argument is kept from the original
	# call — confirm against the bayenv2 usage notes.
	/home/chrishah/src/Bayenv/bayenv2 0 -p 4 -r -$rand -k 100000 -i random_5000_rep_$a.bayenv.SNPfile > random_5000_rep_$a.log
done
cd ../
extract the covariance matrices from the final iteration into a txt file
In [ ]:
%%bash
# Pull the covariance matrix printed by the final bayenv2 iteration out of
# each log. The matrix has one row per population (4 here) plus a trailing
# blank line, so take one extra line from the tail and drop empty lines.
npop=4
lines=$((npop + 1))
for rep in $(seq 1 10); do
    tail -n "$lines" "matrix/random_5000_rep_${rep}.log" | grep -v "^$" > "matrix/random_5000_rep_${rep}_it-10e5.matrix"
done
construct average covariance matrix from 10 random sets
In [ ]:
import numpy as np

# Average the 10 replicate covariance matrices (one per random SNP set)
# into a single matrix for the final Bayenv runs.
n_reps = 10

# per_cell[j] collects the j-th matrix cell's value from every replicate
# (matrices are read row-major), so each cell can be averaged across runs.
per_cell = []
n_cols = None  # matrix row width, inferred from the data (was hard-coded 4)
for rep in range(1, n_reps + 1):
    fname = "matrix/random_5000_rep_%i_it-10e5.matrix" % rep
    # 'with' closes the file handle (the original leaked it)
    with open(fname) as fh:
        values = []
        for line in fh:
            fields = line.rstrip().split("\t")
            if n_cols is None:
                # infer the matrix dimension (number of populations)
                # from the first row instead of hard-coding it
                n_cols = len(fields)
            values.extend(fields)
    if not per_cell:
        per_cell = [[float(v)] for v in values]
    else:
        for j, v in enumerate(values):
            per_cell[j].append(float(v))

# Cell-wise mean across replicates, written in Bayenv's matrix format:
# every value followed by a tab, a newline after each row of n_cols cells.
out_parts = []
for j in range(len(per_cell)):
    out_parts.append("%s\t" % np.mean(per_cell[j]))
    if (j + 1) % n_cols == 0:
        out_parts.append("\n")

with open("matrix/av_matrix.matrix", "w") as out:
    out.write("".join(out_parts))
Prepare environmental data - average and normalize
raw data is provided in a csv file with the first column containing the population id. See example in test-data.
In [ ]:
# Read the raw per-individual measurements (first column = population id),
# average per population and normalize each variable (see prose above);
# writes the normalized Bayenv environment file under norm_prefix and
# returns the population order plus the environmental-variable IDs.
populations, IDs = QTL.normalize(csv='../Diplotaxodon_Morphometric_Data_raw.csv', normalize=True, norm_prefix='Diplotaxodon_Morphometric_Data_normalized', boxplot=False)
In [ ]:
# Python 2 print statements: sanity-check the values returned above
print populations
print IDs
convert vcf to bayenv - generate full SNP files
In [ ]:
%%bash
# Convert the full VCF to Bayenv format (all loci, no random subsampling).
# NOTE(review): paths here use '../' while earlier cells used './' —
# confirm which working directory this cell is meant to run from.
mkdir SNPfiles
python /home/chrishah/Dropbox/Github/genomisc/popogeno/vcf_2_div.py ../batch_1.vcf.gz --min_number 6 -o SNPfiles/full_set -m ../populationmap
split up SNPfiles into single files
In [ ]:
# Split the full Bayenv SNP file into one file per SNP, so bayenv2 can be
# run per-SNP (serially below, or in parallel on a cluster).
QTL.split_for_Bayenv(infile='SNPfiles/full_set.bayenv.SNPfile', out_prefix='SNPfiles/Diplo_SNP')
Run Bayenv2 for 10 replications serially
for this run I used bayenv2 version: tguenther-bayenv2_public-48f0b51ced16
In [ ]:
#find the number of SNP files to add to specify in loop below
# NOTE(review): the glob 'SNP-*' assumes split_for_Bayenv names the files
# 'SNP-<number>.txt' regardless of the 'Diplo_SNP' out_prefix — verify.
!ls -1 SNPfiles/SNP-* |wc -l
In [ ]:
# working directory for the per-SNP bayenv2 runs
!mkdir running_Bayenv
In [ ]:
%%bash
#adjust bayenv command to your requirements
# One bayenv2 call per single-SNP file, repeated for 10 replicates; each
# replicate draws one negative random seed (-r -$ran) used for all of its
# SNPs, and all per-SNP output is collected into one log per replicate.
# See the bayenv2 manual for flag semantics (-e environment file,
# -m covariance matrix, -k iterations, -p populations, -n variables).
# NOTE(review): '-e ../Nyassochromis_normalized.bayenv' does not match the
# 'Diplotaxodon_Morphometric_Data_normalized' prefix produced earlier, and
# '-p 3' differs from the '-p 4' used for the covariance matrices — verify
# both against the intended dataset.
iterations=1000000
cd running_Bayenv/
for rep in {1..10}; do ran=$RANDOM; for a in {0000001..0021968}; do /home/chrishah/src/Bayenv/bayenv2 -i ../SNPfiles/SNP-$a.txt -e ../Nyassochromis_normalized.bayenv -m ../matrix/av_matrix.matrix -k $iterations -r -$ran -p 3 -n 14 -t -X -o bayenv_out_k100000_env_rep_$rep-rand_$ran; done > log_rep_$rep; done
ALTERNATIVE
Bayenv can be run on a HPC cluster in parallel. I provide a script submit_Bayenv_array_multi.sh that I used to run 10 replicates as arrayjob on a cluster that was running a PBS scheduling system. Total runtime for 10 replicates with 1M Bayenv iterations/SNP was ~ 24h. The results from the individual runs were then concatenated with the script concat_sorted.sh and moved to the directory running_Bayenv on the local machine.
Calculating RANK STATISTICS
In [ ]:
# IPython shell command: output directory for the rank-statistic files
mkdir RANK_STATISTIC/
In [ ]:
#create the list of Bayenv results files to be processed
import os

# Bug fix: the directory created above is 'running_Bayenv' (capital B, see
# the mkdir cell and the HPC note); on a case-sensitive filesystem
# './running_bayenv/' does not exist.
bayenv_res_dir = './running_Bayenv/'
bayenv_files = []
for fil in os.listdir(bayenv_res_dir):
    # Bayenv writes its Bayes-factor results to *.bf files
    if fil.endswith(".bf"):
        # os.path.join avoids the doubled slash from 'dir/' + '/' + name
        path = os.path.join(bayenv_res_dir, fil)
        print(path)
        bayenv_files.append(path)
In [ ]:
# Python 2 prints: sanity-check the collected .bf files, their count, and
# the environmental-variable IDs
print bayenv_files
print "\n%i" %len(bayenv_files)
print IDs
In [ ]:
# Compute rank statistics across the replicate Bayenv result files, keyed
# to the SNP ids in the SNPmap; writes per-variable output files under the
# given prefix and returns the results for later reuse (see the
# exclude_extreme_rep cell below).
rank_results = QTL.calculate_rank_stats(SNP_map="SNPfiles/full_set.bayenv.SNPmap", infiles = bayenv_files, ids = IDs, prefix = 'RANK_STATISTIC/Diplo_k_1M')
CREATE POPE PLOTS and extract the SNP ids in the top 5 percent (assumes that the script pope_plot.sh is in your working directory)
In [ ]:
# Python 2 print: the variable IDs used to locate the rank files below
print IDs
In [ ]:
# Pair every environmental-variable ID with its full rank-statistic file
# from RANK_STATISTIC/ ('break' keeps only the first match per ID).
# Fix: renamed loop variables so the builtins 'id' and 'file' are no
# longer shadowed.
full_rank_files = []
file_dir = 'RANK_STATISTIC/'
for var_id in IDs:
    for fname in os.listdir(file_dir):
        if fname.endswith('_' + var_id + '.txt'):
            full_rank_files.append([var_id, file_dir + '/' + fname])
            break
In [ ]:
# Python 2 print: the [id, path] pairs collected above
print full_rank_files
In [ ]:
# Pope plots and SNP ids in the top 5 percent (cutoff=0.95), 10 replicates
QTL.plot_pope(files_list=full_rank_files, cutoff=0.95, num_replicates=10)
CREATE POPE PLOTS and extract the SNP ids in the top 1 percent
In [ ]:
# Same plots for the top 1 percent (cutoff=0.99)
QTL.plot_pope(files_list=full_rank_files, cutoff=0.99, num_replicates=10)
find genes up and downstream of correlated SNPs
In [ ]:
#make list desired rank statistic tsv files
# Fix: renamed the loop variable so the builtin 'file' is not shadowed;
# parenthesized print works under both Python 2 and 3, matching the
# print() call used when collecting the .bf files above.
import os
file_dir = 'RANK_STATISTIC/'
rank_stats_files = []
for fname in os.listdir(file_dir):
    if fname.endswith('.tsv'):
        print(file_dir + '/' + fname)
        rank_stats_files.append(file_dir + '/' + fname)
parse a gff file
In [ ]:
# Parse the gene annotation file; per the variable name the records are
# presumably keyed per scaffold — TODO confirm against QTL.parse_gff
gff_per_scaffold = QTL.parse_gff(gff='Metriaclima_zebra.BROADMZ2.gtf')
identify genes within a defined distance (in kb) up and down-stream of the SNPs
In [ ]:
# Genes within 15 kb up/down-stream of the correlated SNPs, per analysis
genes_per_analysis = QTL.find_genes(rank_stats = rank_stats_files, gff = gff_per_scaffold, distance = 15)
annotated relevant genes based on blast2go annotation table
In [ ]:
# Attach blast2go annotations to the genes found above
QTL.annotate_genes(SNPs_to_genes=genes_per_analysis, annotations='blast2go_table_20150630_0957.txt')
In [ ]:
# IPython shell command: output directory for the candidate tables
mkdir find_genes
write summary table for SNPs and relevant genes in the vicinity
In [ ]:
# Summary tables for every analysis (whitelist = all keys found)
QTL.write_candidates(SNPs_to_genes=genes_per_analysis, whitelist=genes_per_analysis.keys(), out_dir='./find_genes/')
In [ ]:
A strategy for removing noise could be to remove the most extreme Bayenv results and recalculate rank stats
In [ ]:
# IPython shell command: directory for the reduced rank statistics
mkdir RANK_STATISTIC_reduced
In [ ]:
# Noise-reduction strategy (see prose above): drop the most extreme Bayenv
# replicate and recompute rank statistics from the stored rank_results
QTL.exclude_extreme_rep(dictionary = rank_results, ids = IDs, prefix = 'RANK_STATISTIC_reduced/Diplotaxodon_reduced')
In [ ]:
# Pair every environmental-variable ID with its reduced ('_ex_rep')
# rank-statistic file ('break' keeps only the first match per ID).
# Fix: renamed loop variables so the builtins 'id' and 'file' are no
# longer shadowed.
reduced_rank_files = []
file_dir = 'RANK_STATISTIC_reduced/'
for var_id in IDs:
    for fname in os.listdir(file_dir):
        if '_' + var_id + '_ex_rep' in fname and fname.endswith('.txt'):
            reduced_rank_files.append([var_id, file_dir + '/' + fname])
            break
In [ ]:
# Python 2 print: the [id, path] pairs for the reduced runs
print reduced_rank_files
In [ ]:
# 9 replicates now: one extreme replicate was excluded per variable
QTL.plot_pope(files_list=reduced_rank_files, cutoff=0.95, num_replicates=9)
find genes up and downstream of correlated SNPs
In [ ]:
#make list desired rank statistic tsv files
# Fix: renamed the loop variable so the builtin 'file' is not shadowed;
# parenthesized print works under both Python 2 and 3, matching the
# print() call used when collecting the .bf files above.
import os
file_dir = 'RANK_STATISTIC_reduced/'
rank_stats_files = []
for fname in os.listdir(file_dir):
    if fname.endswith('.tsv'):
        print(file_dir + '/' + fname)
        rank_stats_files.append(file_dir + '/' + fname)
In [ ]:
# IPython shell command: output directory for the reduced candidate tables
mkdir find_genes_reduced/
In [ ]:
# Same gene search as above, now on the reduced rank statistics
genes_per_analysis = QTL.find_genes(rank_stats = rank_stats_files, gff = gff_per_scaffold, distance = 15)
In [ ]:
# Attach blast2go annotations to the genes found above
QTL.annotate_genes(SNPs_to_genes=genes_per_analysis, annotations='blast2go_table_20150630_0957.txt')
In [ ]:
# NOTE(review): duplicate of the mkdir above — this second call fails with
# 'File exists' if the earlier cell was already run
mkdir find_genes_reduced/
In [ ]:
# Summary tables for every analysis (whitelist = all keys found)
QTL.write_candidates(SNPs_to_genes=genes_per_analysis, whitelist=genes_per_analysis.keys(), out_dir='./find_genes_reduced/')