Import the functions (assumes that QTLight_functions.py is in your current working directory or in your python path)
In [ ]:
import QTLight_functions as QTL
Fetch relevant files from stacks populations run
In [ ]:
%%bash
# Link the stacks outputs (VCF + population map) from test-data into the
# working directory and create the output directory for the covariance
# matrices built below.
ln -s test-data/batch_1.vcf.gz .
ln -s test-data/populationmap .
mkdir matrix
create 10 Bayenv input files with 5000 randomly selected loci in each
In [ ]:
%%bash
#pip install pyvcf
# Create 10 Bayenv input files, each with 5000 randomly selected loci
# (-r 5000); only sites genotyped in at least 6 individuals per population
# are kept (--min_number 6).
for a in {1..10}
do
	# fix: "repitition" -> "repetition" in the progress message
	echo -e "\nrepetition $a:\n"
	python /home/chrishah/Dropbox/Github/genomisc/popogeno/vcf_2_bayenv.py batch_1.vcf.gz --min_number 6 -r 5000 -o matrix/random_5000_rep_$a -m populationmap
done
create 10 covariance matrices with 100,000 iterations each
In [ ]:
%%bash
# Estimate a covariance matrix (100000 MCMC iterations, -k) from each of
# the 10 random SNP sets; -p 4 = number of populations.
cd matrix/
for a in {1..10}
do
	rand=$RANDOM
	echo -e "repetition $a (random seed: -$rand)\n"
	# Bug fix: the original passed '-r -$RANDOM', which draws a *new*
	# random number, so the seed reported above was not the seed actually
	# used by bayenv2. Reuse the stored $rand for both.
	# NOTE(review): the leading '0' argument is kept from the original
	# call — confirm against the bayenv2 usage notes.
	/home/chrishah/src/Bayenv/bayenv2 0 -p 4 -r -$rand -k 100000 -i random_5000_rep_$a.bayenv.SNPfile > random_5000_rep_$a.log
done
cd ../
extract the covariance matrices from the final iteration into a txt file
In [ ]:
%%bash
# Pull the covariance matrix printed by the final bayenv2 iteration out of
# each log. The matrix has one row per population (4 here) plus a trailing
# blank line, so take one extra line from the tail and drop empty lines.
npop=4
lines=$((npop + 1))
for rep in $(seq 1 10); do
    tail -n "$lines" "matrix/random_5000_rep_${rep}.log" | grep -v "^$" > "matrix/random_5000_rep_${rep}_it-10e5.matrix"
done
construct average covariance matrix from 10 random sets
In [ ]:
import numpy as np

# Average the 10 replicate covariance matrices (one per random SNP set)
# into a single matrix for the final Bayenv runs.
n_reps = 10

# per_cell[j] collects the j-th matrix cell's value from every replicate
# (matrices are read row-major), so each cell can be averaged across runs.
per_cell = []
n_cols = None  # matrix row width, inferred from the data (was hard-coded 4)
for rep in range(1, n_reps + 1):
    fname = "matrix/random_5000_rep_%i_it-10e5.matrix" % rep
    # 'with' closes the file handle (the original leaked it)
    with open(fname) as fh:
        values = []
        for line in fh:
            fields = line.rstrip().split("\t")
            if n_cols is None:
                # infer the matrix dimension (number of populations)
                # from the first row instead of hard-coding it
                n_cols = len(fields)
            values.extend(fields)
    if not per_cell:
        per_cell = [[float(v)] for v in values]
    else:
        for j, v in enumerate(values):
            per_cell[j].append(float(v))

# Cell-wise mean across replicates, written in Bayenv's matrix format:
# every value followed by a tab, a newline after each row of n_cols cells.
out_parts = []
for j in range(len(per_cell)):
    out_parts.append("%s\t" % np.mean(per_cell[j]))
    if (j + 1) % n_cols == 0:
        out_parts.append("\n")

with open("matrix/av_matrix.matrix", "w") as out:
    out.write("".join(out_parts))
Prepare environmental data - average and normalize
raw data is provided in a csv file with the first column containing the population id. See example in test-data.
In [ ]:
# Read the raw per-individual measurements (first column = population id),
# average per population and normalize each variable (see prose above);
# writes the normalized Bayenv environment file under norm_prefix and
# returns the population order plus the environmental-variable IDs.
populations, IDs = QTL.normalize(csv='../Diplotaxodon_Morphometric_Data_raw.csv', normalize=True, norm_prefix='Diplotaxodon_Morphometric_Data_normalized', boxplot=False)
In [ ]:
# Python 2 print statements: sanity-check the values returned above
print populations
print IDs
convert vcf to bayenv - generate full SNP files
In [ ]:
%%bash
# Convert the full VCF to Bayenv format (all loci, no random subsampling).
# NOTE(review): paths here use '../' while earlier cells used './' —
# confirm which working directory this cell is meant to run from.
mkdir SNPfiles
python /home/chrishah/Dropbox/Github/genomisc/popogeno/vcf_2_div.py ../batch_1.vcf.gz --min_number 6 -o SNPfiles/full_set -m ../populationmap
split up SNPfiles into single files
In [ ]:
# Split the full Bayenv SNP file into one file per SNP, so bayenv2 can be
# run per-SNP (serially below, or in parallel on a cluster).
QTL.split_for_Bayenv(infile='SNPfiles/full_set.bayenv.SNPfile', out_prefix='SNPfiles/Diplo_SNP')
Run Bayenv2 for 10 replications serially
for this run I used bayenv2 version: tguenther-bayenv2_public-48f0b51ced16
In [ ]:
#find the number of SNP files to add to specify in loop below
# NOTE(review): the glob 'SNP-*' assumes split_for_Bayenv names the files
# 'SNP-<number>.txt' regardless of the 'Diplo_SNP' out_prefix — verify.
!ls -1 SNPfiles/SNP-* |wc -l
In [ ]:
# working directory for the per-SNP bayenv2 runs
!mkdir running_Bayenv
In [ ]:
%%bash
#adjust bayenv command to your requirements
# One bayenv2 call per single-SNP file, repeated for 10 replicates; each
# replicate draws one negative random seed (-r -$ran) used for all of its
# SNPs, and all per-SNP output is collected into one log per replicate.
# See the bayenv2 manual for flag semantics (-e environment file,
# -m covariance matrix, -k iterations, -p populations, -n variables).
# NOTE(review): '-e ../Nyassochromis_normalized.bayenv' does not match the
# 'Diplotaxodon_Morphometric_Data_normalized' prefix produced earlier, and
# '-p 3' differs from the '-p 4' used for the covariance matrices — verify
# both against the intended dataset.
iterations=1000000
cd running_Bayenv/
for rep in {1..10}; do ran=$RANDOM; for a in {0000001..0021968}; do /home/chrishah/src/Bayenv/bayenv2 -i ../SNPfiles/SNP-$a.txt -e ../Nyassochromis_normalized.bayenv -m ../matrix/av_matrix.matrix -k $iterations -r -$ran -p 3 -n 14 -t -X -o bayenv_out_k100000_env_rep_$rep-rand_$ran; done > log_rep_$rep; done
ALTERNATIVE
Bayenv can be run on a HPC cluster in parallel. I provide a script submit_Bayenv_array_multi.sh that I used to run 10 replicates as arrayjob on a cluster that was running a PBS scheduling system. Total runtime for 10 replicates with 1M Bayenv iterations/SNP was ~ 24h. The results from the individual runs were then concatenated with the script concat_sorted.sh and moved to the directory running_Bayenv on the local machine.
Calculating RANK STATISTICS
In [ ]:
# IPython shell command: output directory for the rank-statistic files
mkdir RANK_STATISTIC/
In [ ]:
#create the list of Bayenv results files to be processed
import os

# Bug fix: the directory created above is 'running_Bayenv' (capital B, see
# the mkdir cell and the HPC note); on a case-sensitive filesystem
# './running_bayenv/' does not exist.
bayenv_res_dir = './running_Bayenv/'
bayenv_files = []
for fil in os.listdir(bayenv_res_dir):
    # Bayenv writes its Bayes-factor results to *.bf files
    if fil.endswith(".bf"):
        # os.path.join avoids the doubled slash from 'dir/' + '/' + name
        path = os.path.join(bayenv_res_dir, fil)
        print(path)
        bayenv_files.append(path)
In [ ]:
# Python 2 prints: sanity-check the collected .bf files, their count, and
# the environmental-variable IDs
print bayenv_files
print "\n%i" %len(bayenv_files)
print IDs
In [ ]:
# Compute rank statistics across the replicate Bayenv result files, keyed
# to the SNP ids in the SNPmap; writes per-variable output files under the
# given prefix and returns the results for later reuse (see the
# exclude_extreme_rep cell below).
rank_results = QTL.calculate_rank_stats(SNP_map="SNPfiles/full_set.bayenv.SNPmap", infiles = bayenv_files, ids = IDs, prefix = 'RANK_STATISTIC/Diplo_k_1M')
CREATE POPE PLOTS and extract the SNP ids in the top 5 percent (assumes that the script pope_plot.sh is in your working directory)
In [ ]:
# Python 2 print: the variable IDs used to locate the rank files below
print IDs
In [ ]:
# Pair every environmental-variable ID with its full rank-statistic file
# from RANK_STATISTIC/ ('break' keeps only the first match per ID).
# Fix: renamed loop variables so the builtins 'id' and 'file' are no
# longer shadowed.
full_rank_files = []
file_dir = 'RANK_STATISTIC/'
for var_id in IDs:
    for fname in os.listdir(file_dir):
        if fname.endswith('_' + var_id + '.txt'):
            full_rank_files.append([var_id, file_dir + '/' + fname])
            break
In [ ]:
# Python 2 print: the [id, path] pairs collected above
print full_rank_files
In [ ]:
# Pope plots and SNP ids in the top 5 percent (cutoff=0.95), 10 replicates
QTL.plot_pope(files_list=full_rank_files, cutoff=0.95, num_replicates=10)
CREATE POPE PLOTS and extract the SNP ids in the top 1 percent
In [ ]:
# Same plots for the top 1 percent (cutoff=0.99)
QTL.plot_pope(files_list=full_rank_files, cutoff=0.99, num_replicates=10)
find genes up and downstream of correlated SNPs
In [ ]:
#make list desired rank statistic tsv files
# Fix: renamed the loop variable so the builtin 'file' is not shadowed;
# parenthesized print works under both Python 2 and 3, matching the
# print() call used when collecting the .bf files above.
import os
file_dir = 'RANK_STATISTIC/'
rank_stats_files = []
for fname in os.listdir(file_dir):
    if fname.endswith('.tsv'):
        print(file_dir + '/' + fname)
        rank_stats_files.append(file_dir + '/' + fname)
parse a gff file
In [ ]:
# Parse the gene annotation file; per the variable name the records are
# presumably keyed per scaffold — TODO confirm against QTL.parse_gff
gff_per_scaffold = QTL.parse_gff(gff='Metriaclima_zebra.BROADMZ2.gtf')
identify genes within a defined distance (in kb) up and down-stream of the SNPs
In [ ]:
# Genes within 15 kb up/down-stream of the correlated SNPs, per analysis
genes_per_analysis = QTL.find_genes(rank_stats = rank_stats_files, gff = gff_per_scaffold, distance = 15)
annotated relevant genes based on blast2go annotation table
In [ ]:
# Attach blast2go annotations to the genes found above
QTL.annotate_genes(SNPs_to_genes=genes_per_analysis, annotations='blast2go_table_20150630_0957.txt')
In [ ]:
# IPython shell command: output directory for the candidate tables
mkdir find_genes
write summary table for SNPs and relevant genes in the vicinity
In [ ]:
# Summary tables for every analysis (whitelist = all keys found)
QTL.write_candidates(SNPs_to_genes=genes_per_analysis, whitelist=genes_per_analysis.keys(), out_dir='./find_genes/')
In [ ]:
A strategy for removing noise could be to remove the most extreme Bayenv results and recalculate rank stats
In [ ]:
# IPython shell command: directory for the reduced rank statistics
mkdir RANK_STATISTIC_reduced
In [ ]:
# Noise-reduction strategy (see prose above): drop the most extreme Bayenv
# replicate and recompute rank statistics from the stored rank_results
QTL.exclude_extreme_rep(dictionary = rank_results, ids = IDs, prefix = 'RANK_STATISTIC_reduced/Diplotaxodon_reduced')
In [ ]:
# Pair every environmental-variable ID with its reduced ('_ex_rep')
# rank-statistic file ('break' keeps only the first match per ID).
# Fix: renamed loop variables so the builtins 'id' and 'file' are no
# longer shadowed.
reduced_rank_files = []
file_dir = 'RANK_STATISTIC_reduced/'
for var_id in IDs:
    for fname in os.listdir(file_dir):
        if '_' + var_id + '_ex_rep' in fname and fname.endswith('.txt'):
            reduced_rank_files.append([var_id, file_dir + '/' + fname])
            break
In [ ]:
# Python 2 print: the [id, path] pairs for the reduced runs
print reduced_rank_files
In [ ]:
# 9 replicates now: one extreme replicate was excluded per variable
QTL.plot_pope(files_list=reduced_rank_files, cutoff=0.95, num_replicates=9)
find genes up and downstream of correlated SNPs
In [ ]:
#make list desired rank statistic tsv files
# Fix: renamed the loop variable so the builtin 'file' is not shadowed;
# parenthesized print works under both Python 2 and 3, matching the
# print() call used when collecting the .bf files above.
import os
file_dir = 'RANK_STATISTIC_reduced/'
rank_stats_files = []
for fname in os.listdir(file_dir):
    if fname.endswith('.tsv'):
        print(file_dir + '/' + fname)
        rank_stats_files.append(file_dir + '/' + fname)
In [ ]:
# IPython shell command: output directory for the reduced candidate tables
mkdir find_genes_reduced/
In [ ]:
# Same gene search as above, now on the reduced rank statistics
genes_per_analysis = QTL.find_genes(rank_stats = rank_stats_files, gff = gff_per_scaffold, distance = 15)
In [ ]:
# Attach blast2go annotations to the genes found above
QTL.annotate_genes(SNPs_to_genes=genes_per_analysis, annotations='blast2go_table_20150630_0957.txt')
In [ ]:
# NOTE(review): duplicate of the mkdir above — this second call fails with
# 'File exists' if the earlier cell was already run
mkdir find_genes_reduced/
In [ ]:
# Summary tables for every analysis (whitelist = all keys found)
QTL.write_candidates(SNPs_to_genes=genes_per_analysis, whitelist=genes_per_analysis.keys(), out_dir='./find_genes_reduced/')