In [184]:
#For importing data and parsing data
from operator import itemgetter
import pprint
#Converting parsed data into raw parsed data output to csv
import csv
from itertools import islice, izip
#For analyzing raw parsed data
import collections, re
from collections import OrderedDict
#had to uninstall python-dateutil and use old version dateutil 2.2 to avoid error
#sudo pip uninstall python-dateutil
#sudo pip install python-dateutil==2.2
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.gridspec as gridspec
import seaborn as sns
#had to install this using pip on local computer
from natsort import natsorted, natsort_key
#Library to pull query genome chromosome information from CoGe API
import requests
In [185]:
#For importing data and parsing. synmap_import_file is used to build the main d{} data
#structure; gff_target_import_file supplies gene order along the target chromosomes.
# NOTE(review): indentation reconstructed from a flattened notebook export — verify
# nesting against the original .ipynb.
synmap_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/Sorghum-Maize/Oryza_Zea_16888_8082.txt'
gff_target_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/Sorghum-Maize/Oryza_sativa_japonica_Rice.gid16888.gff'
#gff_query_import_file = ('')
d = {}  # nested dict of syntenic pairs: {target_chr: {target_gene: {query_chr: query_gene}}}
#Determine the Genus species of the target genome from the GFF '##Organism name' header.
genus_species = ''
with open(gff_target_import_file) as gff_file:
    for line in gff_file:
        if line[0:15] == '##Organism name':
            genus_species = line[17:-1]  # strip header prefix and trailing newline; last match wins
#Build a filesystem-safe species label used in every output file name below.
species_name = genus_species.replace(' ', '_')
species_name_filter = species_name.translate(None, '(){}[]')  # Python 2 str.translate: deletes bracket chars
#Parsed data and raw output to csv
gff_genes_target = {}  # {target_chr: [{gene_name, start, stop}, ...]}, later sorted by start bp
#Output file paths (all derived from the detected species name)
gff_sort_output_file = ("/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataOutput/brassica/ALL_GFF_sorted_" + str(species_name_filter) + ".txt")
synmap_dictionary_output_file = ("/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataOutput/brassica/ALL_dictionary_syntenic genes_" + str(species_name_filter) + ".txt")
fract_bias_raw_output_file = ("/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataOutput/brassica/ALL_fractbias_" + str(species_name_filter) + "output.csv")
retention_calc_output_file = ("/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataOutput/brassica/Window_output_" + str(species_name_filter) + ".csv")
#Analysis of parsed data
target_lst = []  # target chromosome names seen in d (filled by chr_id)
query_lst = []   # query chromosome names seen in d (filled by chr_id)
##METHODS
#Identify each chromosome present in genomes from dictionaries made previously
def chr_id(input_dict):
    """Collect chromosome names out of the syntenic-pair dict.

    Appends first-level keys (target chromosomes) to the module-level
    target_lst and third-level keys (query chromosomes) to query_lst,
    keeping first-seen order and skipping duplicates.
    """
    for tchr in input_dict:
        if tchr not in target_lst:
            target_lst.append(tchr)
        for tgene in input_dict[tchr]:
            for qchr in input_dict[tchr][tgene]:
                if qchr not in query_lst:
                    query_lst.append(qchr)
#http://stackoverflow.com/questions/6822725/rolling-or-sliding-window-iterator-in-python
def window(seq, n):
    """Return a sliding window (of width n) over data from the iterable.

    s -> (s0, s1, ..., s[n-1]), (s1, s2, ..., sn), ...
    Yields nothing when seq holds fewer than n items.
    """
    buf = collections.deque(maxlen=n)
    for item in seq:
        buf.append(item)
        if len(buf) == n:  # buffer is full: emit one window per new item
            yield tuple(buf)
In [186]:
#synmap_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/Pina-Rice/pina_gdcoords_overlap40.txt'
#gff_target_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/Pina-Rice/Ananas_comosus_pineapple_v6.gff'
In [187]:
#synmap_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/Sorghum-Maize/SorghumMaize_gcoords_merge60_syntenicdepth40.txt'
#gff_target_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/Sorghum-Maize/Sorghum_bicolor_6807_prodrun.gff'
In [188]:
##A. thaliana - Brassica rapa (1:3)
#synmap_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/Brassicas/Athaliana_Brapa_gcoords_1-3.txt'
#gff_target_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/Brassicas/Arabidopsis_thaliana.gid25869.gff'
##A. thaliana - Brassica napus (1:6)
#synmap_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/Brassicas/Athaliana_Bnapus_1-6.txt'
#gff_target_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/Brassicas/Arabidopsis_thaliana.gid25869.gff'
##Brassica napus - Brassica rapa (2:1)
#synmap_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/Brassicas/Bnapus_Brapa_gcoords_2-1.txt'
#gff_target_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/Brassicas/Brassica_rapa.gid24668.gff'
##OUTPUT
#gff_sort_output_file = ("/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataOutput/brassica/ALL_GFF_sorted_"+str(species_name_filter)+ ".txt")
#synmap_dictionary_output_file = ("/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataOutput/brassica/ALL_dictionary_syntenic genes_" +str(species_name_filter)+ ".txt")
#fract_bias_raw_output_file = ("/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataOutput/brassica/ALL_fractbias_" +str(species_name_filter)+ "output.csv")
#retention_calc_output_file = ("/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataOutput/brassica/Window_output_"+str(species_name_filter+".csv"))
In [189]:
#gar-trout
#synmap_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/fish/gar-trout_syndepth40_maxks1-47.txt'
#gff_target_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/fish/.gff'
#pike-trout
#synmap_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/fish/pike-trout_syndepth40_ksmax1-1.txt'
#gff_target_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/fish/Esox_lucius_pike_25161.gff'
#Output
#gff_sort_output_file = ("/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataOutput/fish/ALL_GFF_sorted_"+str(species_name_filter)+ ".txt")
#synmap_dictionary_output_file = ("/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataOutput/fish/ALL_dictionary_syntenic genes_" +str(species_name_filter)+ ".txt")
#gff_genes_target = {} # initializes dictionary for organization of genes on chromosomes within genome1 according to start bp
#fract_bias_raw_output_file = ("/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataOutput/fish/ALL_fractbias_" +str(species_name_filter)+ "output.csv")
#Analysis of parsed data
#retention_calc_output_file = ("/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataOutput/fish/Window_output_"+str(species_name_filter+".csv"))
In [190]:
#Theobroma-poplar (1:2)
#synmap_import_file = ('/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/Vitis-poplar/Theobroma-poplar_1-2_gcoords.txt')
#gff_target_import_file = ('/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/Vitis-poplar/Theobroma_cacao_gid10997.gff')
#OUTPUT
#
#
#Vitis-poplar (1:2)
#synmap_import_file = ('/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/Vitis-poplar/Vitis_poplar_1-2_gcoords.txt')
#gff_target_import_file = ('/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataInput/Vitis-poplar/Vitis_vinifera_grape_gid19990.gff')
In [191]:
#Set target genome and query genome (CoGe genome IDs, as strings)
args_target = "16888"  # Oryza
# Previously used target genome IDs:
# "25869" arabidopsis, '6807' sorghum, '25734' pina, '25161' pike,
# '25003' gar, '19990' vitis vinifera, '10997' theobroma, '24668' Brassica rapa
#Set whether to use all genes or only genes with synteny
args_all_genes = True  # True == use all genes in target genome; False == use only genes with at least one syntenic pair
window_size = 100  # number of genes per sliding window in the retention analysis
#NEW ARGS TO ADD TO GECO
args_query = '8082'  # Zea mays
# Previously used query genome IDs:
# '24668' Brassica rapa, '8082' Zea mays, '20192' Brassica napus
args_numtargetchr = 12  # keep only the 12 longest target chromosomes
args_numquerychr = 10   # keep only the 10 longest query chromosomes
args_remove_random_unknown = True  # True == drop chromosomes named Random/Unknown; False == keep everything
In [192]:
"""Calls Genome Fetch CoGE API to get query genome and target chromosomes and orders them"""
#retrieves api chromsome lists and length and moves json() object into Python dictionary
query_api = requests.get("https://genomevolution.org/coge/api/v1/genomes/" + str(args_query))
target_api = requests.get("https://genomevolution.org/coge/api/v1/genomes/" + str(args_target))
query_api = query_api.json()
target_api = target_api.json()
#initializes dictionaries to drop api data into
target_api_chrs = []
target_api_chrs_sorted = []
target_api_chrs_sorted_filtered = []
target_api_chrs_sorted_name = []
target_api_chrs_final = []
query_api_chrs = []
query_api_chrs_sorted = []
query_api_chrs_sorted_filtered = []
query_api_chrs_sorted_name = []
query_api_chrs_final = []
#query genome chromosomes parsed, restricted by length largest -> smallest, and sorted according to name
for chr in query_api['chromosomes']:
if args_remove_random_unknown == True:
if "Random" in chr['name'] or "random" in chr['name'] or "Unknown" in chr['name'] or "unknown" in chr['name']:
continue
else:
query_api_chrs.append((chr['name'], chr['length']))
if args_remove_random_unknown == False:
query_api_chrs.append((chr['name'], chr['length']))
query_api_chrs_sorted = natsorted(query_api_chrs, key=lambda chr: chr[1], reverse=True)
query_api_chrs_sorted_filtered = query_api_chrs_sorted[0:args_numquerychr]
query_api_chrs_sorted_name = natsorted(query_api_chrs_sorted_filtered, key=lambda chr: chr[0])
for chr in query_api_chrs_sorted_name:
query_api_chrs_final.append(chr[0])
#target genome chromosomes parsed, restricted by length largest ->smallest, and sorted according to name
for chr in target_api['chromosomes']:
if args_remove_random_unknown == True:
if "Random" in chr['name'] or "random" in chr['name'] or "Unknown" in chr['name'] or "unknown" in chr['name']:
continue
else:
target_api_chrs.append((chr['name'], chr['length']))
if args_remove_random_unknown == False:
target_api_chrs.append((chr['name'], chr['length']))
target_api_chrs_sorted = natsorted(target_api_chrs, key=lambda chr: chr[1], reverse=True)
target_api_chrs_sorted_filtered = target_api_chrs_sorted[0:args_numtargetchr]
target_api_chrs_sorted_name = natsorted(target_api_chrs_sorted_filtered, key=lambda chr: chr[0])
for chr in target_api_chrs_sorted_name:
target_api_chrs_final.append(chr[0])
In [193]:
"""
Reads SynMap files and parses data into columns in array
"""
d = {}
with open(synmap_import_file, 'r') as f: # open SynMap file containing syntenic genes
synmap_rowcount = 0
cols = [] # list for parsing columns from SynMap data
decimal_strip_check_target_gene = 0
for line in f: # for loop to parse columns
new_line = line.replace('||', '\t') #converts || into tabs for universal delimination
if line[0] != '#' and line[0] != '\n': #sorts out columns containing syntenic block information/headings
cols = new_line.split('\t', ) #splits all syntenic gene pair lines into parsed columns in a list
synmap_rowcount += 1
global target_chr
global target_gene
global query_chr
global query_gene
if synmap_rowcount == 1:
#clean ID subgenome A from the column on left of data
ida = cols[0]
ida = ida[1:cols[0].index('_')]
#Determines which are target and query genes
if args_target == ida:
target_chr = cols[1]
if any(target_chr in s for s in target_api_chrs_final):
target_gene = str(cols[7]) #.rsplit(".", 1)[0] #puts all genome1_genes with synteny into a list
#decimal_strip_check_target_gene = target_gene.find('.')
#if not decimal_strip_check_target_gene == (-1):
#target_gene = target_gene[:(decimal_strip_check_target_gene)]
else:
continue
query_chr = str(cols[13]) #puts all genome2_chrs with synteny to genes in genome1 into a list
if any(query_chr in s for s in query_api_chrs_final):
query_gene = str(cols[19]) #.rsplit(".", 1)[0] #puts all genome2_genes with synteny to genes in a genome1 into a list
else:
continue
else:
target_chr = cols[13]
if any(target_chr in s for s in target_api_chrs_final):
target_gene = str(cols[19])
#decimal_strip_check_target_gene = target_gene.find('.')
#if not decimal_strip_check_target_gene == (-1):
# target_gene = target_gene[:(decimal_strip_check_target_gene)]
#puts all genome1_genes with synteny into a list
else:
continue
query_chr = str(cols[1]) #puts all genome2_chrs with synteny to genes in genome1 into a list
if any(query_chr in s for s in query_api_chrs_final):
query_gene = cols[7] #.rsplit(".", 1)[0] #puts all genome2_genes with synteny to genes in a genome1 into a list
else:
continue
if not target_chr in d:
d[target_chr] = {} #initializes the nested dictionary-primary level at genome1_chromosome
if not target_gene in d[target_chr]:
d[target_chr][target_gene] = {} #initializes first nesting in dictionary-second level at genome1_genes
if not query_chr in d[target_chr][target_gene]:
d[target_chr][target_gene][query_chr] = query_gene #initializes nested dictionary-third level at genome2_chr
#check lengths of synmap data structure
"""for tchr in d:
#print "Target Chromosome"
print tchr
#print len(d[tchr])
for tgene in d[tchr]:
#print "length of target gene"
#print len(d[tchr][tgene])
for qchr in d[tchr][tgene]:
#print "length of qchr"
#print len(d[tchr][tgene][qchr])
print " " + str(qchr)
#print d[tchr][tgene][qchr]"""
Out[193]:
In [194]:
"Create hashable data structure to look up gene/CDS name. CDS_name_dict: CDS_key_name: 1 if TRUE"
CDS_name_dict = {} #initializes the CDS name dictionary for looking up if CDS already have been identified
gff_genes_target = {}
'''Reads GFF from target genome and puts into data strcuture gff_genes'''
with open(gff_target_import_file, 'r') as g: # opens gff file
gffcols = [] #list of parsed gff columns
chr = [] #initialize list of chromosomes present in genome1 gff file
for line in g:
new_line = line.replace(';', '\t') #makes subdelims universal in gff file from CoGe
new_line = new_line.replace('Name=', '') #strips Name= off gene_name in gff file from CoGe
if new_line[0] != '#' and new_line[0] != '\n': #selects only lines with CDS information
gffcols = new_line.split('\t', ) #parses all column
if (any(gffcols[0] in s for s in target_api_chrs_final) and (gffcols[2] == 'CDS')): #selects only 'CDS' lines for consideration
chr = gffcols[0] #adds genome1_chrs to list
if not chr in gff_genes_target:
gff_genes_target[chr] = [] #initializes chr list in dictionary if chr does not exist yet
gene_name = str(gffcols[-1]) #[9]).rsplit(".", 1)[0]
#gene_name = gene_name.rsplit(".", 1)[0]
gene_name = gene_name.rstrip("\n")
gene_name1 = gene_name[9:] #adds targetgenome_genes to list and removes the "ID=" added by CoGe
start = int(gffcols[3]) #adds targetgenome_gene start bp to list for ordering as integer
stop = int(gffcols[4]) #adds targetgenome_gene stop bp to list ?for ordering? as integer
try:
CDS_name_dict[str(gene_name1)] == 1
continue
except KeyError:
gff_genes_target[chr].append(dict(gene_name=gene_name1, start=start, stop=stop))
CDS_name_dict[str(gene_name1)] = 1
#Checking GFF and ordered chromosome data structures
"""print "Length of CDS_NAME_DICT:"
print len(CDS_name_dict)
sm = 0
for tchr in gff_genes_target:
#if tchr == "A01":
# print gff_genes_target[tchr]
print "Length of gff_genes_target_tchr" + str(tchr)
print len(gff_genes_target[tchr])
sm = sm + len(gff_genes_target[tchr])
print sm"""
Out[194]:
In [195]:
'''Sorts GFF genes within target chromosomes by start position'''
# NOTE(review): indentation reconstructed from a flattened notebook export.
gff_sort_all = natsorted(gff_genes_target)  # naturally-ordered list of target chromosome names
for chr in gff_genes_target:
    # Replace each chromosome's gene list with a copy sorted by start bp.
    # CONSIDER sorting on the midpoint of genes rather than start.
    gff_genes_sorted = sorted(gff_genes_target[chr], key=itemgetter('start'))
    gff_genes_target[chr] = gff_genes_sorted
#CONSIDER WRITING A CHECK PROGRAM TO RETURN TRUE IF ALL VALUES ARE SORTED OR FALSE
In [196]:
'''Writes out SynMap dictionary and sorted GFF gene list to document for parsed output'''
with open(str(gff_sort_output_file), 'w') as h:
    h.write(str(gff_genes_target))  # repr dump of {chr: [gene dicts sorted by start]}
with open(synmap_dictionary_output_file, 'w+') as i:
    i.write(str(d))  # repr dump of the nested syntenic-pair dict
If args_all_genes == True: every target genome gene is reported. If args_all_genes == False: the SynMap data is filtered to remove any target genome gene that does not match at least one gene on the retained query genome chromosomes — specifically, any {target_chr: {target_gene: ...}} entry that is 'False' for all query chromosomes.
In [197]:
'''Determine syntenic gene pairs present and output Raw Data CSV file from parsed data'''
#Deprecated method call to determine genome chrs
chr_id(d)
target_lst = natsorted(target_lst)
query_lst = natsorted(query_lst)
print query_api_chrs_final
windanalysis_input_dict = {}
with open(str(fract_bias_raw_output_file), 'w') as csvfile:
headers = ['Target Chromosome', 'Target Gene Name', 'Gene Order on Target Chromosome']
headers.extend(query_api_chrs_final)
headers.extend(query_api_chrs_final)
writer = csv.writer(csvfile, dialect='excel', delimiter=',', lineterminator='\n')
writer.writerow(headers)
if args_all_genes == True:
for tchr in gff_genes_target:
col0 = chr #writes Pineapple chr number
count = 0
for diction in gff_genes_target[tchr]:
gene = diction['gene_name']
col1 = gene #writes gene name
count += 1
col2 = count #writes pineapple gene number on pineapple chr
#Find the query chr genes and output to columns: first gff info (col0-3), query chr (col 4-n), query chr-gene (col n+1-m)
syntenic_query_gene_presence_data = []
syntenic_query_gene_name = []
for qchr in query_api_chrs_final:
if not tchr in windanalysis_input_dict:
windanalysis_input_dict[tchr] = {} #initializes the nested dictionary-primary level at genome1_chromosome
if not qchr in windanalysis_input_dict[tchr]:
windanalysis_input_dict[tchr][qchr] = [] #initializes first nesting in dictionary-second level at genome1_genes
try:
syn_gene = d[tchr][gene][qchr]
syntenic_query_gene_presence_data.append(True)
syntenic_query_gene_name.append(syn_gene)
windanalysis_input_dict[tchr][qchr].append(True)
except KeyError:
syntenic_query_gene_presence_data.append(False)
syntenic_query_gene_name.append("")
windanalysis_input_dict[tchr][qchr].append(False)
rows = [tchr, col1, col2]
rows.extend(syntenic_query_gene_presence_data)
rows.extend(syntenic_query_gene_name)
writer.writerow(rows)
elif args_all_genes == False:
for tchr in gff_genes_target:
col0 = chr #writes target chr number
count = 0
for diction in gff_genes_target[tchr]:
gene = diction['gene_name']
col1 = gene #writes target gene name
count += 1
col2 = count #writes target gene number on target chr
#Find the query chr genes and output to columns: first gff info (col0-3), query chr (col 4-n), query chr-gene (col n+1-m)
syntenic_query_gene_presence_data = []
syntenic_query_gene_name = []
for qchr in query_api_chrs_final:
if not tchr in windanalysis_input_dict:
windanalysis_input_dict[tchr] = {} #initializes the nested dictionary-primary level at genome1_chromosome
if not qchr in windanalysis_input_dict[tchr]:
windanalysis_input_dict[tchr][qchr] = [] #initializes first nesting in dictionary-second level at genome1_genes
try:
syn_gene = d[tchr][gene][qchr]
syntenic_query_gene_presence_data.append(True)
except KeyError:
syntenic_query_gene_presence_data.append(False)
if sum(syntenic_query_gene_presence_data) >= 1:
for qchr in query_api_chrs_final:
try:
syn_gene = d[tchr][gene][qchr]
syntenic_query_gene_name.append(syn_gene)
windanalysis_input_dict[tchr][qchr].append(True)
except KeyError:
syntenic_query_gene_name.append("")
windanalysis_input_dict[tchr][qchr].append(False)
rows = [tchr, col1, col2]
rows.extend(syntenic_query_gene_presence_data)
rows.extend(syntenic_query_gene_name)
writer.writerow(rows)
elif sum(syntenic_query_gene_presence_data) < 1:
continue
else:
print 'Target gene not classified'
continue
else:
print "Genes to be used (all genes or at least one syntenic gene) are not defined"
##FOR INVESTIGATING COMPARISON DATA STRUCTURE
"""print windanalysis_input_dict
print "Number of windanalysis_input chromosomes = "+str(len(windanalysis_input_dict))
for tchr in windanalysis_input_dict:
print "Name of windanalysis_input target chromosome = " + str(tchr)
print len(windanalysis_input_dict[tchr])
for qchr in windanalysis_input_dict[tchr]:
print qchr
print len(windanalysis_input_dict[tchr][qchr])"""
Out[197]:
In [198]:
# '''Analysis: for each chromosome in genome1 read the genes on a chromosome and compare to subgenome array of syntenic genes'''
# NOTE(review): indentation reconstructed from a flattened notebook export.
data_output0 = []
data_output1 = []
data_output2 = []
data_output3 = []
output_dict = {}  # {target_chr: {query_chr: {window_number: retention %}}}
alphanum_output_dict = {}
numsorted_output_dict = {}
#Process sliding windows (window_size genes each) and store % retention per window
#in output_dict{target chr:}{query chr}{window count: retention%}
for tchr in windanalysis_input_dict:
    tchr_counter = tchr
    for qchr in windanalysis_input_dict[tchr]:
        counter = 0  # window number (1-based)
        qchr_counter = qchr
        if not tchr in output_dict:
            output_dict[tchr] = {}
        if not qchr in output_dict[tchr]:
            output_dict[tchr][qchr] = {}
        try:
            # Skip chromosomes with fewer genes than one full window.
            if (int(len(windanalysis_input_dict[tchr][qchr]))) >= window_size:
                for each in window(windanalysis_input_dict[tchr][qchr], window_size):
                    counter += 1
                    # Fraction of True flags in the window -> % retained, rounded.
                    data_output2 = float(sum(each)) / float(window_size)
                    output_dict[tchr][qchr][counter] = round(data_output2 * 100.)
        except KeyError:
            continue
#Sort output_dict for tchr alphanumberic at top level
#if tchr are only integers (not alpha&numeric) the except statement will sort just integers
'''try:
alphanumbsort = lambda k,v: [k, int(v)]
output_dict = collections.OrderedDict(sorted(output_dict.items(), key=lambda t: alphanumbsort(*re.match(r'([a-zA-Z]+)(\d+)',t[0]).groups())))
print "alphanum"
except ValueError:
print "value_error"
except AttributeError:
#for tchr in output_dict:
#for qchr in output_dict[tchr]:
#numb_sort = lambda k,v: [int(k), int(v)]
numsorted_output_dict = sorted(output_dict.items(), key=lambda t: int(t[0])) #t: numb_sort(, t[0].groups())))
#numsorted_output_dict = sorted(numsorted_output_dict.keys(), key=lambda k: numsorted_output_dict[k])
output_dict = numsorted_output_dict
print "numsorted"
#print output_dict'''
Out[198]:
In [199]:
'''with open(retention_calc_output_file, 'wb') as csvf:
headers = ['Target Chromosome', 'Query Chromosome' 'Window Iteration (x-axis)']
headers.extend(query_lst)
writer = csv.writer(csvf, dialect='excel', delimiter=',', lineterminator='\n')
writer.writerow(headers)
for tchr in windanalysis_input_dict:
for qchr in windanalysis_input_dict[tchr]:
#Prints into two columns
writer.writerows(izip( output_dict[tchr][qchr]))'''
##Statistics Output NEEDS FIXING
#for tchr in output_dict:
#for qchr in output_dict[tchr]:
#print np.mean(output_dict[tchr][qchr])
#print np.median_grouped(output_dict[tchr][qchr])
Out[199]:
In [200]:
#Check size of SynMap nested dictionary d data structure
#for tchr in d:
# print "Target Chromosome"
# print tchr
# print len(d[tchr])
# for tgene in d[tchr]:
# print len(d[tchr][tgene])
#print d
#Check size of sorted GFF data structure
#print gff_genes_target
#print len(gff_genes_target)
#print windanalysis_input_dict
#print output_dict
#print (max(output_dict.itervalues()))
countofchrgraph = 0
listofchrgraph = []
for tchr in output_dict:
sumofchr = 0
for qchr in output_dict[tchr]:
try:
(max(output_dict[tchr][qchr].itervalues()))
sumofchr = sumofchr + (max(output_dict[tchr][qchr].itervalues()))
except ValueError:
continue
if sumofchr > 5:
countofchrgraph += 1
listofchrgraph.append(str(tchr))
listofchrgraph = natsorted(listofchrgraph)
print listofchrgraph
print countofchrgraph
type(countofchrgraph)
print len(target_lst)
print len(output_dict)
print len(query_api_chrs_final)
In [201]:
%matplotlib inline
#define figure size, column layout, grid layout
figsize = (15, 35) #(len(query_api_chrs_final)*1.00))
cols = 2
gs = gridspec.GridSpec(len(output_dict) // cols + 1, cols)
# These are the "Tableau 20" colors as RGB
tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
(44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
(148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
(227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
(188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
# Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts
for i in range(len(tableau20)):
r, g, b = tableau20[i]
tableau20[i] = (r / 255., g / 255., b / 255.)
#Alternate color palettes Seaborn/Brewer
current_palette = sns.color_palette("husl", 40) #("Set2", 30)
#tableau20
fig = plt.figure(figsize=figsize, frameon=False)
subplt_count = -1
ax = []
for tchr in listofchrgraph:
subplt_count += 1
print "Plotting subplot: "+str(subplt_count)
count = 0
row = (subplt_count // cols)
col = subplt_count % cols
ax.append(fig.add_subplot(gs[row, col]))
for qchr in output_dict[tchr]:
count += 1
try:
if (max(output_dict[tchr][qchr].itervalues()))>5:
x = output_dict[tchr][qchr].keys()
y = output_dict[tchr][qchr].values()
#Sets up plotting conditions
ax[-1].spines["top"].set_visible(False)
ax[-1].spines["right"].set_visible(False)
ax[-1].spines["left"].set_visible(True)
ax[-1].spines["bottom"].set_visible(True)
ax[-1].patch.set_visible(False)
ax[-1].get_xaxis().tick_bottom()
ax[-1].get_yaxis().tick_left()
ax[-1].plot(x, y, color=current_palette[count], lw=2, label=str(qchr))
ax[-1].set_title(label='Target Chromosome: '+species_name_filter+" "+ tchr, fontweight='bold', fontsize=14, y=1.1, loc='left')
ax[-1].set_xlabel('Window Iteration\n(Gene number)', fontsize=12, fontweight='bold')
ax[-1].set_ylabel('% Retention\n(#Syntenic genes/window size)', fontsize=12, fontweight='bold')
ax[-1].legend(bbox_to_anchor=(1.25, 1.13), loc=1, frameon=False, title=" Query\nChromosome", fontsize=10)
else:
continue
except ValueError:
continue
fig.tight_layout(pad=2, w_pad = 6)
#fig.subplots_adjust(wspace=0.45, hspace=0.35)
plt.savefig("/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/fractbias_figure"+str(species_name_filter)+".png", transparent=False)
In [ ]:
In [202]:
"""Export Processed Fractional Bias Data to JSON File
import json
import csv
with open("myFile.json", "w") as f:
json.dump(output_dict,f)
x = json.loads(output_dict)
f = csv.writer(open("test.csv", "wb+"))
# Write CSV Header, If you dont need that, remove this line
#f.writerow(["pk", "model", "codename", "name", "content_type"])
for x in x:
f.writerow([x["tchr"],
x["tchr"]["qchr"]])
"""
Out[202]: