In [1]:
"""Imports"""
#For importing data and parsing data
from operator import itemgetter
import pprint
#Converting parsed data into raw parsed data output to csv
import csv
from itertools import islice, izip
#For analyzing raw parsed data
import collections, re
#had to uninstall python-dateutil and use old version dateutil 2.2 to avoid error
#sudo pip uninstall python-dateutil
#sudo pip install python-dateutil==2.2
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.gridspec as gridspec
#had to install this using pip on local computer
from natsort import natsorted, natsort_key
import matplotlib
In [2]:
"""Methods and Global Variables"""
#For importing data and parsing
synmap_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/GecoFractBias/16888_25462.CDS-CDS.last.tdd10.cs0.filtered.dag.all.go_D20_g10_A5.aligncoords.Dm120.ma1.qac2.1.10.gcoords.txt'
gff_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/GecoFractBias/Ananas_comosus_pineapple_annos1-cds0-id_typename-nu1-upa1-add_chr0.gid25462.gff'
d = {} # initialize dictionary to contain the array of syntenic genome1_chrs, genome1_genes, genome2_chrs, and genome2_genes
genus_species = ''
with open(gff_import_file) as gff_file:
    for line in gff_file:
        if line[0:15] == '##Organism name':
            genus_species = line[17:-1]
species_name = genus_species.replace(' ', '_')
species_name_filter = species_name.translate(None, '(){}[]')
#Parsed data and raw output to csv
gff_sort_output_file = ("/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataOutput/Pina-Rice/geco_output/ALL_GFF_sorted_"+str(species_name_filter)+ ".txt")
synmap_dictionary_output_file = ("/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataOutput/Pina-Rice/geco_output/ALL_dictionary_syntenic genes_" +str(species_name_filter)+ ".txt")
gff_genes = {} # initializes dictionary for organization of genes on chromosomes within genome1 according to start bp
fract_bias_raw_output_file = ("/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataOutput/Pina-Rice/geco_output/ALL_fractbias_" +str(species_name_filter)+ "output.csv")
#Analysis of parsed data
retention_calc_output_file = ("/Users/bjoyce3/Desktop/SynMapFractBiasAnalysis/DataOutput/Pina-Rice/geco_output/Window_output_"+str(species_name_filter+".csv"))
target_lst = []
query_lst = []
def chr_id(input_dict):
    for item in input_dict:
        if item not in target_lst:
            target_lst.append(item)
        for gene in input_dict[item]:
            for chr in input_dict[item][gene]:
                if chr not in query_lst:
                    query_lst.append(chr)
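# For example (hypothetical chromosome names): after chr_id(d) runs, target_lst
# might hold ['LG01', 'LG02', ...] (pineapple linkage groups) and query_lst
# ['Chr1', 'Chr2', ...] (rice chromosomes); both lists are natsorted in the
# CSV-output step below before being used.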
#http://stackoverflow.com/questions/6822725/rolling-or-sliding-window-iterator-in-python
def window(seq, n):
    """Returns a sliding window (of width n) over data from the iterable:
    s -> (s0, s1, ..., s[n-1]), (s1, s2, ..., sn), ...
    """
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result
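# Quick illustration (hypothetical values): window([1, 2, 3, 4, 5], 3) yields
# (1, 2, 3), (2, 3, 4), (3, 4, 5). Below it is applied to lists of True/False
# presence flags with n=100, so sum(each_window) counts the syntenic genes
# retained within a 100-gene sliding window.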
#DATAPaths
#synmap_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasInput/Pina-rice/SynMapKsMerge120Sdepth10_Osativa-Acomosus2-1Acomv6.txt'
#gff_import_file = '/Users/bjoyce3/Desktop/SynMapFractBiasInput/Sorghum-MaizeSynMap/Sorghum_bicolor_annos1-cds0-id_typename-nu1-upa1-add_chr0.gid6807.gff'
In [3]:
"""Importing
Reads SynMap and GFF CDS files and parse data into columns in array
"""
with open(synmap_import_file, 'r') as f:  # open SynMap file containing syntenic gene pairs
    cols = []  # list for parsed columns from SynMap data
    for line in f:
        new_line = line.replace('||', '\t')  # converts || into tabs for uniform delimiting
        if line[0] != '#' and line[0] != '\n':  # skips comment/blank lines holding syntenic block headings
            cols = new_line.split('\t')  # splits each syntenic gene pair line into parsed columns
            target_chr = cols[15]  # genome1 (target) chromosome for this syntenic pair
            target_gene = str(cols[18])  # genome1 (target) gene name
            query_chr = str(cols[3])  # genome2 (query) chromosome syntenic to the target gene
            query_gene = str(cols[6])  # genome2 (query) gene name syntenic to the target gene
            if target_chr not in d:
                d[target_chr] = {}  # initializes the nested dictionary: primary level keyed by genome1 chromosome
            if target_gene not in d[target_chr]:
                d[target_chr][target_gene] = {}  # second level keyed by genome1 gene
            if query_chr not in d[target_chr][target_gene]:
                d[target_chr][target_gene][query_chr] = query_gene  # third level keyed by genome2 chromosome
#print cols[15]
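# Resulting structure (hypothetical gene/chromosome names for illustration):
# d = {'LG01': {'Aco000001.1': {'Chr1': 'LOC_Os01g01010.1',
#                               'Chr5': 'LOC_Os05g02300.1'}}}
# i.e. d[target_chr][target_gene][query_chr] -> syntenic query gene name.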
In [4]:
'''Reads GFF from genome1 (target) and parses data'''
with open(gff_import_file, 'r') as g:  # opens gff file
    gffcols = []  # list of parsed gff columns
    chr = []  # placeholder; reassigned to the chromosome name of each mRNA line
    for line in g:
        new_line = line.replace(';', '\t')  # makes subdelimiters uniform in the gff file from CoGe
        new_line = new_line.replace('Name=', '')  # strips Name= off gene_name in the gff file from CoGe
        #new_line = new_line.replace('LG', '')
        if new_line[0] != '#' and new_line[0] != '\n':  # skips comment and blank lines
            gffcols = new_line.split('\t')  # parses all columns
            if gffcols[2] == 'mRNA' and 'scaffold' not in gffcols[0]:  # keeps only mRNA features on assembled chromosomes
                chr = gffcols[0]  # genome1 chromosome
                gene_name = gffcols[10]  # genome1 gene name
                start = int(gffcols[3])  # genome1 gene start bp, as integer for ordering
                stop = int(gffcols[4])  # genome1 gene stop bp, as integer
                if chr not in gff_genes:
                    gff_genes[chr] = []  # initializes the chromosome's gene list on first encounter
                gff_genes[chr].append(dict(gene_name=gene_name, start=start, stop=stop))

'''Sorts GFF genes within each chromosome by start position'''
for chr in gff_genes:
    gff_genes_sorted = sorted(gff_genes[chr], key=itemgetter('start'))  # CONSIDER sorting on gene midpoints rather than start positions
    gff_genes[chr] = gff_genes_sorted
#CONSIDER WRITING A CHECK PROGRAM TO RETURN TRUE IF ALL VALUES ARE SORTED OR FALSE
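# After sorting, each chromosome maps to a start-ordered list of gene records,
# e.g. (hypothetical values):
# gff_genes = {'LG01': [{'gene_name': 'Aco000001.1', 'start': 1500, 'stop': 4200},
#                       {'gene_name': 'Aco000002.1', 'start': 5100, 'stop': 7800}, ...]}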
In [5]:
'''Writes out SynMap dictionary and sorted GFF gene list to document for parsed output'''
with open(str(gff_sort_output_file), 'w') as h:
    h.write(str(gff_genes))
with open(synmap_dictionary_output_file, 'w+') as i:
    i.write(str(d))
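# Note: both files are plain str() dumps of the Python data structures, intended
# for manual inspection rather than for programmatic re-import downstream.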
In [ ]:
'''Determine syntenic gene pairs present and output Raw Data CSV file from parsed data'''
chr_id(d)
target_lst = natsorted(target_lst)
query_lst = natsorted(query_lst)
windanalysis_input_dict = {}
with open(str(fract_bias_raw_output_file), 'w') as csvfile:
    headers = ['Target Chromosome', 'Target Gene Name', 'Gene Order on Target Chromosome']
    headers.extend(query_lst)
    headers.extend(query_lst)
    writer = csv.writer(csvfile, dialect='excel', delimiter=',', lineterminator='\n')
    writer.writerow(headers)
    for tchr in gff_genes:
        col0 = tchr  # target (pineapple) chromosome name
        count = 0
        for diction in gff_genes[tchr]:
            gene = diction['gene_name']
            col1 = gene  # target (pineapple) gene name
            count += 1
            col2 = count  # gene order along the target chromosome
            #Find the query chr genes and output to columns: gff info first (cols 0-2), then query chr presence flags, then query chr gene names
            syntenic_query_gene_presence_data = []
            syntenic_query_gene_name = []
            for qchr in query_lst:
                if tchr not in windanalysis_input_dict:
                    windanalysis_input_dict[tchr] = {}  # initializes the nested dictionary: primary level keyed by target chromosome
                if qchr not in windanalysis_input_dict[tchr]:
                    windanalysis_input_dict[tchr][qchr] = []  # second level keyed by query chromosome, holding a list of presence flags
                try:
                    syn_gene = d[tchr][gene][qchr]
                    syntenic_query_gene_presence_data.append(True)
                    syntenic_query_gene_name.append(syn_gene)
                    windanalysis_input_dict[tchr][qchr].append(True)
                except KeyError:
                    syntenic_query_gene_presence_data.append(False)
                    syntenic_query_gene_name.append(".")
                    windanalysis_input_dict[tchr][qchr].append(False)
            rows = [tchr, col1, col2]
            rows.extend(syntenic_query_gene_presence_data)
            rows.extend(syntenic_query_gene_name)
            writer.writerow(rows)
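# Each output row therefore looks like (hypothetical values):
# ['LG01', 'Aco000001.1', 1, True, False, ..., 'LOC_Os01g01010.1', '.', ...]
# i.e. one True/False presence flag per query chromosome (matching the first
# headers.extend(query_lst)), followed by the matching syntenic gene name or '.'
# for each query chromosome (matching the second extend).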
In [ ]:
# '''Analysis: for each chromosome in genome1 read the genes on a chromosome and compare to subgenome array of syntenic genes'''
data_output0 = []
data_output1 = []
data_output2 = []
data_output3 = []
output_dict = {}
#Process windows 100genes/sliding window and
#output to nested dictionary data structure output_dict{target chr:}{query chr}{window count:retention%}
for tchr in windanalysis_input_dict:
    tchr_counter = tchr
    for qchr in windanalysis_input_dict[tchr]:
        counter = 0
        qchr_counter = qchr
        if tchr not in output_dict:
            output_dict[tchr] = {}  # initializes the nested dictionary: primary level keyed by target chromosome
        if qchr not in output_dict[tchr]:
            output_dict[tchr][qchr] = {}  # second level keyed by query chromosome, holding window retention counts
        try:
            if len(windanalysis_input_dict[tchr][qchr]) >= 100:
                for each in window(windanalysis_input_dict[tchr][qchr], 100):
                    counter += 1
                    data_output2 = sum(each)
                    output_dict[tchr][qchr][counter] = data_output2
        except KeyError:
            continue
#Sort output_dict by target chromosome name (alphanumeric) at the top level
alphanumbsort = lambda k, v: [k, int(v)]
output_dict = collections.OrderedDict(sorted(output_dict.items(), key=lambda t: alphanumbsort(*re.match(r'([a-zA-Z]+)(\d+)', t[0]).groups())))
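# Each stored value is sum() over a 100-gene window of True/False flags, i.e. the
# number of retained syntenic genes out of 100, so the count doubles as a percent.
# The sort splits chromosome names like 'LG2'/'LG10' into ('LG', 2)/('LG', 10),
# so, for example, ['LG1', 'LG10', 'LG2'] orders as ['LG1', 'LG2', 'LG10'].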
"""#Output processed data to a csv file for downstream analysis
with open(retention_calc_output_file, 'wb') as csvf:
headers = ['Target Chromosome', 'Query Chromosome' 'Window Iteration (x-axis)']
headers.extend(query_lst)
writer = csv.writer(csvf, dialect='excel', delimiter=',', lineterminator='\n')
writer.writerow(headers)
for tchr in windanalysis_input_dict:
for qchr in windanalysis_input_dict[tchr]:
#Prints into two columns
writer.writerows(izip( output_dict[tchr][qchr]))"""
##Statistics Output NEEDS FIXING
#for tchr in output_dict:
#    for qchr in output_dict[tchr]:
#        print np.mean(output_dict[tchr][qchr])
#        print np.median_grouped(output_dict[tchr][qchr])
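## A possible fix (sketch only, not the tested code above): summarize the
## per-window retention counts for each target/query chromosome pair, e.g.
# for tchr in output_dict:
#     for qchr in output_dict[tchr]:
#         vals = output_dict[tchr][qchr].values()
#         if vals:
#             print tchr, qchr, np.mean(vals), np.median(vals)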
In [ ]:
%matplotlib inline
#define figure size, column layout, grid layout
figsize = (18, 60)
cols = 2
gs = gridspec.GridSpec(len(output_dict) // cols + 1, cols)
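# For example (illustrative numbers), with 25 target chromosomes and cols = 2 this
# allocates a 13 x 2 grid; subplot i is later placed at row i // 2, column i % 2.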
# These are the "Tableau 20" colors as RGB
tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
(44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
(148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
(227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
(188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
tableau10blind = [(0, 107, 164), (255, 128, 14), (171, 171, 171), (89, 89, 89),
(95, 158, 209), (200, 82, 0), (137, 137, 137), (163, 200, 236),
(255, 188, 121), (207, 207, 207)]
# Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts
for i in range(len(tableau20)):
    r, g, b = tableau20[i]
    tableau20[i] = (r / 255., g / 255., b / 255.)
for i in range(len(tableau10blind)):
    r, g, b = tableau10blind[i]
    tableau10blind[i] = (r / 255., g / 255., b / 255.)
fig = plt.figure(figsize=figsize)
subplt_count = -1
ax = []
for tchr in output_dict:
    subplt_count += 1
    print "Plotting target chromosome: " + str(tchr)
    count = 0
    row = subplt_count // cols
    col = subplt_count % cols
    ax.append(fig.add_subplot(gs[row, col]))
    for qchr in output_dict[tchr]:
        count += 1
        if output_dict[tchr][qchr] and max(output_dict[tchr][qchr].itervalues()) > 0:  # skip query chrs with no retained windows
            x = output_dict[tchr][qchr].keys()
            y = output_dict[tchr][qchr].values()
        else:
            continue
        ax[-1].spines["top"].set_visible(False)
        ax[-1].spines["right"].set_visible(False)
        ax[-1].get_xaxis().tick_bottom()
        ax[-1].get_yaxis().tick_left()
        ax[-1].plot(x, y, color=tableau20[count], lw=3)
        ax[-1].set_title(label='Target Chromosome: ' + species_name_filter + " " + tchr, fontweight='bold', fontsize=14)
        ax[-1].set_xlabel('Window Iteration', fontsize=12, fontweight='bold')
        ax[-1].set_ylabel('% Retention', fontsize=12, fontweight='bold')
        ax[-1].legend(output_dict[tchr], loc=1, frameon=False, title="Query Chromosome", fontsize=10)
import operator
fig.tight_layout()
In [ ]:
In [ ]:
"""Export Processed Fractional Bias Data to JSON File
import json
import csv
with open("myFile.json", "w") as f:
json.dump(output_dict,f)
x = json.loads(output_dict)
f = csv.writer(open("test.csv", "wb+"))
# Write CSV Header, If you dont need that, remove this line
#f.writerow(["pk", "model", "codename", "name", "content_type"])
for x in x:
    f.writerow([x["tchr"],
                x["tchr"]["qchr"]])
"""
In [ ]:
'''import os
from lightning import Lightning
from numpy import random, asarray, arange
from sklearn import datasets
from scipy.ndimage.filters import gaussian_filter
from seaborn import color_palette
# replace with your own host and credentials, e.g. http://localhost:3000 or http://my-lightning-server.herokuapp.com
host = 'http://lightning-docs.herokuapp.com'
auth = (os.getenv('LIGHTNING_USERNAME'), os.getenv('LIGHTNING_PASSWORD'))
lgn = Lightning(ipython=True, host=host, auth=auth)
lgn.create_session('fractbias');'''