In [3]:
from Bio import motifs
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Seq import MutableSeq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC, generic_dna, generic_protein
from collections import defaultdict
import re
import pandas as pd
import numpy as np
import os, sys
In [4]:
#####################
## sys Inputs - to do
#####################
## So I can run in shell and loop through sequences AND motifs to get a giant dataset
## Need to get the alignment to raw sequence
## read in alignment as a list of sequences
# alignment = list(SeqIO.parse(sys.argv[1], "fasta"))
# motif = motifs.read(open(sys.argv[2],"pfm")
alignment = list(SeqIO.parse("../data/fasta/output_ludwig_eve-striped-2.fa", "fasta"))
motif = motifs.read(open("../data/PWM/transpose_fm/bcd_FlyReg.fm"),"pfm")
raw_sequences = []
for record in alignment:
raw_sequences.append(SeqRecord(record.seq.ungap("-"), id = record.id))
In [12]:
##################
## Input 1 - Alignment Input
##################
## Sort alphabetically
alignment = [f for f in sorted(alignment, key=lambda x : x.id)]
## Make all ids lowercase
for record in alignment:
record.id = record.id.lower()
## Check
print("Found %i records in alignment file" % len(alignment))
## Sequence Length should be the same for all alignment sequences
for seq in alignment:
print len(seq)
In [3]:
#####################
## Input 2 - Raw Sequences Input
#####################
print("Found %i records in raw sequence file" % len(raw_sequences))
## make all IUPAC.IUPACUnambiguousDNA()
raw_sequences_2 = []
for seq in raw_sequences:
raw_sequences_2.append(Seq(str(seq.seq), IUPAC.IUPACUnambiguousDNA()))
In [4]:
#####################
## Input 3 - Motif Input
#####################
## motif.weblogo("mymotif.png")
print(motif.counts)
pwm = motif.counts.normalize(pseudocounts=0.0) # Doesn't change from pwm
pssm = pwm.log_odds()
print(pssm) # Why do I need log odds exactly?
motif_length = len(motif) #for later retrival of nucleotide sequence
In [15]:
######################
## Searching for Motifs in Sequences
######################
## Returns a list of arrays with a score for each position
## This give the score for each position
## If you print the length you get the length of the sequence minus TFBS length.
## Forward stand
pssm_list = [ ]
for seq in raw_sequences_2:
pssm_list.append(pssm.calculate(seq))
In [16]:
##########################
## Automatic Calculation of threshold
##########################
## Ideal to find something that automatically calculates, as
## opposed to having human choosing.
## Approximate calculation of appropriate thresholds for motif finding
## Patser Threshold
## It selects such a threshold that the log(fpr)=-ic(M)
## note: the actual patser software uses natural logarithms instead of log_2, so the numbers
## are not directly comparable.
distribution = pssm.distribution(background=motif.background, precision=10**4)
patser_threshold = distribution.threshold_patser() #for use later
print("Patser Threshold is %5.3f" % patser_threshold) # Calculates Paster threshold.
In [17]:
###################################
## Searching for motif in all raw_sequences
#################################
raw_id = []
for seq in raw_sequences:
raw_id.append(seq.id)
record_length = []
for record in raw_sequences_2:
record_length.append(len(record))
position_list = []
for i in range(0,8):
for position, score in pssm.search(raw_sequences_2[i], threshold = patser_threshold):
positions = {'species': raw_id[i], 'score':score, 'position':position, 'seq_len': record_length[i] }
position_list.append(positions)
position_DF = pd.DataFrame(position_list)
In [18]:
#############################
## Add strand and pos position information as columns to position_DF
#############################
position_list_pos = []
for i, x in enumerate(position_DF['position']):
if x < 0:
position_list_pos.append(position_DF.loc[position_DF.index[i], 'seq_len'] + x)
else:
position_list_pos.append(x)
## append to position_DF
position_DF['raw_position'] = position_list_pos
## print position_DF['raw_position']
## strand Column
strand = []
for x in position_DF['position']:
if x < 0:
strand.append("negative")
else:
strand.append("positive")
## append to position_DF
position_DF['strand'] = strand
## motif_found column
## First turn into a list of strings
raw_sequences_2_list = []
for seq in raw_sequences_2:
raw_sequences_2_list.append(str(seq))
## Now get all motifs found in sequences
# motif_found = []
# for x in position_DF['position']:
# motif_found.append(raw_sequences_2_list[i][x:x + motif_length])
## append to position_DF
# position_DF['motif_found'] = motif_found
print position_DF
## Check
## len(motif_found)
## print(motif_found)
## print(position_DF)
In [19]:
##################
## get alignment position
#################
## Need to map to the sequence alignment position
remap_list = []
nuc_list = ['A', 'a', 'G', 'g', 'C', 'c', 'T', 't', 'N', 'n']
positions = {'score':score, 'position':position, 'species': i}
position_list.append(positions)
for i in range(0,9):
counter = 0
for xInd, x in enumerate(alignment[i].seq):
if x in nuc_list:
remaps = {'raw_position': counter, 'align_position':xInd, 'species':alignment[i].id}
counter += 1
remap_list.append(remaps)
remap_DF = pd.DataFrame(remap_list)
## Check
## print_full(remap_DF)
In [24]:
## Merge both datasets
## Check first
## print(position_DF.shape)
## print(remap_DF.shape)
## Merge - all sites
TFBS_map_DF_all = pd.merge(position_DF, remap_DF, on=['species', 'raw_position'], how='outer')
## Sort
TFBS_map_DF_all = TFBS_map_DF_all.sort_values(by=['species','align_position'], ascending=[True, True])
## Check
## print_full(TFBS_map_DF_all)
## print(TFBS_map_DF_all.shape)
# Merge - only signal
TFBS_map_DF_only_signal = pd.merge(position_DF, remap_DF, on=['species', 'raw_position'], how='inner')
TFBS_map_DF_only_signal = TFBS_map_DF_only_signal.sort_values(by=['species','align_position'], ascending=[True, True])
## To quickly check if species share similar TFBS positions
## print_full(TFBS_map_DF_only_signal.sort_values(by=['raw_position'], ascending=[True]))
## Check
## print_full(TFBS_map_DF_only_signal)
## print(TFBS_map_DF_only_signal.shape)
print(TFBS_map_DF_all)
## Write out Files
## TFBS_map_DF_all.to_csv('../data/outputs/TFBS_map_DF_all_bicoid_test.csv', sep='\t', na_rep="NA")
In [21]:
###############################
## Print binary info for each species
################################
# Create new column, 1 if TFBS is present, 0 if absent
TFBS_map_DF_all['presence'] = 0 # For some reason you have to initaite the column first.
TFBS_map_DF_all['presence'] = TFBS_map_DF_all.apply(lambda x: x.notnull(), axis=1)
TFBS_map_DF_all['presence'] = TFBS_map_DF_all['presence'].astype(int)
## Check
## print_full(TFBS_map_DF_all)
## Create new dataframe
## Check First
## list(TFBS_map_DF_all.columns)
TFBS_map_DF_binary = TFBS_map_DF_all[['species', 'presence', 'align_position', 'strand']].copy()
## Subset on strand
TFBS_map_DF_binary_positive = TFBS_map_DF_binary['strand'] == "positive"
TFBS_map_DF_binary_negative = TFBS_map_DF_binary['strand'] == "negative"
## Check
print(TFBS_map_DF_binary)
## Now long to wide
## [ ] NaNs are introduced here and not sure why and it is super annoying.
## - Maybe it has something to do with the negative and positive strands. These should be subsetted first.
## There should be two presense and absence...or maybe a 2 presented? This get back to the range issue.
## We should have the 1s maybe represent TFBS range, not just starting position.
TFBS_map_DF_binary = TFBS_map_DF_binary.pivot_table(index='species', columns='align_position', values='presence')
print(TFBS_map_DF_binary.iloc[:,6:40])
In [22]:
####################
## Attach input files name as a column
#####################
## Ideally I would attach the file name of the 1. raw sequence and 2. the motif being tested