In [8]:
from Bio import motifs
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC, generic_dna, generic_protein
import re
import pandas as pd
import numpy as np
import os, sys
In [9]:
## Input 1 - Alignment Input
## read in alignment as a list of sequences
alignment = list(SeqIO.parse("../data/fasta/outputgi_ludwig_eve-striped-2.fa", "fasta"))
## Check
print("Found %i records in alignment file" % len(alignment))
## Turn sequences into a list of strings
## They are no longer bio.seq.seq objects though
alignment_string_list = []
for seq in alignment:
alignment_string_list.append(str(seq.seq))
In [21]:
## Input 2 - Raw Sequences Input
raw_sequences = list(SeqIO.parse("../data/fasta/ludwig_eve-striped-2.fasta", "fasta"))
print("Found %i records in raw sequence file" % len(raw_sequences))
## make all IUPAC.IUPACUnambiguousDNA()
raw_sequences_2 = []
for seq in raw_sequences:
raw_sequences_2.append(Seq(str(seq.seq), IUPAC.IUPACUnambiguousDNA()))
In [11]:
## Input 3 - Motif Input
## [ ] This is where I need to loop through all PWMs in a file
bcd = motifs.read(open("../data/PWM/transpose_fm/bcd_FlyReg.fm"),"pfm")
print(bcd.counts)
pwm = bcd.counts.normalize(pseudocounts=0.0)
pssm = pwm.log_odds()
In [19]:
## Searching the Motifs in Sequences
## Returns a list of arrays with a score for each position
pssm_list = [ ]
for seq in raw_sequences_2:
pssm_list.append(pssm.calculate(seq))
## Approximate calculation of appropriate thresholds for motif finding
## Patser Threshold
## It selects such a threshold that the log(fpr)=-ic(M)
## note: the actual patser software uses natural logarithms instead of log_2, so the numbers
## are not directly comparable.
distribution = pssm.distribution(background=bcd.background, precision=10**4)
patser_threshold = distribution.threshold_patser()
print("Patser Threshold is %5.3f" % patser_threshold) # Calculates Paster threshold.
In [20]:
###################################
## [ ] Need to reiterate over raw_sequences_2
## [ ] When reiterating over raw_sequences_2, attach id
#################################
## This just gets all the scores for the chance of bcd in each position of the raw sequences
score_list = []
position_list = []
for position, score in pssm.search(raw_sequences_2[0], threshold= patser_threshold):
position_list.append(position)
score_list.append(score)
## Change position to positive
position_list_pos = []
for x in position_list:
if x < 0:
position_list_pos.append(905 + x)
else:
position_list_pos.append(x)
## Check
## print(position_list_pos)
strand = []
for x in position_list:
if x < 0:
strand.append("negative")
else:
strand.append("positive")
In [14]:
## get alignment position using `alignment_string_list`
remap_dict = {}
nuc_list = ['A', 'a', 'G', 'g', 'C', 'c', 'T', 't', 'N', 'n']
counter = 0
#######################
## [ ] Reiterate through all species?
## [ ] maybe create a list of dictionaries?
#######################
for xInd, x in enumerate(alignment_string_list[1]):
if x in nuc_list:
remap_dict[counter] = xInd
counter += 1
## Check
## print(remap_dict)
## Now find the value from the key??? Find the alignment posititon from raw position
align_pos = [remap_dict[x] for x in position_list_pos]
## check
print(align_pos)
print(alignment_id[1])
print(type(alignment_id[1]))
In [15]:
## Make dataframe that has everything
pos_df = pd.DataFrame(
{'raw_position': position_list,
'raw_position_pos_only': position_list_pos,
'alignment_position':position_list_pos,
'strand_direction': strand,
'score': score_list,
'species': alignment_id[1]
})
#############
## [ ] this needs to be a value for species!!
## like pos_df = alignment_id[1]
############
## pos_df['species'] = alignment_id[1]
print(pos_df)
In [ ]: