In [1]:
from pkg_resources import resource_filename
data_dir = resource_filename("gimmemotifs", "../data/examples")
%cd {data_dir}
In [2]:
from gimmemotifs.motif import Motif,read_motifs
# Read from file
motifs = read_motifs("example.pfm")
for motif in motifs:
print(motif)
In [3]:
# Read from file to a dictionary
motifs = read_motifs("example.pfm", as_dict=True)
print(motifs)
In [4]:
# Read any motif database included with gimmemotifs by name
motifs = read_motifs("JASPAR2018_vertebrates")
print(len(motifs))
print(motifs[0].to_pwm())
In [5]:
# Create from scratch
m = Motif([[0,1,0,0],[0,0,1,0]])
m.id = "CpG"
print(m)
In [6]:
# Or from a consensus sequence
from gimmemotifs.motif import motif_from_consensus
ap1 = motif_from_consensus("TGASTCA")
print(ap1.to_pwm())
Read motifs from files in other formats.
In [7]:
with open("MA0099.3.jaspar") as f:
motifs = read_motifs(f, fmt="jaspar")
print(motifs[0])
You can convert a motif to several formats.
In [8]:
with open("example.pfm") as f:
motifs = read_motifs(f)
# pwm
print(motifs[0].to_pwm())
In [9]:
# pfm
print(motifs[0].to_pfm())
In [10]:
# consensus sequence
print(motifs[0].to_consensus())
In [11]:
# TRANSFAC
print(motifs[0].to_transfac())
In [12]:
# MEME
print(motifs[0].to_meme())
Some other useful tidbits.
In [13]:
m = motif_from_consensus("NTGASTCAN")
print(len(m))
In [14]:
# Trim by information content
m.trim(0.5)
print(m.to_consensus(), len(m))
In [15]:
# Slices
print(m[:3].to_consensus())
In [16]:
# Shuffle
random_motif = motif_from_consensus("NTGASTGAN").randomize()
print(random_motif)
To convert a motif to an image, use to_img()
. Supported formats are png, ps and pdf.
In [17]:
m = motif_from_consensus("NTGASTCAN")
m.to_img("ap1.png", fmt="png")
In [18]:
from IPython.display import Image
Image("ap1.png")
Out[18]:
In [19]:
from gimmemotifs.motif import motif_from_consensus
from gimmemotifs.fasta import Fasta
f = Fasta("test.small.fa")
m = motif_from_consensus("TGAsTCA")
m.pwm_scan(f)
Out[19]:
This return a dictionary with the sequence names as keys. The value is a list with positions where the motif matches. Here, as the AP1 motif is a palindrome, you see matches on both forward and reverse strand. This is more clear when we use pwm_scan_all()
that returns position, score and strand for every match.
In [20]:
m.pwm_scan_all(f)
Out[20]:
The number of matches to return is set to 50 by default, you can control this by setting the nreport
argument. Use scan_rc=False
to only scan the forward orientation.
In [21]:
m.pwm_scan_all(f, nreport=1, scan_rc=False)
Out[21]:
While this functionality works, it is not very efficient. To scan many motifs in potentially many sequences, use the functionality in the scanner
module. If you only want the best match per sequence, there is a utility function called scan_to_best_match
, otherwise, use the Scanner
class.
In [22]:
from gimmemotifs.motif import motif_from_consensus
from gimmemotifs.scanner import scan_to_best_match
m1 = motif_from_consensus("TGAsTCA")
m1.id = "AP1"
m2 = motif_from_consensus("CGCG")
m2.id = "CG"
motifs = [m1, m2]
print("motif\tpos\tscore")
result = scan_to_best_match("test.small.fa", motifs)
for motif, matches in result.items():
for match in matches:
print("{}\t{}\t{}".format(motif, match[1], match[0]))
The matches are in the same order as the sequences in the original file.
While this function can be very useful, a Scanner instance is much more flexible. You can scan different input formats (BED, FASTA, regions), and control the thresholds and output.
As an example we will use the file Gm12878.CTCF.top500.w200.fa
that contains 500 top CTCF peaks. We will get the CTCF motif and scan this file in a number of different ways.
In [24]:
from gimmemotifs.motif import default_motifs
from gimmemotifs.scanner import Scanner
from gimmemotifs.fasta import Fasta
import numpy as np
# Input file
fname = "Gm12878.CTCF.top500.w200.fa"
# Select the CTCF motif from the default motif database
motifs = [m for m in default_motifs() if "CTCF" in m.factors['direct']][:1]
# Initialize the scanner
s = Scanner()
s.set_motifs(motifs)
Now let’s get the best score for the CTCF motif for each sequence.
In [25]:
scores = [r[0] for r in s.best_score("Gm12878.CTCF.top500.w200.fa")]
print("{}\t{:.2f}\t{:.2f}\t{:.2f}".format(
len(scores),
np.mean(scores),
np.min(scores),
np.max(scores)
))
In many cases you’ll want to set a threshold. In this example we’ll use a 1% FPR threshold, based on scanning randomly selected sequences from the hg38 genome. The first time you run this, it will take a while. However, the tresholds will be cached. This means that for the same combination of motifs and genome, the previously generated threshold will be used.
In [26]:
# Set a 1% FPR threshold based on random hg38 sequence
s.set_genome("hg38")
s.set_threshold(fpr=0.01)
# get the number of sequences with at least one match
counts = [n[0] for n in s.count("Gm12878.CTCF.top500.w200.fa", nreport=1)]
print(counts[:10])
In [27]:
# or the grand total of number of sequences with 1 match
print(s.total_count("Gm12878.CTCF.top500.w200.fa", nreport=1))
In [28]:
# Scanner.scan() just gives all information
seqs = Fasta("Gm12878.CTCF.top500.w200.fa")[:10]
for i,result in enumerate(s.scan(seqs)):
seqname = seqs.ids[i]
for m,matches in enumerate(result):
motif = motifs[m]
for score, pos, strand in matches:
print(seqname, motif, score, pos, strand)
In [29]:
from gimmemotifs.denovo import gimme_motifs
peaks = "Gm12878.CTCF.top500.w200.fa"
outdir = "CTCF.gimme"
params = {
"tools": "Homer,BioProspector",
"genome": "hg38",
}
motifs = gimme_motifs(peaks, outdir, params=params)
This will basically run the same pipeline as the gimme motifs
command. All output files will be stored in outdir and gimme_motifs
returns a list
of Motif
instances. If you only need the motifs but not the graphical report, you can decide to skip it by setting create_report
to False
. Additionally, you can choose to skip clustering (cluster=False
) or to skip calculation of significance (filter_significant=False
). For instance, the following command will only predict motifs and cluster them.
In [30]:
motifs = gimme_motifs(peaks, outdir,
params=params, filter_significant=False, create_report=False)
All parameters for motif finding are set by the params
argument
Although the gimme_motifs
function is probably the easiest way to run the de novo
finding tools, you can also run any of the tools directly. In this case you would also have to supply the background file if the specific tool requires it.
In [31]:
from gimmemotifs.tools import get_tool
from gimmemotifs.background import MatchedGcFasta
m = get_tool("homer") # tool name is case-insensitive
# Create a background fasta file with a similar GC%
fa = MatchedGcFasta("TAp73alpha.fa", number=1000)
fa.writefasta("bg.fa")
# Run motif prediction
params = {
"background": "bg.fa",
"width": "20",
"number": 5,
}
motifs, stdout, stderr = m.run("TAp73alpha.fa", params=params)
print(motifs[0].to_consensus())
In [32]:
from gimmemotifs.background import MatchedGcFasta
from gimmemotifs.fasta import Fasta
from gimmemotifs.stats import calc_stats
from gimmemotifs.motif import default_motifs
sample = "TAp73alpha.fa"
bg = MatchedGcFasta(sample, genome="hg19", number=1000)
motifs = [m for m in default_motifs() if any(f in m.factors['direct'] for f in ["TP53", "TP63", "TP73"])]
stats = calc_stats(motifs, sample, bg)
print("Stats for", motifs[0])
for k, v in stats[str(motifs[0])].items():
print(k,v)
print()
best_motif = sorted(motifs, key=lambda x: stats[str(x)]["recall_at_fdr"])[-1]
print("Best motif (recall at 10% FDR):", best_motif)
A lot of statistics are generated and you will not always need all of them. You can choose one or more specific metrics with the additional stats
argument.
In [33]:
metrics = ["roc_auc", "recall_at_fdr"]
stats = calc_stats(motifs, sample, bg, stats=metrics)
for metric in metrics:
for motif in motifs:
print("{}\t{}\t{:.2f}".format(
motif.id, metric, stats[str(motif)][metric]
))
In [34]:
from gimmemotifs.comparison import MotifComparer
from gimmemotifs.motif import motif_from_consensus
from gimmemotifs.motif import read_motifs
Compare two motifs
In [36]:
m1 = motif_from_consensus("RRRCATGYYY")
m2 = motif_from_consensus("TCRTGT")
mc = MotifComparer()
score, pos, orient = mc.compare_motifs(m1, m2)
if orient == -1:
m2 = m2.rc()
pad1, pad2 = "", ""
if pos < 0:
pad1 = " " * -pos
elif pos > 0:
pad2 =" " * pos
print(pad1 + m1.to_consensus())
print(pad2 + m2.to_consensus())
Find closest match in a motif database
In [37]:
motifs = [
motif_from_consensus("GATA"),
motif_from_consensus("NTATAWA"),
motif_from_consensus("ACGCG"),
]
mc = MotifComparer()
results = mc.get_closest_match(motifs, dbmotifs=read_motifs("HOMER"), metric="seqcor")
# Load motifs
db = read_motifs("HOMER", as_dict=True)
for motif in motifs:
match, scores = results[motif.id]
print("{}: {} - {:.3f}".format(motif.id, match, scores[0]))
dbmotif = db[match]
orient = scores[2]
if orient == -1:
dbmotif = dbmotif.rc()
padm, padd = 0, 0
if scores[1] < 0:
padm = -scores[1]
elif scores[1] > 0:
padd = scores[1]
print(" " * padm + motif.to_consensus())
print(" " * padd + dbmotif.to_consensus())
print()
In [ ]:
In [ ]: