In [90]:
from __future__ import print_function
from collections import defaultdict
from random import random
import pysam
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pysam
In [88]:
class Edge:
#directed edge from->to direction
def __init__(self, fid, tid):
self.f = fid
self.t = tid
class State:
def __init__(self):
self.N = 0
self.path = []
def __init__(self, N, collapse_path):
self.N = N
self.path = collapse_path
def get_collapse_prob(f1, f2):
#Tom is working on better estimation
return (f2-f1)/float(f2)
def coin_toss(p):
#returns true if within (0-p]
rn = random()
return True if rn <= p else False
def get_freq(node, path, freqDist):
# traverse the path and get frequency of a node
# after collapsing
count = freqDist[node]
for edge in path:
from_node = edge.f
to_node = edge.t
if to_node == node:
count += freqDist[from_node]
return count
def check_collapse_possibility(state, query_edge):
# checks if it is possible to collapse an edge
collapsed_nodes = set([])
for edge in state.path:
collapsed_nodes.add(edge.f)
if query_edge.f in collapsed_nodes:
return False
else:
return True
def generate_next(curr_state, edge, freqDist):
#walk through the path and get the frequqncy of the nodes
from_edge_freq = get_freq(edge.f, curr_state.path, freqDist)
to_edge_freq = get_freq(edge.t, curr_state.path, freqDist)
#given updated frequqncies after collapsing -> get probability to collapse
collapse_prob = get_collapse_prob(from_edge_freq, to_edge_freq)
#toss a biased coin
is_collapsing = coin_toss(collapse_prob)
if (is_collapsing):
is_possible = check_collapse_possibility(curr_state, edge)
if is_possible and edge not in curr_state.path:
curr_state.N -= 1
curr_state.path.add(edge)
else:
if edge in curr_state.path:
curr_state.N += 1
curr_state.path.remove(edge)
return curr_state
def parse_graph(fname):
#structure of graph file
#number of nodes
#frequency of each node in new line
#one edge each line: <from>,<to> directional edge
freqDist = [None]
adj_matrix = defaultdict(set)
edges = []
with open(fname) as gf:
N = int(gf.readline().strip())
print("Total {} nodes found".format(N))
for n in range(N):
freqDist.append(int(gf.readline()))
print ("Done importing frequency")
for edge in gf:
f,t = [int(n) for n in edge.strip().split(",")]
adj_matrix[f].add(t)
edges.append(Edge(f, t))
print ("Done importing edges")
return (adj_matrix, freqDist, edges)
def print_state(state):
print ("no of molecules: {}".format(state.N))
print ("Path Collapsed:")
for edge in state.path:
print ("({},{}),".format(edge.f, edge.t))
def get_dist(fname, num_gibbs, N):
adj_matrix, freqDist, edges = parse_graph(fname)
molecule_dist = defaultdict(int)
#start from all separated as inital state
#According to MCMC shouldn't matter where you start
curr_state = State(N, set([]))
for x in range(num_gibbs):
#one gibbs cycle
for edge in edges:
curr_state = generate_next(curr_state, edge, freqDist)
num_molecules = curr_state.N
molecule_dist[num_molecules] += 1
return molecule_dist
# Dumped all reads mapping to ENSG00000248527.1 gene
# Create the graph and run mcmc
def parse_sam(fname):
umi_dist = defaultdict(lambda : defaultdict(int))
samfile = pysam.AlignmentFile(fname, 'rb')
count = 0
for read in samfile:
count += 1
if count%10000000==0:
print ("\r Done {}".format(count)),
if read.has_tag('XT'):
toks = read.qname.strip().split('_')
umi = toks[-1]
bc = toks[-2]
gene = read.get_tag('XT').strip()
umi_dist[gene][(bc,umi)] += 1
return umi_dist
In [30]:
#100, 89, 95
get_dist("./graph.txt", 3000, 3)
Out[30]:
In [31]:
#100, 45, 40
get_dist("./graph.txt", 3000, 3)
Out[31]:
In [32]:
get_dist("./graph.txt", 500, 5)
Out[32]:
In [ ]:
In [86]:
umi_dist = parse_sam("/mnt/scratch5/avi/alevin/data/10x/mohu/92p/out/Aligned.sortedByCoord.out.bam.featureCounts.bam")
In [87]:
ha = []
for _,k in umi_dist.items():
if len(k)!=0:
ha += k.values()
In [81]:
Out[81]:
In [85]:
len(ha)
plt.hist(ha, bins=100)
Out[85]:
In [91]:
len(ha)
plt.hist(ha, bins=100)
Out[91]:
In [ ]: