In [90]:
from __future__ import print_function
from collections import defaultdict
from random import random
import pysam
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pysam

In [88]:
class Edge:
    #directed edge from->to direction
    def __init__(self, fid, tid):
        self.f = fid
        self.t = tid

class State:
    def __init__(self):
        self.N = 0
        self.path = []
        
    def __init__(self, N, collapse_path):
        self.N = N
        self.path = collapse_path

def get_collapse_prob(f1, f2):
    #Tom is working on better estimation
    return (f2-f1)/float(f2)

def coin_toss(p):
    #returns true if within (0-p]
    rn = random()
    return True if rn <= p else False

def get_freq(node, path, freqDist):
    # traverse the path and get frequency of a node 
    # after collapsing
    count = freqDist[node]
    for edge in path:
        from_node = edge.f
        to_node = edge.t
        if to_node == node:
            count += freqDist[from_node]
    return count

def check_collapse_possibility(state, query_edge):
    # checks if it is possible to collapse an edge
    collapsed_nodes = set([])
    for edge in state.path:
        collapsed_nodes.add(edge.f)
        
    if query_edge.f in collapsed_nodes:
        return False
    else:
        return True

def generate_next(curr_state, edge, freqDist):
    #walk through the path and get the frequqncy of the nodes
    from_edge_freq = get_freq(edge.f, curr_state.path, freqDist)
    to_edge_freq = get_freq(edge.t, curr_state.path, freqDist)
    
    #given updated frequqncies after collapsing -> get probability to collapse
    collapse_prob = get_collapse_prob(from_edge_freq, to_edge_freq)
    
    #toss a biased coin
    is_collapsing = coin_toss(collapse_prob)
    
    if (is_collapsing):
        is_possible = check_collapse_possibility(curr_state, edge)
        if is_possible and edge not in curr_state.path:
            curr_state.N -= 1
            curr_state.path.add(edge)
    else:
        if edge in curr_state.path:
            curr_state.N += 1
            curr_state.path.remove(edge)
    return curr_state

def parse_graph(fname):
    #structure of graph file
    #number of nodes
    #frequency of each node in new line
    #one edge each line: <from>,<to> directional edge
    freqDist = [None]
    adj_matrix = defaultdict(set)
    edges = []
    with open(fname) as gf:
        N = int(gf.readline().strip())
        print("Total {} nodes found".format(N))
        for n in range(N):
            freqDist.append(int(gf.readline()))
        print ("Done importing frequency")
        for edge in gf:
            f,t = [int(n) for n in edge.strip().split(",")]
            adj_matrix[f].add(t)
            edges.append(Edge(f, t))
        print ("Done importing edges")
        return (adj_matrix, freqDist, edges)

def print_state(state):
    print ("no of molecules: {}".format(state.N))
    print ("Path Collapsed:")
    for edge in state.path:
        print ("({},{}),".format(edge.f, edge.t))
        
def get_dist(fname, num_gibbs, N):
    adj_matrix, freqDist, edges = parse_graph(fname)
    molecule_dist = defaultdict(int)

    #start from all separated as inital state
    #According to MCMC shouldn't matter where you start
    curr_state = State(N, set([]))
    for x in range(num_gibbs):
        #one gibbs cycle
        for edge in edges:
            curr_state = generate_next(curr_state, edge, freqDist)
        num_molecules = curr_state.N
        molecule_dist[num_molecules] += 1

    return molecule_dist

# Dumped all reads mapping to ENSG00000248527.1 gene
# Create the graph and run mcmc
def parse_sam(fname):
    umi_dist = defaultdict(lambda : defaultdict(int))
    samfile = pysam.AlignmentFile(fname, 'rb')
    count = 0
    for read in samfile:
        count += 1
        if count%10000000==0:
            print ("\r Done {}".format(count)),
        if read.has_tag('XT'):
            toks = read.qname.strip().split('_')
            umi = toks[-1]
            bc = toks[-2]
            gene = read.get_tag('XT').strip()
            umi_dist[gene][(bc,umi)] += 1
    return umi_dist

In [30]:
#100, 89, 95
get_dist("./graph.txt", 3000, 3)


Total 5 nodes found
Done importing frequency
Done importing edges
Out[30]:
defaultdict(int, {0: 741, 1: 1288, 2: 854, 3: 117})

In [31]:
#100, 45, 40
get_dist("./graph.txt", 3000, 3)


Total 5 nodes found
Done importing frequency
Done importing edges
Out[31]:
defaultdict(int, {0: 718, 1: 1257, 2: 889, 3: 136})

In [32]:
get_dist("./graph.txt", 500, 5)


Total 5 nodes found
Done importing frequency
Done importing edges
Out[32]:
defaultdict(int, {2: 136, 3: 200, 4: 139, 5: 25})

In [ ]:


In [86]:
umi_dist = parse_sam("/mnt/scratch5/avi/alevin/data/10x/mohu/92p/out/Aligned.sortedByCoord.out.bam.featureCounts.bam")


 Done 1000000
 Done 2000000
 Done 3000000
 Done 4000000
 Done 5000000
 Done 6000000
 Done 7000000
 Done 8000000
 Done 9000000
 Done 10000000
 Done 11000000
 Done 12000000
 Done 13000000
 Done 14000000
 Done 15000000
 Done 16000000
 Done 17000000
 Done 18000000
 Done 19000000
 Done 20000000
 Done 21000000
 Done 22000000
 Done 23000000
 Done 24000000
 Done 25000000
 Done 26000000
 Done 27000000
 Done 28000000
 Done 29000000
 Done 30000000
 Done 31000000
 Done 32000000
 Done 33000000
 Done 34000000
 Done 35000000
 Done 36000000
 Done 37000000
 Done 38000000
 Done 39000000
 Done 40000000
 Done 41000000
 Done 42000000
 Done 43000000

In [87]:
ha = []
for _,k in umi_dist.items():
    if len(k)!=0:
        ha += k.values()

In [81]:



Out[81]:
56214427

In [85]:
len(ha)
plt.hist(ha, bins=100)


Out[85]:
(array([  4.44239270e+07,   7.61748500e+06,   2.24618300e+06,
          9.22031000e+05,   4.60331000e+05,   2.71935000e+05,
          1.89623000e+05,   5.56990000e+04,   1.97380000e+04,
          5.76900000e+03,   1.35200000e+03,   2.65000000e+02,
          5.90000000e+01,   1.40000000e+01,   5.00000000e+00,
          2.00000000e+00,   2.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   2.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.00000000e+00]),
 array([   1.  ,    4.15,    7.3 ,   10.45,   13.6 ,   16.75,   19.9 ,
          23.05,   26.2 ,   29.35,   32.5 ,   35.65,   38.8 ,   41.95,
          45.1 ,   48.25,   51.4 ,   54.55,   57.7 ,   60.85,   64.  ,
          67.15,   70.3 ,   73.45,   76.6 ,   79.75,   82.9 ,   86.05,
          89.2 ,   92.35,   95.5 ,   98.65,  101.8 ,  104.95,  108.1 ,
         111.25,  114.4 ,  117.55,  120.7 ,  123.85,  127.  ,  130.15,
         133.3 ,  136.45,  139.6 ,  142.75,  145.9 ,  149.05,  152.2 ,
         155.35,  158.5 ,  161.65,  164.8 ,  167.95,  171.1 ,  174.25,
         177.4 ,  180.55,  183.7 ,  186.85,  190.  ,  193.15,  196.3 ,
         199.45,  202.6 ,  205.75,  208.9 ,  212.05,  215.2 ,  218.35,
         221.5 ,  224.65,  227.8 ,  230.95,  234.1 ,  237.25,  240.4 ,
         243.55,  246.7 ,  249.85,  253.  ,  256.15,  259.3 ,  262.45,
         265.6 ,  268.75,  271.9 ,  275.05,  278.2 ,  281.35,  284.5 ,
         287.65,  290.8 ,  293.95,  297.1 ,  300.25,  303.4 ,  306.55,
         309.7 ,  312.85,  316.  ]),
 <a list of 100 Patch objects>)

In [91]:
len(ha)
plt.hist(ha, bins=100)


Out[91]:
(array([  4.97706600e+06,   2.11818700e+06,   1.20503700e+06,
          0.00000000e+00,   7.70054000e+05,   5.29120000e+05,
          0.00000000e+00,   3.84941000e+05,   2.93194000e+05,
          0.00000000e+00,   2.28753000e+05,   1.81196000e+05,
          0.00000000e+00,   1.45507000e+05,   1.18788000e+05,
          0.00000000e+00,   9.68390000e+04,   7.90380000e+04,
          0.00000000e+00,   6.45040000e+04,   5.28460000e+04,
          0.00000000e+00,   4.31280000e+04,   3.55800000e+04,
          0.00000000e+00,   2.93900000e+04,   2.38140000e+04,
          0.00000000e+00,   1.89250000e+04,   1.51870000e+04,
          0.00000000e+00,   1.21400000e+04,   9.51600000e+03,
          0.00000000e+00,   7.57100000e+03,   5.87500000e+03,
          0.00000000e+00,   4.55400000e+03,   3.49400000e+03,
          0.00000000e+00,   2.75200000e+03,   2.08300000e+03,
          0.00000000e+00,   1.54100000e+03,   1.26900000e+03,
          0.00000000e+00,   9.05000000e+02,   6.85000000e+02,
          0.00000000e+00,   5.17000000e+02,   4.35000000e+02,
          0.00000000e+00,   3.35000000e+02,   2.33000000e+02,
          0.00000000e+00,   1.98000000e+02,   1.60000000e+02,
          0.00000000e+00,   1.29000000e+02,   1.02000000e+02,
          0.00000000e+00,   8.70000000e+01,   6.30000000e+01,
          0.00000000e+00,   6.20000000e+01,   4.80000000e+01,
          0.00000000e+00,   4.90000000e+01,   2.60000000e+01,
          0.00000000e+00,   2.60000000e+01,   2.20000000e+01,
          0.00000000e+00,   1.20000000e+01,   1.40000000e+01,
          0.00000000e+00,   1.50000000e+01,   1.40000000e+01,
          0.00000000e+00,   5.00000000e+00,   9.00000000e+00,
          0.00000000e+00,   2.00000000e+00,   3.00000000e+00,
          0.00000000e+00,   4.00000000e+00,   4.00000000e+00,
          0.00000000e+00,   3.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   2.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.00000000e+00]),
 array([  1.  ,   1.67,   2.34,   3.01,   3.68,   4.35,   5.02,   5.69,
          6.36,   7.03,   7.7 ,   8.37,   9.04,   9.71,  10.38,  11.05,
         11.72,  12.39,  13.06,  13.73,  14.4 ,  15.07,  15.74,  16.41,
         17.08,  17.75,  18.42,  19.09,  19.76,  20.43,  21.1 ,  21.77,
         22.44,  23.11,  23.78,  24.45,  25.12,  25.79,  26.46,  27.13,
         27.8 ,  28.47,  29.14,  29.81,  30.48,  31.15,  31.82,  32.49,
         33.16,  33.83,  34.5 ,  35.17,  35.84,  36.51,  37.18,  37.85,
         38.52,  39.19,  39.86,  40.53,  41.2 ,  41.87,  42.54,  43.21,
         43.88,  44.55,  45.22,  45.89,  46.56,  47.23,  47.9 ,  48.57,
         49.24,  49.91,  50.58,  51.25,  51.92,  52.59,  53.26,  53.93,
         54.6 ,  55.27,  55.94,  56.61,  57.28,  57.95,  58.62,  59.29,
         59.96,  60.63,  61.3 ,  61.97,  62.64,  63.31,  63.98,  64.65,
         65.32,  65.99,  66.66,  67.33,  68.  ]),
 <a list of 100 Patch objects>)

In [ ]: