Tethered Bead Data Analysis

Import necessary packages.


In [1]:
from __future__ import division, unicode_literals, print_function
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame, Series, HDFStore
import beadpy
import parameters as params

Set plotting preferences.


In [2]:
%matplotlib notebook
mpl.rc('figure',  figsize=(8, 5))
mpl.rc('image', cmap='gray')
#mpl.rc('figure', edgecolor='black')

View parameter settings.


In [3]:
fname = 'parameters.py'
with open(fname, 'r') as fin:
    print(fin.read())


sequencelocation = 'G:/Flynn/20170307LCoreChewPreassembleNoWash/13-24-53.804.seq'
framecut = 10
method = 'parallel' # Set method to 'streaming' or 'parallel'. If parallel, remember to start the ipcluster.
exposuretime = 1 # Seconds per frame.
ntmicron = 2805 # Nucleotides per micron (use 2805 for 2.8 micron beads at 15 ul/min in the standard flow cell).
mintrajlength = 500 # Minimum trajectory duration, in frames.
minrate = 1 # Minimum rate of segments to display in the segments plot; also used for event filtering.
maxrate = 25 # Maximum rate to display in the segments plot; also used for event filtering and the rate plot.
mindisplacement = 1000 # Minimum displacement for event filtering.
starttime = 200 # Minimum start time (seconds) for event filtering.
endtime = 15000 # Maximum end time (seconds) for event filtering (normally just leave as a high number).
coarsesigma = 500 # Coarse-grained sigma value for event identification and the scatter plot.
finesigma = 500 # Fine-grained sigma value for the rate distribution.
ymaxscatter = 15000 # y max value for the segment scatter plot.
yminscatter = -6000 # y min value for the segment scatter plot.
ratebins = 10 # Number of bins in the rate histogram.
binning = 1
if binning == 2:
    ymincrop = 1 #150
    ymaxcrop = 1950
    xmincrop = 1
    xmaxcrop = 3276
    beadsize = 9
    linklength = 1
    micronpixel = 1.6 # Microns per pixel.
elif binning == 1:
    ymincrop = 1
    ymaxcrop = 3790
    xmincrop = 1
    xmaxcrop = 6568
    beadsize = 15
    linklength = 2
    micronpixel = 0.8 # Microns per pixel.

ntconversion = ntmicron * micronpixel # Nucleotides per pixel.

Set up the parallel processing client, if needed.


In [8]:
if params.method == 'parallel':
    from ipyparallel import Client
    client = Client()
    view = client.load_balanced_view()
    # %%px is a cell magic and must start its own cell, so it cannot sit
    # inside this if block; use execute() to import trackpy on the engines.
    client[:].execute('import trackpy as tp', block=True)
    print(len(client), 'engines available')

Open the image sequence, bandpass filter and crop each image (the processing is not applied until a frame is accessed).


In [12]:
import pims
import trackpy as tp

def crop(img):
    # Bandpass filter the image, then crop it to the region of interest.
    img = tp.bandpass(img, lshort=1, llong=4)
    return img[params.ymincrop:params.ymaxcrop, params.xmincrop:params.xmaxcrop]

frames = pims.open(params.sequencelocation, process_func=crop)

In [13]:
frames[0]


Out[13]:
[image: the first cropped, bandpass-filtered frame]
Look for the beads in the first frame.


In [14]:
f = tp.locate(frames[0], 15,
              invert=False,
              minmass=800,
              separation=5,
              max_iterations=5,
              characterize=False,
              engine='numba',
              preprocess=False)

In [15]:
plt.figure()  # make a new figure
tp.annotate(f, frames[0]);
#Zoom in and check that it looks reasonable.



In [16]:
fig, ax = plt.subplots()
ax.hist(f['mass'], bins=50)

# Optionally, label the axes.
ax.set(xlabel='mass', ylabel='count');



In [17]:
len(f)


Out[17]:
6484

Inspect the above histogram and adjust the minimum mass value as desired for the bead locating step below.

Locate all beads in all frames, in either parallel or streaming mode (the latter processes frames one at a time on a single core, writing the features to an HDF5 file).


In [ ]:
if params.method == 'streaming':
    # Single core: locate features frame by frame, appending each batch to HDF5.
    with tp.PandasHDFStore('results.h5') as s:
        for image in frames[:-params.framecut]:
            features = tp.locate(image, 15,
                                 invert=False,
                                 minmass=800,
                                 separation=5,
                                 max_iterations=5,
                                 characterize=False,
                                 engine='numba',
                                 preprocess=False)
            s.put(features)
elif params.method == 'parallel':
    # __import__ makes the function self-contained on the remote engines.
    curried_locate = lambda image: __import__('trackpy').locate(image, 15,
                                 invert=False,
                                 minmass=800,
                                 separation=5,
                                 max_iterations=5,
                                 characterize=False,
                                 engine='numba',
                                 preprocess=False)
    amr = view.map_async(curried_locate, frames[:-params.framecut])
    amr.wait_interactive()
    results = pd.concat(amr.get())
    with tp.PandasHDFStoreBig('results.h5') as s:
        for frameno in results.frame.unique():
            s.put(results[results.frame == frameno])
    del results

Link the beads into trajectories.


In [ ]:
with tp.PandasHDFStoreBig('results.h5') as s:
    for linked in tp.link_df_iter(s, 1, memory=1, neighbor_strategy='KDTree'):
        s.put(linked)

Read the HDF5 file into memory as a pandas DataFrame.


In [21]:
with tp.PandasHDFStoreBig('results.h5') as s:
    results = s.dump()

Optional: Flip the coordinates in the 'x' or 'y' direction:


In [ ]:
#results = beadpy.flip_coordinates(results, 'x')

Remove brief trajectories.


In [22]:
results = tp.filter_stubs(results, 150)

Subtract drift. This works by taking all trajectories that endure from start to finish, centring them at zero, and then taking the median x and y positions at each timepoint. These medians are boxcar-smoothed to create a drift trajectory, which is subsequently subtracted from every trajectory in the results table.
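
For orientation, here is a minimal sketch of that scheme (an illustration only, not the beadpy implementation; the 'particle' column name, mean-centring, and the smoothing window are assumptions):


In [ ]:
def subtract_drift_sketch(results, window=10):
    # Keep only trajectories that are present in every frame.
    nframes = results.frame.nunique()
    counts = results.groupby('particle').frame.count()
    enduring = results[results.particle.isin(counts[counts == nframes].index)].copy()
    # Centre each enduring trajectory at zero.
    for col in ('x', 'y'):
        enduring[col] -= enduring.groupby('particle')[col].transform('mean')
    # Median position per timepoint, boxcar smoothed -> drift trajectory.
    drift = enduring.groupby('frame')[['x', 'y']].median()
    drift = drift.rolling(window, center=True, min_periods=1).mean()
    # Subtract the drift from every trajectory.
    out = results.join(drift, on='frame', rsuffix='_drift')
    out['x'] -= out['x_drift']
    out['y'] -= out['y_drift']
    return out.drop(['x_drift', 'y_drift'], axis=1)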


In [24]:
results = beadpy.drift_subtractor(results, params.exposuretime)


Convert units, remove spurious trajectories (those that do not endure past a certain timepoint), baseline each trajectory, and renumber the trajectories.
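
Roughly, the unit conversion amounts to the following (a sketch under assumed column names; beadpy's exact behaviour may differ):


In [ ]:
def convert_units_sketch(results, exposuretime, ntconversion, micronpixel):
    out = results.copy()
    out['time'] = out['frame'] * exposuretime      # frames -> seconds
    out['micron'] = out['x'] * micronpixel         # pixels -> microns
    # ntconversion = ntmicron * micronpixel, i.e. nucleotides per pixel.
    out['nucleotides'] = out['x'] * ntconversion   # pixels -> nucleotides
    return out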


In [25]:
results = beadpy.unit_converter(results, params.exposuretime, params.ntconversion, params.micronpixel)

In [26]:
results = beadpy.spurious_removal(results)

In [27]:
results = beadpy.baseline(results, params.exposuretime)

In [28]:
results = beadpy.trajectory_renumber(results)

Save the results table.


In [29]:
results.to_csv('pythonresults.csv', index = False, float_format='%.4f')

Changepoint Analysis

This uses the segment_finder function; an example call with the 'auto' method follows the option list. The options are:

  • datatable

  • xcolumn, default = 'time'

  • ycolumn, default = 'nucleotides'

  • indexcolumn, default = 'trajectory'

  • method, options = 'global' or 'auto', default = 'auto'. 'global' uses a single sigma value, specified in sigma; 'auto' uses per-trajectory sigma values, calculated within the sigma_start to sigma_end window.

  • sigma, default = 500. The sigma value used if method = 'global'.

  • sigma_start, default = 0. Start time (in xcolumn units) for the sigma calculation if method = 'auto'.

  • sigma_end, default = 100. End time (in xcolumn units) for the sigma calculation if method = 'auto'.

  • traj, default = 'none'. Specify an integer to analyse only a single trajectory.
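
For example, a per-trajectory sigma run would look like this (the argument values are illustrative):


In [ ]:
#segments = beadpy.segment_finder(results, method = 'auto', sigma_start = 0, sigma_end = 100)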


In [6]:
segments = beadpy.segment_finder(results, sigma = params.coarsesigma)

Plot the segments. The inputs are:

1) segments table, 2) maximum rate cutoff, 3) y-axis minimum, 4) y-axis maximum, 5) legend location, 6) scaling factor for the size of the scatter points (adjust according to the rates of the events).


In [9]:
beadpy.segmentplotter(segments, params.maxrate, -2000, 7500, 1, 1)


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0xc3157b8>

Now filter for trajectories of interest. The best way to do this is to inspect the above plot and adjust the parameters below as desired. The parameters are: 1) results table, 2) segments table, 3) minimum rate, 4) maximum rate, 5) minimum displacement (in nt), 6) minimum start time, 7) maximum end time.


In [38]:
filtresults, filtsegments = beadpy.filterer(results, segments, params.minrate, 
                                            params.maxrate, 
                                            params.mindisplacement, 
                                            params.starttime, 
                                            params.endtime)

In [40]:
#Number of trajectories in the filtered data set:
filtresults.trajectory.nunique()


Out[40]:
584

It is usually best at this point to manually curate the filtered results table using the ImageJ multiplotter, saving the curated table in a file called 'curatedresults.csv'.


In [42]:
curatedresults = pd.read_csv("curatedresults.csv")
curatedresults.trajectory.nunique() #Number of unique trajectories in the manually curated data set:


Out[42]:
243

Further filter the filtered segments so that only those in the curated data set are retained.


In [12]:
curatedsegments = filtsegments[filtsegments.trajectory.isin(curatedresults.trajectory)]

Now do a fine-grained changepoint analysis of the automatically determined event regions of the filtered data set; this will be used to generate the rate histogram. It uses per-trajectory sigma values calculated from the initial event-free period. sigma_start and sigma_end should be specified in seconds.
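
As a rough sketch, the per-trajectory sigma amounts to the noise level of that event-free window (an illustration, not the beadpy code):


In [ ]:
def estimate_sigma(traj, sigma_start = 10, sigma_end = 100):
    # Standard deviation of the signal in the initial event-free window.
    window = traj[(traj.time >= sigma_start) & (traj.time <= sigma_end)]
    return window.nucleotides.std()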


In [13]:
segmentsfine = beadpy.ratefinder_autosigma(curatedresults, curatedsegments, sigma_start = 10, sigma_end = 100)

Alternatively, a global sigma value can be applied using the following function:


In [10]:
#segmentsfine = beadpy.ratefinder(curatedresults,curatedsegments,500)

Rate Histogram

The last option sets the weighting; a sketch of the 'displacement' weighting follows the list. It can be:

  • 'duration' to weight segments by duration in time
  • 'displacement' to weight segments by displacement in nucleotides
  • 'fastest' to select only the fastest segment from each event (with no weighting).
  • 'longest' to select only the longest segment from each event (with no weighting).
  • 'none' to include all segments, with no weighting.
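
As promised, a sketch of what the 'displacement' weighting amounts to (not the beadpy internals; the axis labels and the use of the absolute displacement are assumptions):


In [ ]:
# Each segment counts in its rate bin in proportion to its displacement.
seg = segmentsfine[(segmentsfine.rate >= params.minrate) & (segmentsfine.rate <= params.maxrate)]
plt.figure()
plt.hist(seg.rate, bins = params.ratebins, range = (params.minrate, params.maxrate),
         weights = seg.displacement.abs())
plt.xlabel('rate (nt/s)')
plt.ylabel('weighted count');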

In [14]:
beadpy.ratehist(segmentsfine, params.minrate, params.maxrate, params.ratebins, 'displacement')


Plot an example trajectory

Input:

  • resultstable
  • trajectory number
  • method: Either 'nofit' or a tuple.
    • The first option in the tuple can be either 'global' or 'auto' to choose the sigma value.
    • The second option can be: 'whole' to fit the whole trajectory, 'table' to provide a filtered segments table which defines the start and end times, or 'region' to specify a tuple which defines the event start and end times.
  • sigma: Used for the 'global' sigma method, ignored if using the 'auto' method.
  • eventregion: set as a tuple (e.g. (200.5, 306.0)) if using the 'region' option, ignored otherwise.
  • segmenttable: specify a filtered segment table if using the 'table' option to define the event region, ignored otherwise.
  • sigma_start: start time for the sigma calculation if using 'auto'
  • sigma_end: end time for the sigma calculation if using 'auto'

In [15]:
exampletraj = curatedresults.trajectory.unique()[4]
#exampletraj = 68
exampletrajseg = beadpy.trajectory_plotter(curatedresults, exampletraj,
                                           sigma = 500,
                                           method = ('auto', 'whole'),
                                           sigma_start = 10, sigma_end = 500,
                                           eventregion = (200, 500),
                                           segmenttable = 0)
exampletrajseg


Out[15]:
rate intercept x1 x2 y1 y2 displacement duration trajectory
0 -1.3 -126.0 0.00 22.25 -126.1 -154.4 -28.3 22.25 39.0
1 27.4 -549.0 22.25 26.00 61.9 164.8 102.9 3.75 39.0
2 -16.4 361.0 26.00 34.50 -65.8 -205.5 -139.7 8.50 39.0
3 76.8 -2905.0 34.50 40.25 -253.9 187.9 441.8 5.75 39.0
4 -48.1 2158.0 40.25 50.00 222.4 -246.6 -469.0 9.75 39.0
5 7.4 -475.0 50.00 80.00 -102.4 121.0 223.4 30.00 39.0
6 -82.1 6729.0 80.00 82.75 160.6 -65.2 -225.8 2.75 39.0
7 18.8 -1668.0 82.75 96.25 -109.0 145.4 254.3 13.50 39.0
8 2.1 -183.0 96.25 146.25 23.3 130.6 107.3 50.00 39.0
9 3.0 -406.0 146.25 183.50 32.6 144.3 111.7 37.25 39.0
10 -5.9 1306.0 183.50 192.50 218.2 164.8 -53.4 9.00 39.0
11 12.2 -2288.0 192.50 206.25 69.8 238.2 168.4 13.75 39.0
12 0.6 -83.0 206.25 260.25 32.9 63.2 30.3 54.00 39.0
13 -3.3 991.0 260.25 300.25 129.6 -2.7 -132.4 40.00 39.0
14 1.7 -394.0 300.25 358.00 109.0 205.8 96.8 57.75 39.0
15 2.1 -668.0 358.00 413.50 93.4 211.5 118.1 55.50 39.0
16 -0.8 471.0 413.50 455.25 159.0 127.5 -31.5 41.75 39.0
17 -0.9 637.0 455.25 521.25 220.7 160.4 -60.3 66.00 39.0
18 -5.3 3038.0 521.25 543.25 293.5 177.6 -115.8 22.00 39.0
19 9.8 -5214.0 543.25 555.50 101.2 221.0 119.8 12.25 39.0
20 189.9 -105351.0 555.50 568.25 122.9 2543.8 2420.9 12.75 39.0
21 256.9 -143471.0 568.25 573.25 2500.2 3784.6 1284.4 5.00 39.0
22 113.8 -61394.0 573.25 580.25 3845.9 4642.5 796.7 7.00 39.0
23 230.9 -129371.0 580.25 588.00 4621.3 6410.9 1789.6 7.75 39.0
24 -11.5 13365.0 588.00 591.25 6594.2 6556.7 -37.4 3.25 39.0
25 87.4 -45257.0 591.25 596.50 6419.2 6878.1 458.9 5.25 39.0
26 8.8 1299.0 596.50 603.50 6524.7 6586.0 61.3 7.00 39.0
27 -95.0 64077.0 603.50 605.25 6752.4 6586.2 -166.2 1.75 39.0
28 72.9 -37809.0 605.25 609.75 6306.3 6634.3 328.0 4.50 39.0
29 10.6 85.0 609.75 618.75 6519.7 6614.7 95.0 9.00 39.0
30 -6.6 10833.0 618.75 665.25 6730.2 6421.9 -308.3 46.50 39.0
31 53.4 -29262.0 665.25 673.75 6237.6 6691.2 453.6 8.50 39.0
32 42.1 -21928.0 673.75 679.50 6461.8 6704.1 242.3 5.75 39.0
33 -220.9 156883.0 679.50 681.50 6753.1 6311.2 -441.9 2.00 39.0
34 1.2 5612.0 681.50 686.00 6422.0 6427.3 5.3 4.50 39.0
35 -3.2 8841.0 686.00 700.25 6628.6 6582.6 -46.0 14.25 39.0
36 4.3 3714.0 700.25 710.50 6758.1 6802.7 44.6 10.25 39.0
37 6.6 1941.0 710.50 724.00 6628.0 6717.0 89.0 13.50 39.0

Processivity Histogram

Input is: 1) segments table (should contain segments from event portions only), 2) minimum processivity, 3) maximum processivity, 4) number of bins, 5) number of bins to skip on the left in the fit, 6) fitting guess in the form [A, t] (optional).
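
The [A, t] guess suggests a single-exponential fit; below is a sketch of that kind of fit (the exact beadpy model, and summing displacement per trajectory as the processivity, are assumptions here):


In [ ]:
import numpy as np
from scipy.optimize import curve_fit

# Assumed definition: processivity = total displacement of each event.
proc = segmentsfine.groupby('trajectory').displacement.sum()
counts, edges = np.histogram(proc, bins = params.ratebins,
                             range = (0, curatedsegments.displacement.max() * 1.5))
centres = 0.5 * (edges[:-1] + edges[1:])
skip = 1  # number of left-hand bins excluded from the fit
expfunc = lambda x, A, t: A * np.exp(-x / t)
popt, pcov = curve_fit(expfunc, centres[skip:], counts[skip:],
                       p0 = [counts.max(), centres.mean()])
print('fitted decay constant t:', popt[1])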


In [16]:
beadpy.processivity(segmentsfine, 0, curatedsegments.displacement.max()*1.5, params.ratebins, 1)


<matplotlib.figure.Figure at 0xe2d4550>

Optional: Reload saved tables


In [5]:
results = pd.read_csv("pythonresults.csv")

In [3]:
#segments = pd.read_csv("pythonsegments_sigma500.csv") #change sigma value as necessary

In [4]:
#segmentsfine = pd.read_csv("pythonsegments_sigma500.csv") #change sigma value as necessary

In [3]:
filtresults = pd.read_csv("filtresults.csv")

In [3]:
curatedresults = pd.read_csv("curatedresults.csv")

In [6]:
filtsegments = pd.read_csv("filtsegments.csv")

In [7]:
segments = beadpy.segment_finder(curatedresults, sigma = 700)

In [7]:
curatedresults = pd.read_csv("sampledata.csv")