notebook.community

Edit and run



In [43]:

    
import pandas as pd
import matplotlib.pyplot as plt, mpld3
import scipy
from scipy import stats
import numpy as np
import csv

# poretools times pass > pass_times.txt
# poretools times fail > fail_times.txt
# poretools fastq --type 2D pass > pass_2d.txt
# poretools fastq --type 2D fail > fail_2d.txt
# awk 'NR == 1 || NR % 4 == 1' pass_2d.txt > 2d_strands.txt 
# awk 'NR == 1 || NR % 4 == 1' fail_2d.txt >> 2d_strands.txt



In [27]:

    
# Get the file names of only the 2D reads
f = open('2d_strands.txt', 'r')
content = f.readlines()
twoDim_files = []
for line in content:
    ix = str.rfind(line, 'twodirections:')
    twoDim_files.append(line[ix+14:-1]) # Remove '\n' at end



In [38]:

    
pass_txt_file = r"pass_times.txt"
pass_csv_file = r"pass_times.csv"
in_txt = csv.reader(open(pass_txt_file, "rb"), delimiter = '\t')
out_csv = csv.writer(open(pass_csv_file, 'wb'))
out_csv.writerows(in_txt)
pass_times = pd.read_csv('pass_times.csv')

fail_txt_file = r"fail_times.txt"
fail_csv_file = r"fail_times.csv"
in_txt = csv.reader(open(fail_txt_file, "rb"), delimiter = '\t')
out_csv = csv.writer(open(fail_csv_file, 'wb'))
out_csv.writerows(in_txt)
fail_times = pd.read_csv('fail_times.csv')

frames = [pass_times, fail_times]
times = pd.concat(frames)

times = times[times['filename'].isin(twoDim_files)] #Only select 2D reads



In [49]:

    
timestamp = times['unix_timestamp']
readlength = times['read_length']
axes_range = [1445650000, 1445800000, 0, 75000]

f, ax = plt.subplots()
plt.axis(axes_range)
ax.xaxis.set_ticks(np.arange(1445650000, 1445800001, 50000))

ax.set_title('sequence length versus timestamp of entry')
ax.set_ylabel('sequence length')
ax.set_xlabel('timestamp of entry')
ax.scatter(timestamp, readlength)

mpld3.display()









    Out[49]:



In [44]:

    
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(timestamp, readlength)
print r_value









    



0.0599966384681



In [ ]: