In [43]:
import pandas as pd
import matplotlib.pyplot as plt, mpld3
import scipy
from scipy import stats
import numpy as np
import csv
# poretools times pass > pass_times.txt
# poretools times fail > fail_times.txt
# poretools fastq --type 2D pass > pass_2d.txt
# poretools fastq --type 2D fail > fail_2d.txt
# awk 'NR == 1 || NR % 4 == 1' pass_2d.txt > 2d_strands.txt
# awk 'NR == 1 || NR % 4 == 1' fail_2d.txt >> 2d_strands.txt
In [27]:
# Collect the file names of the 2D reads listed in 2d_strands.txt.
# On each line, the name is taken to be everything after the last
# 'twodirections:' marker.
STRAND_MARKER = 'twodirections:'
twoDim_files = []
# The context manager closes the file even if reading fails part-way.
with open('2d_strands.txt', 'r') as strands_file:
    for line in strands_file:
        ix = line.rfind(STRAND_MARKER)
        if ix == -1:
            # No marker on this line: there is no file name to extract, and
            # slicing from a failed search would append a garbage fragment.
            continue
        # Strip only the line terminator, so a final line without a trailing
        # newline keeps its last character.
        twoDim_files.append(line[ix + len(STRAND_MARKER):].rstrip('\r\n'))
In [38]:
# Load the pass/fail timing tables and keep only the rows for 2D reads.
def _tsv_to_csv_frame(txt_path, csv_path):
    """Convert a tab-separated file to CSV and load the CSV as a DataFrame.

    Parameters
    ----------
    txt_path : str
        Path of the tab-separated input file.
    csv_path : str
        Path of the comma-separated file to write (overwritten if present).

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv(csv_path)`` after the conversion.
    """
    # Both handles are closed (and the output flushed) before pandas reads the
    # CSV back; otherwise buffered rows might not be on disk yet.
    # Binary modes are kept from the original, as the Python 2 csv module expects.
    with open(txt_path, "rb") as src, open(csv_path, "wb") as dst:
        csv.writer(dst).writerows(csv.reader(src, delimiter='\t'))
    return pd.read_csv(csv_path)


pass_txt_file = r"pass_times.txt"
pass_csv_file = r"pass_times.csv"
pass_times = _tsv_to_csv_frame(pass_txt_file, pass_csv_file)

fail_txt_file = r"fail_times.txt"
fail_csv_file = r"fail_times.csv"
fail_times = _tsv_to_csv_frame(fail_txt_file, fail_csv_file)

# Stack pass and fail rows, then keep only rows whose 'filename' is in
# twoDim_files (built in an earlier cell).
frames = [pass_times, fail_times]
all_times = pd.concat(frames)
times = all_times[all_times['filename'].isin(twoDim_files)]  # Only select 2D reads
In [49]:
# Scatter plot of read length against the timestamp column of the 2D reads.
timestamp = times['unix_timestamp']
readlength = times['read_length']

# Fixed plot window. The x bounds are named once so the axis limits and the
# tick positions are derived from the same numbers.
X_MIN, X_MAX, X_TICK_STEP = 1445650000, 1445800000, 50000
Y_MIN, Y_MAX = 0, 75000
axes_range = [X_MIN, X_MAX, Y_MIN, Y_MAX]

f, ax = plt.subplots()
ax.axis(axes_range)
# np.arange excludes its stop value; the +1 keeps X_MAX as the last tick.
ax.xaxis.set_ticks(np.arange(X_MIN, X_MAX + 1, X_TICK_STEP))
ax.set(
    title='sequence length versus timestamp of entry',
    xlabel='timestamp of entry',
    ylabel='sequence length',
)
ax.scatter(timestamp, readlength)
# Must stay the last expression so the interactive figure becomes the cell output.
mpld3.display()
Out[49]:
In [44]:
# Linear regression of read length on timestamp; only the correlation
# coefficient (r_value) is reported.
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(timestamp, readlength)
# Function-call form: prints the same text as the Python 2 statement for a
# single argument and is also valid Python 3.
print(r_value)
In [ ]: