In [12]:
import pandas as pd
import matplotlib.pyplot as plt, mpld3
from scipy import stats
import numpy as np
import csv
# poretools times pass > pass_times.txt
# poretools times fail > fail_times.txt
# poretools fastq --type 2D pass > pass_2d.txt
# poretools fastq --type 2D fail > fail_2d.txt
# awk 'NR == 1 || NR % 4 == 1' pass_2d.txt > 2d_strands.txt
# awk 'NR == 1 || NR % 4 == 1' fail_2d.txt >> 2d_strands.txt
In [3]:
# Get the file names of only the 2D reads
f = open('2d_strands.txt', 'r')
content = f.readlines()
twoDim_files = []
for line in content:
ix = str.rfind(line, 'twodirections:')
twoDim_files.append(line[ix+14:-1]) # Remove '\n' at end
In [17]:
pass_txt_file = r"pass_times.txt"
pass_csv_file = r"pass_times.csv"
in_txt = csv.reader(open(pass_txt_file, "rb"), delimiter = '\t')
out_csv = csv.writer(open(pass_csv_file, 'wb'))
out_csv.writerows(in_txt)
pass_times = pd.read_csv('pass_times.csv')
fail_txt_file = r"fail_times.txt"
fail_csv_file = r"fail_times.csv"
in_txt = csv.reader(open(fail_txt_file, "rb"), delimiter = '\t')
out_csv = csv.writer(open(fail_csv_file, 'wb'))
out_csv.writerows(in_txt)
fail_times = pd.read_csv('fail_times.csv')
fail_times = fail_times[fail_times['filename'].isin(twoDim_files)] #Only select 2D reads
pass_times = pass_times[pass_times['filename'].isin(twoDim_files)]
In [29]:
read_lengths = fail_times['read_length']
duration = fail_times['duration']
axes_range = [0, 1000, 0, 20000]
f, ax = plt.subplots()
plt.axis(axes_range)
ax.set_title('FAILED - sequence_length over time')
ax.set_ylabel('sequence length')
ax.set_xlabel('duration in pore')
ax.scatter(duration, read_lengths)
mpld3.display()
Out[29]:
In [30]:
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(duration, read_lengths)
print r_value
In [31]:
read_lengths = pass_times['read_length']
duration = pass_times['duration']
axes_range = [0, 700, 0, 75000]
f, ax = plt.subplots()
plt.axis(axes_range)
ax.set_title('PASSED - sequence_length over time')
ax.set_ylabel('sequence length')
ax.set_xlabel('duration in pore')
ax.scatter(duration, read_lengths)
mpld3.display()
Out[31]:
In [32]:
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(duration, read_lengths)
print r_value
In [ ]: