In [213]:
import os
import time
import pandas as pd
def get_timestamps(directory_path):
    """Map every file name under ``directory_path`` to its ctime.

    The directory tree is walked recursively with ``os.walk``. Keys are bare
    file names (no directory part), so if the same name occurs in more than
    one sub-directory, the entry visited last by the walk wins.

    Note: ``os.path.getctime`` is the creation time on Windows, but on
    Unix-like systems it is the time of the last metadata change.

    Args:
        directory_path: root directory to scan.

    Returns:
        dict mapping file name -> ``os.path.getctime`` value (float seconds
        since the epoch). Empty when the walk yields no files.
    """
    timestamp_dic = {}
    for root, _, filenames in os.walk(directory_path):
        for filename in filenames:
            # os.path.join instead of manual '/' concatenation.
            file_path = os.path.join(root, filename)
            timestamp_dic[filename] = os.path.getctime(file_path)
    return timestamp_dic
# Directories to scan: the 'fail' and 'pass' download folders.
fail_path = '/Volumes/Seagate Expansion Drive/hackathon2_data/downloads/downloads/fail'
pass_path = '/Volumes/Seagate Expansion Drive/hackathon2_data/downloads/downloads/pass'

# File name -> ctime lookup for each folder.
fail_timestamps_dic = get_timestamps(fail_path)
pass_timestamps_dic = get_timestamps(pass_path)
In [214]:
# get_BPs.sh downloads
# Returns fail_BPcount.txt & pass_BPcount.txt
fail_bp_file = 'fail_BPcount.txt'
pass_bp_file = 'pass_BPcount.txt'


def _load_bp_counts(bp_count_path):
    """Parse one BP-count text file into a ``{name: count}`` dict.

    A line that splits into exactly four whitespace-separated fields is
    treated as a count record: its fourth field is the integer count, and the
    key is the line immediately before it with the first 15 characters and
    the final character (the newline) removed.

    Args:
        bp_count_path: path of the text file to read.

    Returns:
        dict mapping the extracted name to its integer count.
    """
    bp_counts = {}
    # 'with' closes the handle; the bare open() calls used before never did.
    with open(bp_count_path, 'r') as handle:
        reads = handle.readlines()
    for i, r in enumerate(reads):
        entry = r.split()
        # The first line has no predecessor: reads[i-1] would silently wrap
        # around to the LAST line of the file, so it is skipped.
        if i > 0 and len(entry) == 4:
            # 15 leading characters are dropped -- presumably a directory
            # prefix; TODO confirm against the get_BPs.sh output format.
            bp_counts[reads[i-1][15:-1]] = int(entry[3])
    return bp_counts


pass_bp_dic = _load_bp_counts(pass_bp_file)
fail_bp_dic = _load_bp_counts(fail_bp_file)
In [218]:
# Plot the cumulative nucleotide count of the 2D FAIL reads against timestamp.
fail2d_files = []
fail2d_timestamps = []
fail2d_bp_count = []

# Get the file names of only the 2D FAIL reads
with open('fastq_2D_fail.txt', 'r') as fail2d_listing:
    for line in fail2d_listing:
        ix = line.rfind('twodirections:')
        if ix == -1:
            # No marker on this line: the fixed offset below would slice out
            # an unrelated piece of text, so the line is skipped.
            continue
        # Skip the 14-character marker plus 15 more characters (presumably a
        # directory prefix -- TODO confirm) and remove the '\n' at the end.
        fail2d_files.append(line[ix+29:].rstrip('\n'))

# Keep only reads that have both a timestamp and a nucleotide count. They are
# held as (timestamp, count) pairs so that sorting keeps each count attached
# to its own timestamp; sorting the timestamp list alone misaligned them.
fail2d_points = sorted(
    (fail_timestamps_dic[name], fail_bp_dic[name])
    for name in fail2d_files
    if name in fail_timestamps_dic and name in fail_bp_dic
)
for stamp, count in fail2d_points:
    fail2d_timestamps.append(stamp)
    fail2d_bp_count.append(count)

if fail2d_timestamps:
    fail_min = min(fail2d_timestamps)
    fail_max = max(fail2d_timestamps)
    # Negative step: ticks are generated from fail_max down towards fail_min.
    interval = (fail_min-fail_max)/5
    df = pd.DataFrame(data=fail2d_bp_count, index=fail2d_timestamps)
    df.columns = ['nucleotide frequency']
    df = df.cumsum()
    ax = df.plot()
    ax.set_title('cummulative nucleotide count versus timestamp of entry - FAIL_2D')
    ax.set_xlabel("time")
    ax.set_ylabel("nucleotides")
    if interval != 0:
        # A zero step (all reads share one timestamp) is rejected by np.arange.
        ax.xaxis.set_ticks(np.arange(fail_max,fail_min, interval))
else:
    # min()/max() raise ValueError on an empty list, so report instead.
    print("There are no fail 2D reads with a timestamp and a nucleotide count")
mpld3.display()
Out[218]:
In [216]:
import pandas as pd
# Plot the cumulative nucleotide count of the 2D PASS reads against timestamp.
pass2d_files = []
pass2d_timestamps = []
pass2d_bp_count = []

# Get the file names of only the 2D PASS reads
with open('fastq_2D_pass.txt', 'r') as pass2d_listing:
    for line in pass2d_listing:
        ix = line.rfind('twodirections:')
        if ix == -1:
            # No marker on this line: the fixed offset below would slice out
            # an unrelated piece of text, so the line is skipped.
            continue
        # Skip the 14-character marker plus 15 more characters (presumably a
        # directory prefix -- TODO confirm) and remove the '\n' at the end.
        pass2d_files.append(line[ix+29:].rstrip('\n'))

# Keep only reads that have both a timestamp and a nucleotide count (an
# unknown name used to raise KeyError here). They are held as
# (timestamp, count) pairs so that sorting keeps each count attached to its
# own timestamp; previously the FAIL list was sorted here by mistake and the
# PASS data was never ordered at all.
pass2d_points = sorted(
    (pass_timestamps_dic[name], pass_bp_dic[name])
    for name in pass2d_files
    if name in pass_timestamps_dic and name in pass_bp_dic
)
for stamp, count in pass2d_points:
    pass2d_timestamps.append(stamp)
    pass2d_bp_count.append(count)

if pass2d_timestamps:
    pass_min = min(pass2d_timestamps)
    pass_max = max(pass2d_timestamps)
    # Negative step: ticks are generated from pass_max down towards pass_min.
    interval = (pass_min-pass_max)/5
    df = pd.DataFrame(data=pass2d_bp_count, index=pass2d_timestamps)
    df.columns = ['nucleotide frequency']
    df = df.cumsum()
    ax = df.plot()
    ax.set_title('cummulative nucleotide count versus timestamp of entry - PASS_2D')
    ax.set_xlabel("time")
    ax.set_ylabel("nucleotides")
    if interval != 0:
        # A zero step (all reads share one timestamp) is rejected by np.arange.
        ax.xaxis.set_ticks(np.arange(pass_max,pass_min, interval))
else:
    # min()/max() raise ValueError on an empty list, so report instead.
    print("There are no pass 2D reads with a timestamp and a nucleotide count")
mpld3.display()
Out[216]:
In [226]:
# Locations of the 'pass' and 'fail' download folders.
# NOTE: the same two paths are also assigned in the notebook's first cell.
pass_path = '/Volumes/Seagate Expansion Drive/hackathon2_data/downloads/downloads/pass'
fail_path = '/Volumes/Seagate Expansion Drive/hackathon2_data/downloads/downloads/fail'
# NOTE: get_timestamps is also defined in the notebook's first cell; whichever
# cell ran last is the definition in effect.
def get_timestamps(directory_path):
    """Map every file name under ``directory_path`` to its ctime.

    The directory tree is walked recursively with ``os.walk``. Keys are bare
    file names (no directory part), so if the same name occurs in more than
    one sub-directory, the entry visited last by the walk wins.

    Note: ``os.path.getctime`` is the creation time on Windows, but on
    Unix-like systems it is the time of the last metadata change.

    Args:
        directory_path: root directory to scan.

    Returns:
        dict mapping file name -> ``os.path.getctime`` value (float seconds
        since the epoch). Empty when the walk yields no files.
    """
    timestamp_dic = {}
    for root, _, filenames in os.walk(directory_path):
        for filename in filenames:
            # os.path.join instead of manual '/' concatenation.
            file_path = os.path.join(root, filename)
            timestamp_dic[filename] = os.path.getctime(file_path)
    return timestamp_dic
In [265]:
# Every timestamped file that is not listed among the 2D reads is treated as
# a 1D read. sorted() returns a list in a reproducible order; iterating the
# bare set difference gave an arbitrary order that fed into the later plots.
fail1d_files = sorted(set(fail_timestamps_dic.keys()) - set(fail2d_files))
pass1d_files = sorted(set(pass_timestamps_dic.keys()) - set(pass2d_files))
In [274]:
# Plot the cumulative nucleotide count of the 1D FAIL reads against timestamp.
fail1d_timestamps = []
fail1d_bp_count = []
# Stand-in count given to every read when no real count could be looked up.
# The notebook does not say why 3000 was chosen -- TODO confirm.
fallback_bp_count = 3000

# Keep only reads that have both a timestamp and a nucleotide count. They are
# held as (timestamp, count) pairs so that sorting keeps each count attached
# to its own timestamp; sorting the timestamp list alone misaligned them.
fail1d_points = sorted(
    (fail_timestamps_dic[name], fail_bp_dic[name])
    for name in fail1d_files
    if name in fail_timestamps_dic and name in fail_bp_dic
)
if fail1d_files and not fail1d_points:
    # None of the 1D reads had a parsed count: fall back to the stand-in
    # value so that the timing of the reads can still be plotted.
    print("Fail 1D reads did not have readable values")
    fail1d_points = sorted(
        (fail_timestamps_dic[name], fallback_bp_count)
        for name in fail1d_files
        if name in fail_timestamps_dic
    )
for stamp, count in fail1d_points:
    fail1d_timestamps.append(stamp)
    fail1d_bp_count.append(count)

if fail1d_timestamps:
    fail_min = min(fail1d_timestamps)
    fail_max = max(fail1d_timestamps)
    # Negative step: ticks are generated from fail_max down towards fail_min.
    interval = (fail_min-fail_max)/5
    df = pd.DataFrame(data=fail1d_bp_count, index=fail1d_timestamps)
    df.columns = ['nucleotide frequency']
    df = df.cumsum()
    ax = df.plot()
    # This cell plots the 1D reads; the title used to be labelled FAIL_2D.
    ax.set_title('cummulative nucleotide count versus timestamp of entry - FAIL_1D')
    ax.set_xlabel("time")
    ax.set_ylabel("nucleotides")
    if interval != 0:
        # A zero step (all reads share one timestamp) is rejected by np.arange.
        ax.xaxis.set_ticks(np.arange(fail_max,fail_min, interval))
else:
    print("There are no fail 1D reads")
mpld3.display()
Out[274]:
In [267]:
# Plot the cumulative nucleotide count of the 1D PASS reads against timestamp.
pass1d_timestamps = []
pass1d_bp_count = []

# Keep only reads that have both a timestamp and a nucleotide count. They are
# held as (timestamp, count) pairs so that sorting keeps each count attached
# to its own timestamp; sorting the timestamp list alone misaligned them.
pass1d_points = sorted(
    (pass_timestamps_dic[name], pass_bp_dic[name])
    for name in pass1d_files
    if name in pass_timestamps_dic and name in pass_bp_dic
)
for stamp, count in pass1d_points:
    pass1d_timestamps.append(stamp)
    pass1d_bp_count.append(count)

if pass1d_timestamps:
    pass_min = min(pass1d_timestamps)
    pass_max = max(pass1d_timestamps)
    # Negative step: ticks are generated from pass_max down towards pass_min.
    interval = (pass_min-pass_max)/5
    df = pd.DataFrame(data=pass1d_bp_count, index=pass1d_timestamps)
    df.columns = ['nucleotide frequency']
    df = df.cumsum()
    ax = df.plot()
    # This cell plots the 1D PASS reads; the title used to be labelled FAIL_2D.
    ax.set_title('cummulative nucleotide count versus timestamp of entry - PASS_1D')
    ax.set_xlabel("time")
    ax.set_ylabel("nucleotides")
    if interval != 0:
        # Ticks must span the PASS range; fail_max/fail_min were used here
        # before. A zero step is rejected by np.arange, hence the guard.
        ax.xaxis.set_ticks(np.arange(pass_max,pass_min, interval))
else:
    # Also reached when no 1D read has both values; min()/max() would raise
    # ValueError on the empty list.
    print("There are no 1D pass reads")
# Kept as the last top-level expression so the notebook renders its value;
# inside the 'if' branch the result was discarded.
mpld3.display()
In [ ]: