notebook.community

Edit and run



In [213]:

    
import os
import time

import numpy as np
import pandas as pd

# NOTE(review): the plotting cells also call `mpld3.display()`, but no import
# of mpld3 is visible in this notebook - confirm where it is brought into scope.

def get_timestamps(directory_path):
    """Map every file name found under ``directory_path`` to its ctime.

    The directory tree is walked recursively with ``os.walk`` and each file's
    timestamp is read with ``os.path.getctime``.

    Args:
        directory_path: Root directory to scan.

    Returns:
        dict mapping the bare file name (no directory component) to the float
        returned by ``os.path.getctime``. Because the keys are bare names, a
        name that occurs in several sub-directories keeps only the value from
        the directory visited last. A directory that cannot be listed yields
        an empty dict, since ``os.walk`` ignores listing errors by default.

    Note:
        ``os.path.getctime`` reports the creation time on Windows but the time
        of the last metadata change on Unix-like systems.
    """
    timestamp_dic = {}

    for root, _, filenames in os.walk(directory_path):
        for filename in filenames:
            # os.path.join instead of manual '/' concatenation.
            file_path = os.path.join(root, filename)
            timestamp_dic[filename] = os.path.getctime(file_path)

    return timestamp_dic
                  
# Folders holding the downloaded reads, one per fail/pass sub-folder.
# NOTE(review): these are absolute paths on an external volume, so the cell
# only runs on a machine where that drive is mounted.
pass_path = '/Volumes/Seagate Expansion Drive/hackathon2_data/downloads/downloads/pass'
fail_path = '/Volumes/Seagate Expansion Drive/hackathon2_data/downloads/downloads/fail'

# File name -> ctime lookup tables, one per folder.
fail_timestamps_dic = get_timestamps(fail_path)
pass_timestamps_dic = get_timestamps(pass_path)



In [214]:

    
# get_BPs.sh downloads
# Returns fail_BPcount.txt & pass_BPcount.txt

fail_bp_file = 'fail_BPcount.txt'
pass_bp_file = 'pass_BPcount.txt'


def _load_bp_counts(bp_file, name_offset=15):
    """Parse one ``*_BPcount.txt`` file into a ``{name: count}`` dict.

    A line that splits into exactly four whitespace-separated fields is
    treated as a count line: its fourth field is converted to ``int`` and
    stored under a key taken from the line directly above it, after dropping
    that line's first ``name_offset`` characters and its trailing newline.

    NOTE(review): the exact layout written by get_BPs.sh is not visible here;
    the 15-character prefix is carried over unchanged from the original cell.

    Args:
        bp_file: Path of the count file to read.
        name_offset: Number of leading characters to drop from the name line.

    Returns:
        dict mapping the extracted name to its integer count.
    """
    counts = {}
    # `with` closes the handle; the original cell leaked both file objects.
    with open(bp_file, 'r') as handle:
        lines = handle.readlines()
    # Start at 1: the first line has no predecessor, and indexing [i-1] at
    # i == 0 would silently wrap around to the LAST line of the file.
    for i in range(1, len(lines)):
        entry = lines[i].split()
        if len(entry) == 4:
            # rstrip('\n') instead of [:-1] so a final line without a
            # trailing newline does not lose a real character.
            name = lines[i - 1][name_offset:].rstrip('\n')
            counts[name] = int(entry[3])
    return counts


pass_bp_dic = _load_bp_counts(pass_bp_file)
fail_bp_dic = _load_bp_counts(fail_bp_file)



In [218]:

    
fail2d_files = []

# Get the file names of only the 2D FAIL reads.
# `with` closes the handle once the lines are read.
with open('fastq_2D_fail.txt', 'r') as f:
    reads = f.readlines()
for line in reads:
    ix = line.rfind('twodirections:')
    # Keep the text starting 29 characters after the last 'twodirections:'
    # marker and drop the final character.
    # NOTE(review): assumes the marker is present (rfind returns -1 otherwise)
    # and that every line ends with '\n' - confirm against the file format.
    fail2d_files.append(line[ix+29:-1]) # Remove '\n' at end

# Pair every timestamp with its nucleotide count BEFORE sorting. Sorting the
# timestamp list on its own (as the cell originally did) detaches each count
# from its timestamp, so the cumulative curve no longer follows time order.
fail2d_pairs = sorted(
    ((fail_timestamps_dic[name], fail_bp_dic[name])
     for name in fail2d_files
     # Only reads present in BOTH lookup tables can be plotted.
     if name in fail_timestamps_dic and name in fail_bp_dic),
    key=lambda pair: pair[0])
fail2d_timestamps = [pair[0] for pair in fail2d_pairs]
fail2d_bp_count = [pair[1] for pair in fail2d_pairs]

fail_min = min(fail2d_timestamps)
fail_max = max(fail2d_timestamps)
# Spacing between x ticks (five equal steps across the observed time range).
interval = (fail_max-fail_min)/5

df = pd.DataFrame(data=fail2d_bp_count, index=fail2d_timestamps)

df.columns = ['nucleotide frequency']
df = df.cumsum()
ax = df.plot()
ax.set_title('cummulative nucleotide count versus timestamp of entry - FAIL_2D')
ax.set_xlabel("time")
ax.set_ylabel("nucleotides")
# linspace gives six evenly spaced ticks from min to max and, unlike arange
# with a computed step, does not fail when min == max (step of zero).
ax.xaxis.set_ticks(np.linspace(fail_min, fail_max, 6))

mpld3.display()









    Out[218]:



In [216]:

    
import pandas as pd

pass2d_files = []

# Get the file names of only the 2D PASS reads.
# `with` closes the handle once the lines are read.
with open('fastq_2D_pass.txt', 'r') as f:
    reads = f.readlines()
for line in reads:
    ix = line.rfind('twodirections:')
    # Keep the text starting 29 characters after the last 'twodirections:'
    # marker and drop the final character.
    # NOTE(review): assumes the marker is present (rfind returns -1 otherwise)
    # and that every line ends with '\n' - confirm against the file format.
    pass2d_files.append(line[ix+29:-1]) # Remove '\n' at end

# Pair every timestamp with its nucleotide count and sort the PAIRS by time.
# The original cell sorted `fail2d_timestamps` here (copy-paste slip), leaving
# the pass data unsorted; sorting only one list would also have detached the
# counts from their timestamps.
pass2d_pairs = sorted(
    ((pass_timestamps_dic[name], pass_bp_dic[name])
     for name in pass2d_files
     # Skip reads missing from either lookup table, matching the guard used
     # by the FAIL 2D and 1D cells (the original raised KeyError instead).
     if name in pass_timestamps_dic and name in pass_bp_dic),
    key=lambda pair: pair[0])
pass2d_timestamps = [pair[0] for pair in pass2d_pairs]
pass2d_bp_count = [pair[1] for pair in pass2d_pairs]

pass_min = min(pass2d_timestamps)
pass_max = max(pass2d_timestamps)
# Spacing between x ticks (five equal steps across the observed time range).
interval = (pass_max-pass_min)/5

df = pd.DataFrame(data=pass2d_bp_count, index=pass2d_timestamps)

df.columns = ['nucleotide frequency']
df = df.cumsum()
ax = df.plot()
ax.set_title('cummulative nucleotide count versus timestamp of entry - PASS_2D')
ax.set_xlabel("time")
ax.set_ylabel("nucleotides")
# linspace gives six evenly spaced ticks from min to max and, unlike arange
# with a computed step, does not fail when min == max (step of zero).
ax.xaxis.set_ticks(np.linspace(pass_min, pass_max, 6))

mpld3.display()









    Out[216]:



In [226]:

    
# Folders holding the downloaded pass / fail reads.
# NOTE(review): absolute paths on an external volume; the same two values are
# also assigned in the notebook's first cell.
pass_path = '/Volumes/Seagate Expansion Drive/hackathon2_data/downloads/downloads/pass'
fail_path = '/Volumes/Seagate Expansion Drive/hackathon2_data/downloads/downloads/fail'

# NOTE(review): this re-defines the get_timestamps already declared in the
# notebook's first cell; whichever cell ran last is the one in effect.
def get_timestamps(directory_path):
    """Map every file name found under ``directory_path`` to its ctime.

    The directory tree is walked recursively with ``os.walk`` and each file's
    timestamp is read with ``os.path.getctime``.

    Args:
        directory_path: Root directory to scan.

    Returns:
        dict mapping the bare file name (no directory component) to the float
        returned by ``os.path.getctime``. Because the keys are bare names, a
        name that occurs in several sub-directories keeps only the value from
        the directory visited last. A directory that cannot be listed yields
        an empty dict, since ``os.walk`` ignores listing errors by default.

    Note:
        ``os.path.getctime`` reports the creation time on Windows but the time
        of the last metadata change on Unix-like systems.
    """
    timestamp_dic = {}

    for root, _, filenames in os.walk(directory_path):
        for filename in filenames:
            # os.path.join instead of manual '/' concatenation.
            file_path = os.path.join(root, filename)
            timestamp_dic[filename] = os.path.getctime(file_path)

    return timestamp_dic

[]



In [265]:

    
# Every file name that has a timestamp, per folder.
fail_all_names = set(fail_timestamps_dic.keys())
pass_all_names = set(pass_timestamps_dic.keys())

# Whatever is not listed among the 2D reads is what the following cells
# treat as the 1D reads.
fail1d_files = list(fail_all_names - set(fail2d_files))
pass1d_files = list(pass_all_names - set(pass2d_files))



In [274]:

    
fail1d_timestamps = []
fail1d_bp_count = []

# Count substituted for each read when no nucleotide counts are available.
# NOTE(review): 3000 is carried over from the original cell; where the value
# comes from is not documented in this notebook.
FAIL1D_DEFAULT_BP_COUNT = 3000

if fail1d_files:

    # Keep each timestamp paired with its nucleotide count so the two stay
    # aligned when sorted by time below.
    fail1d_pairs = [(fail_timestamps_dic[name], fail_bp_dic[name])
                    for name in fail1d_files
                    if name in fail_timestamps_dic and name in fail_bp_dic]

    if not fail1d_pairs:
        # None of the 1D reads had an entry in fail_bp_dic: fall back to a
        # fixed count per read so the timeline can still be drawn.
        print("Fail 1D reads did not have readable values")
        fail1d_pairs = [(fail_timestamps_dic[name], FAIL1D_DEFAULT_BP_COUNT)
                        for name in fail1d_files
                        if name in fail_timestamps_dic]

    # Sort the PAIRS by timestamp. Sorting the timestamp list alone (as the
    # cell originally did) detaches the counts from their timestamps.
    fail1d_pairs.sort(key=lambda pair: pair[0])
    fail1d_timestamps = [pair[0] for pair in fail1d_pairs]
    fail1d_bp_count = [pair[1] for pair in fail1d_pairs]

    fail_min = min(fail1d_timestamps)
    fail_max = max(fail1d_timestamps)
    # Spacing between x ticks (five equal steps across the time range).
    interval = (fail_max-fail_min)/5

    df = pd.DataFrame(data=fail1d_bp_count, index=fail1d_timestamps)

    df.columns = ['nucleotide frequency']
    df = df.cumsum()
    ax = df.plot()
    # This cell plots the 1D fail reads (the original title said FAIL_2D).
    ax.set_title('cummulative nucleotide count versus timestamp of entry - FAIL_1D')
    ax.set_xlabel("time")
    ax.set_ylabel("nucleotides")
    # linspace gives six evenly spaced ticks from min to max and does not
    # fail when min == max, unlike arange with a computed step of zero.
    ax.xaxis.set_ticks(np.linspace(fail_min, fail_max, 6))

else:
    print("There are no fail 1D reads")

mpld3.display()









    



Fail 2D reads did not have readable values






    Out[274]:



In [267]:

    
pass1d_timestamps = []
pass1d_bp_count = []
# Holds the object returned by mpld3.display() so it can be the cell's last
# top-level expression; stays None (nothing rendered) when there is no plot.
pass1d_figure = None

if pass1d_files:
    # Keep each timestamp paired with its nucleotide count and sort the PAIRS
    # by time. Sorting the timestamp list alone (as the cell originally did)
    # detaches the counts from their timestamps.
    pass1d_pairs = sorted(
        ((pass_timestamps_dic[name], pass_bp_dic[name])
         for name in pass1d_files
         if name in pass_timestamps_dic and name in pass_bp_dic),
        key=lambda pair: pair[0])
    pass1d_timestamps = [pair[0] for pair in pass1d_pairs]
    pass1d_bp_count = [pair[1] for pair in pass1d_pairs]

    if pass1d_timestamps:
        pass_min = min(pass1d_timestamps)
        pass_max = max(pass1d_timestamps)
        # Spacing between x ticks (five equal steps across the time range).
        interval = (pass_max-pass_min)/5

        df = pd.DataFrame(data=pass1d_bp_count, index=pass1d_timestamps)

        df.columns = ['nucleotide frequency']
        df = df.cumsum()
        ax = df.plot()
        # This cell plots the 1D pass reads (the original title said FAIL_2D).
        ax.set_title('cummulative nucleotide count versus timestamp of entry - PASS_1D')
        ax.set_xlabel("time")
        ax.set_ylabel("nucleotides")
        # Ticks span the PASS range; the original used fail_max/fail_min here.
        ax.xaxis.set_ticks(np.linspace(pass_min, pass_max, 6))

        pass1d_figure = mpld3.display()
    else:
        # No 1D read had both a timestamp and a count; min()/max() on the
        # empty list would otherwise raise ValueError.
        print("Pass 1D reads did not have readable values")
else:
    print("There are no 1D pass reads")

# The notebook only renders the value of the final top-level expression; a
# call nested inside the `if` above would have its result discarded.
pass1d_figure









    



There are no 1D pass reads



In [ ]: