notebook.community

Edit and run



In [25]:

    
from __future__ import absolute_import

import sys
sys.path.append("../lib")
import pandas as pd
import matplotlib.pyplot as plt
from database import Database, Result



In [13]:

    
a = """
sdf {aval}
""".format(**{"aval":"hello"})
print(a)









    



sdf hello



In [76]:

    
db = Database()

def runTimes(start_time, end_time, db):
    times = {
            "starttime": start_time,
            "endtime": end_time
            }

    sql = """
    select 
    a.job_id, a.starttime as mrbayes_starttime, a.timestamp as mrbayes_timestamp, 
    b.starttime as clustalo_starttime, b.timestamp as clustalo_timestamp,
    TIMESTAMPDIFF(SECOND, a.starttime, a.timestamp) as mrbayes_time,
    TIMESTAMPDIFF(SECOND, b.starttime, b.timestamp) as clustalo_time
    from (select * from results 
          where prog='mrbayes' and starttime > '{starttime}' and timestamp < '{endtime}'
          ) a 
    join (select * from results 
          where prog='clustalo' and starttime > '{starttime}' and timestamp < '{endtime}'
          ) b 
    on a.job_id = b.job_id 
    order by a.timestamp desc
    """.format(**times)

    return pd.read_sql_query(sql, db.conn)


# rq = {"order":"squamata","data.dwc:associatedSequences": {"type":"exists"}}
# limit = 30
# 4 sequences
seq_4 = runTimes("2015-12-09 00:21:00", "2015-12-09 00:28:00", db)
# rq = {"order":"squamata","data.dwc:associatedSequences": {"type":"exists"}}
# limit = 100
# 5 sequences
seq_5 = runTimes("2015-12-09 00:44:15", "2015-12-09 00:46:30", db)
# rq = {"order":"squamata","data.dwc:associatedSequences": {"type":"exists"}}
# limit = 1000
# 6 sequences
seq_6 = runTimes("2015-12-09 01:00:00", "2015-12-09 01:03:00", db)
# rq = {"order":"squamata","data.dwc:associatedSequences": {"type":"exists"}}
# limit = 1000
# 7 sequences
seq_7 = runTimes("2015-12-09 01:05:00", "2015-12-09 01:09:00", db)
# rq = {"order":"squamata","data.dwc:associatedSequences": {"type":"exists"}}
# limit = 1000
# 8 sequences
seq_8 = runTimes("2015-12-09 01:14:00", "2015-12-09 01:17:00", db)
# rq = {"order":"squamata","data.dwc:associatedSequences": {"type":"exists"}}
# limit = 1000
# 16 sequences
seq_16 = runTimes("2015-12-09 01:44:00", "2015-12-09 01:50:00", db)
print seq_16









    



                                 job_id   mrbayes_starttime  \
0  b18d1674-706d-4275-b0d3-cdc3b6caf7b9 2015-12-09 01:49:18   
1  a1b7676c-9d94-42c8-938d-8151b2a8dd58 2015-12-09 01:48:50   
2  090771c3-cdd3-4a84-9cf8-36c1e362f6f7 2015-12-09 01:48:24   
3  b3b301b9-0b84-4aaf-b341-e16f16c5a534 2015-12-09 01:48:01   
4  d34f7fd9-5dc5-437e-9bc3-7aff0a6d9b02 2015-12-09 01:47:37   
5  b74ffbb4-492b-4a4e-872e-d954a5052892 2015-12-09 01:47:14   
6  cd33cffc-c04f-4895-8ed6-6089a7dd7e1d 2015-12-09 01:46:48   
7  7235f66e-e943-4e89-b66c-c126f6b52c82 2015-12-09 01:46:24   
8  75394e50-fd68-4c92-8cb9-f49daccfb30f 2015-12-09 01:46:02   
9  540cd4d0-4145-467e-842b-bd06c2f8874e 2015-12-09 01:44:47   

    mrbayes_timestamp  clustalo_starttime  clustalo_timestamp  mrbayes_time  \
0 2015-12-09 01:49:33 2015-12-09 01:49:14 2015-12-09 01:49:18            15   
1 2015-12-09 01:49:04 2015-12-09 01:48:46 2015-12-09 01:48:50            14   
2 2015-12-09 01:48:39 2015-12-09 01:48:20 2015-12-09 01:48:24            15   
3 2015-12-09 01:48:16 2015-12-09 01:47:57 2015-12-09 01:48:01            15   
4 2015-12-09 01:47:52 2015-12-09 01:47:33 2015-12-09 01:47:37            15   
5 2015-12-09 01:47:29 2015-12-09 01:47:10 2015-12-09 01:47:14            15   
6 2015-12-09 01:47:06 2015-12-09 01:46:44 2015-12-09 01:46:48            18   
7 2015-12-09 01:46:40 2015-12-09 01:46:20 2015-12-09 01:46:24            16   
8 2015-12-09 01:46:17 2015-12-09 01:45:59 2015-12-09 01:46:02            15   
9 2015-12-09 01:45:05 2015-12-09 01:44:43 2015-12-09 01:44:47            18   

   clustalo_time  
0              4  
1              4  
2              4  
3              4  
4              4  
5              4  
6              4  
7              4  
8              3  
9              4



In [77]:

    
# the slices above need to be averaged together in to a new df
df = pd.DataFrame()
#df["seq_4_mrbayes"]
mean_clustalo = [seq_4["clustalo_time"].mean(), seq_5["clustalo_time"].mean(),
                seq_6["clustalo_time"].mean(), seq_7["clustalo_time"].mean(),
                seq_8["clustalo_time"].mean(), seq_16["clustalo_time"].mean()]
mean_mrbayes = [seq_4["mrbayes_time"].mean(), seq_5["mrbayes_time"].mean(),
               seq_6["mrbayes_time"].mean(), seq_7["mrbayes_time"].mean(),
               seq_8["mrbayes_time"].mean(), seq_16["mrbayes_time"].mean()]
df["labels"] = ["4", "5", "6", "7", "8", "16"]
df["mean_clustalo"] = mean_clustalo
df["mean_mrbayes"] = mean_mrbayes
#print df



In [80]:

    
%matplotlib inline
#http://chrisalbon.com/python/matplotlib_stacked_bar_plot.html
f, ax1 = plt.subplots(1, figsize=(10,5))
bar_width = 0.75
bar_l = [i+1 for i in range(len(df['labels']))]
tick_pos = [i+(bar_width/2) for i in bar_l]
ax1.bar(bar_l,
        # using the pre_score data
        df['mean_clustalo'],
        # set the width
        width=bar_width,
        # with the label pre score
        label='Clustal O',
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#F4561D')
# Create a bar plot, in position bar_1
ax1.bar(bar_l,
        # using the mid_score data
        df['mean_mrbayes'],
        # set the width
        width=bar_width,
        # with pre_score on the bottom
        bottom=df['mean_clustalo'],
        # with the label mid score
        label='MrBayes',
        # with alpha 0.5
        alpha=0.5,
        # with color
        color='#F1911E')
plt.xticks(tick_pos, df['labels'])
ax1.set_ylabel("Seconds Used by Worker")
ax1.set_xlabel("Number of Sequences")
plt.legend(loc='upper left')
plt.xlim([min(tick_pos)-bar_width, max(tick_pos)+bar_width])









    Out[80]:





(0.625, 7.125)



In [ ]: