notebook.community

Edit and run



In [ ]:

    
import os
import pandas as pd
import re
import subprocess
import sys
import matplotlib as mpl
mpl.use('Agg')  # pick the non-interactive Agg backend before pyplot is imported
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# NOTE(review): the inline magic below selects the notebook's inline backend,
# so the Agg choice above presumably only matters when this code runs outside
# Jupyter -- confirm and keep only one of the two.
%matplotlib inline



In [ ]:

    
# Make the modules in ../code/ importable (the path is relative to the
# notebook's working directory). The membership check keeps sys.path free of
# duplicate entries when this cell is executed more than once.
if '../code/' not in sys.path:
    sys.path.append('../code/')

from database_comparisons import Database, DatabaseComparison



In [ ]:

    
# Java executable, chosen per host:
#   * badger: the plain `java` command (the active setting below).
#   * Waffle: until its default Java is version 8, use the explicit path
#     "/usr/lib/jvm/java-8-oracle/jre/bin/java" instead.
java = "java"



In [ ]:

    
# Print the working directory; the relative '../' paths used in this notebook
# are resolved against it.
! pwd



In [ ]:

    
# Long-format listing of the .jar files under ../jars.
! ls -l ../jars/*.jar



In [ ]:

    
# Long-format listing of the databases directory that the next cell scans.
! ls -l ../data_mining_Neo4j_v2_3_2/databases/



In [ ]:

    
# Entries of the databases directory whose names contain both 'db_50M_' and
# '.tsv'; the list is displayed as the cell output.
files = [
    name
    for name in os.listdir('../data_mining_Neo4j_v2_3_2/databases')
    if 'db_50M_' in name and '.tsv' in name
]
files



In [ ]:

    
# Parse the numeric cutoff that follows '50M_' in every file name and keep the
# values in ascending order.
# The pattern is a raw string because '\d' inside a plain string literal is an
# invalid escape sequence, and the dot is escaped so that it matches only a
# literal decimal point instead of any character.
# A file name without a '<digits>.<digits>' part after '50M_' still raises
# AttributeError here (re.search returns None), exactly as before.
cutoffs = sorted(
    float(re.search(r'50M_(\d+\.\d*)', f).group(1))
    for f in files
)



In [ ]:

    
# Display the sorted cutoff values parsed from the file names.
cutoffs



In [ ]:

    
# Create the comparison object for the '50M' data set and hand it the parsed
# cutoffs.
# NOTE(review): make_dbs presumably builds or loads one database per cutoff --
# confirm in database_comparisons.
dbc  = DatabaseComparison(desc_string='50M')
dbc.make_dbs(cutoffs)



In [ ]:

    
# Display the comparison summary (used further down like a pandas DataFrame:
# .columns, .head() and .plot.scatter()).
dbc.summary



In [ ]:

    
# Display the collection of databases; later cells iterate it with .items().
dbc.databases



In [ ]:

    
# Going by the method name: database construction time against node count,
# requested with logx=True.
p = dbc.plot_db_construction_time_vs_n_nodes(logx=True)



In [ ]:

    
# Going by the method name: database construction time against the cutoff.
p = dbc.plot_db_construction_time_vs_cutoff()



In [ ]:

    
# NOTE(review): this is the same call, with the same arguments, as the
# construction-time-vs-nodes cell two cells earlier -- likely redundant.
p = dbc.plot_db_construction_time_vs_n_nodes(logx=True)



In [ ]:

    
# Going by the method name: graph density against node count, requested with
# logx=True.
p = dbc.plot_density_vs_nodes(logx=True)



In [ ]:

    
# Summary column 'nodes' against 'cutoff', drawn in black with logy=True.
p = dbc.plot_base(
    x='cutoff',
    y='nodes',
    color='k',
    logy=True,
    figsize=(4.5, 2.5),
)



In [ ]:

    
# List the summary's column names; these are the strings passed as x / y / c
# in the plotting cells.
dbc.summary.columns



In [ ]:

    
# Summary column 'construction seconds' against 'cutoff', drawn in black with
# logy=True.
p = dbc.plot_base(
    x='cutoff',
    y='construction seconds',
    color='k',
    logy=True,
    figsize=(4.5, 2.5),
)



In [ ]:

    
# Summary column 'construction minutes' against 'cutoff', drawn in black with
# logy=False.
p = dbc.plot_base(
    x='cutoff',
    y='construction minutes',
    color='k',
    logy=False,
    figsize=(4.5, 2.5),
)



In [ ]:

    
# Peek at the first two rows of the summary.
dbc.summary.head(2)



In [ ]:

    
# 'nodes' against 'cutoff', points coloured by 'construction minutes'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='cutoff',
    y='nodes',
    c='construction minutes',
    ax=ax,
)
ax.set_xlabel('partial correlation cutoff')



In [ ]:

    
# 'construction minutes' against 'edges', points coloured by 'nodes'; the
# figure is written to a PNG at 600 dpi.
fig, ax = plt.subplots(figsize=(5, 2))
dbc.summary.plot.scatter(
    x='edges',
    y='construction minutes',
    c='nodes',
    ax=ax,
)
ax.set_ylabel('database construction\ntime (minutes)')
fig.tight_layout()
fig.savefig('161213_db_construction_linear_with_edges.png', dpi=600)



In [ ]:

    
# 'nodes' against 'construction minutes', points coloured by 'cutoff'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='construction minutes',
    y='nodes',
    c='cutoff',
    ax=ax,
)
ax.set_xlabel('database construction time (minutes)')



In [ ]:

    
# Show the sorted cutoff values again. `cutoffs` already holds this list (it
# was built from `files` earlier in the notebook), so the file names do not
# need to be parsed a second time with a copy of the regular expression.
cutoffs



In [ ]:

    
# '# connected components' against 'cutoff', points coloured by 'density'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='cutoff',
    y='# connected components',
    c='density',
    ax=ax,
)
ax.set_xlabel('partial correlation cutoff')



In [ ]:

    
# 'total # genes in connected components' against 'cutoff', points coloured by
# '# connected components'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='cutoff',
    y='total # genes in connected components',
    c='# connected components',
    ax=ax,
)
ax.set_xlabel('partial correlation cutoff')



In [ ]:

    
# '# connected components' against 'nodes', points coloured by 'density'.
# No custom x label is set for this figure.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='nodes',
    y='# connected components',
    c='density',
    ax=ax,
)



In [ ]:

    
# '# connected components' against 'nodes', points coloured by 'cutoff'.
# No custom x label is set for this figure.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='nodes',
    y='# connected components',
    c='cutoff',
    ax=ax,
)



In [ ]:

    
# '# connected components' against 'cutoff', points coloured by 'nodes'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='cutoff',
    y='# connected components',
    c='nodes',
    ax=ax,
)
ax.set_xlabel('partial correlation cutoff')



In [ ]:

    
# '# cc with multiple species' against 'cutoff', points coloured by
# '# connected components'; the figure is written to a PNG at 600 dpi.
fig, ax = plt.subplots(figsize=(5, 2.5))
dbc.summary.plot.scatter(
    x='cutoff',
    y='# cc with multiple species',
    c='# connected components',
    ax=ax,
)
ax.set_xlabel('partial correlation cutoff')
ax.set_ylabel('# of connected components\nwith multiple species')
fig.tight_layout()
fig.savefig('161213_cc_vs_pcor_cutoff.png', dpi=600)



In [ ]:

    
# 'frac cc with multiple species' against 'cutoff', points coloured by
# '# connected components'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='cutoff',
    y='frac cc with multiple species',
    c='# connected components',
    ax=ax,
)
ax.set_xlabel('partial correlation cutoff')



In [ ]:

    
# Call histogram_of_nodes() on the connected components of every database.
for db_key, db in dbc.databases.items():
    db.connected_components.histogram_of_nodes()



In [ ]:

    
# Call histogram_of_species() on the connected components of every database.
for db_key, db in dbc.databases.items():
    db.connected_components.histogram_of_species()



In [ ]:

    
# Species histogram for the connected components looked up by cutoff 0.0125.
# NOTE(review): assumes 0.0125 is one of the parsed values in `cutoffs` --
# confirm how cc_by_cutoff_num matches floats.
p1 = dbc.cc_by_cutoff_num(0.0125).histogram_of_species()



In [ ]:

    
# Species histogram for the connected components looked up by cutoff 0.02.
# NOTE(review): this rebinds p1, which the 0.0125 cell also assigns.
p1 = dbc.cc_by_cutoff_num(0.02).histogram_of_species()



In [ ]:

    
# Species histogram for the connected components looked up by cutoff 0.025.
p2 = dbc.cc_by_cutoff_num(0.025).histogram_of_species()



In [ ]:

    
# Draw the organisms-in-components heatmap for every database. A failure for
# one database is reported and the loop carries on with the next one.
for n, d in dbc.databases.items():
    try:
        d.connected_components.heatmap_of_organisms_appearance_in_components()
    except Exception as err:
        # Catch Exception rather than using a bare `except:` so that
        # interrupting the kernel (KeyboardInterrupt) or SystemExit still
        # stops the loop, and print the error instead of discarding it.
        print(d.cutoff)
        print('failed for {}'.format(n))
        print('  {}: {}'.format(type(err).__name__, err))



In [ ]: