In [ ]:
import os
import pandas as pd
import re
import subprocess
import sys
import matplotlib as mpl
mpl.use('Agg') 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

%matplotlib inline

In [ ]:
# Add the sibling ../code/ directory (relative to the working directory) to the
# module search path so the project module imported below can be resolved.
sys.path.append('../code/')

# Project-local classes; presumably defined in ../code/database_comparisons.py — confirm.
from database_comparisons import Database, DatabaseComparison

In [ ]:
# Java command to use. "Waffle" and "badger" are presumably host names — confirm.
# On Waffle, until its default Java is version 8, use the explicit Java 8 path:
#java="/usr/lib/jvm/java-8-oracle/jre/bin/java"
# On badger, the plain `java` command name is used:
java='java'
# NOTE(review): `java` is not referenced by any other cell visible here;
# presumably it is consumed elsewhere — confirm or remove.

In [ ]:
# Show the working directory; the relative paths used in this notebook are resolved against it.
! pwd

In [ ]:
# Long listing of the .jar files in ../jars/.
! ls -l ../jars/*.jar

In [ ]:
# Long listing of the database directory that is scanned for input files below.
! ls -l ../data_mining_Neo4j_v2_3_2/databases/

In [ ]:
# Names in the database directory that contain both 'db_50M_' and '.tsv'.
files = [
    name
    for name in os.listdir('../data_mining_Neo4j_v2_3_2/databases')
    if 'db_50M_' in name and '.tsv' in name
]
files

In [ ]:
# Parse the decimal number that follows "50M_" in each file name and keep the
# values in ascending order.
# The pattern is a raw string so that `\d` reaches the regex engine as a digit
# class instead of being an invalid Python string escape, and the decimal point
# is escaped so it matches a literal "." rather than any character.
# A file name without such a number still raises (AttributeError on the missing
# match) rather than being silently skipped.
cutoffs = sorted(float(re.search(r'50M_(\d+\.\d*)', f).group(1)) for f in files)

In [ ]:
# Display the sorted cutoff values parsed from the file names.
cutoffs

In [ ]:
# Comparison object for the '50M' series.
dbc = DatabaseComparison(desc_string='50M')
# Hand every parsed cutoff to make_dbs(); presumably one database per cutoff — confirm.
dbc.make_dbs(cutoffs)

In [ ]:
# Display the comparison's summary table (used as a DataFrame by the plotting cells below).
dbc.summary

In [ ]:
# Display the mapping of databases held by the comparison (iterated with .items() further down).
dbc.databases

In [ ]:
# Construction time vs. number of nodes, going by the helper's name; logx=True
# presumably log-scales the x axis. The return value is kept in `p`.
p = dbc.plot_db_construction_time_vs_n_nodes(logx=True)

In [ ]:
# Construction time vs. cutoff, going by the helper's name, with its default arguments.
p = dbc.plot_db_construction_time_vs_cutoff()

In [ ]:
# NOTE(review): identical call to the earlier construction-time-vs-nodes cell
# (same helper, same logx=True); consider removing one of the two.
p = dbc.plot_db_construction_time_vs_n_nodes(logx=True)

In [ ]:
# Density vs. number of nodes, going by the helper's name; logx=True presumably log-scales x.
p = dbc.plot_density_vs_nodes(logx=True)

In [ ]:
# 'nodes' against 'cutoff' through the generic plot_base helper,
# drawn in black ('k') with logy=True.
p = dbc.plot_base(
    x='cutoff',
    y='nodes',
    color='k',
    logy=True,
    figsize=(4.5, 2.5)
)

In [ ]:
# List the summary's column labels; the plotting cells refer to columns by these names.
dbc.summary.columns

In [ ]:
# 'construction seconds' against 'cutoff' through plot_base,
# drawn in black ('k') with logy=True.
p = dbc.plot_base(
    x='cutoff',
    y='construction seconds',
    color='k',
    logy=True,
    figsize=(4.5, 2.5)
)

In [ ]:
# 'construction minutes' against 'cutoff' through plot_base,
# drawn in black ('k') with logy=False.
p = dbc.plot_base(
    x='cutoff',
    y='construction minutes',
    color='k',
    logy=False,
    figsize=(4.5, 2.5)
)

In [ ]:
# Peek at the first two rows of the summary table.
dbc.summary.head(2)

In [ ]:
# 'nodes' against 'cutoff', points coloured by 'construction minutes'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='cutoff', y='nodes', c='construction minutes', ax=ax)
ax.set_xlabel('partial correlation cutoff')

In [ ]:
# 'construction minutes' against 'edges', points coloured by 'nodes';
# the figure is written to a 600 dpi PNG in the working directory.
fig, ax = plt.subplots(figsize=(5, 2))
dbc.summary.plot.scatter(
    x='edges', y='construction minutes', c='nodes', ax=ax)
ax.set_ylabel('database construction\ntime (minutes)')
fig.tight_layout()
fig.savefig('161213_db_construction_linear_with_edges.png', dpi=600)

In [ ]:
# 'nodes' against 'construction minutes', points coloured by 'cutoff'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='construction minutes', y='nodes', c='cutoff', ax=ax)
ax.set_xlabel('database construction time (minutes)')

In [ ]:
# Re-derive and display the sorted cutoffs from the file names.
# Raw-string pattern: `\d` is passed to the regex engine as a digit class
# (not an invalid Python string escape) and the escaped `\.` only matches a
# literal decimal point instead of any character.
sorted(float(re.search(r'50M_(\d+\.\d*)', f).group(1)) for f in files)

In [ ]:
# '# connected components' against 'cutoff', points coloured by 'density'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='cutoff', y='# connected components', c='density', ax=ax)
ax.set_xlabel('partial correlation cutoff')

In [ ]:
# 'total # genes in connected components' against 'cutoff',
# points coloured by '# connected components'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='cutoff',
    y='total # genes in connected components',
    c='# connected components',
    ax=ax)
ax.set_xlabel('partial correlation cutoff')

In [ ]:
# '# connected components' against 'nodes', points coloured by 'density'.
# No explicit x label is set in this cell.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='nodes', y='# connected components', c='density', ax=ax)

In [ ]:
# '# connected components' against 'nodes', points coloured by 'cutoff'.
# No explicit x label is set in this cell.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='nodes', y='# connected components', c='cutoff', ax=ax)

In [ ]:
# '# connected components' against 'cutoff', points coloured by 'nodes'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='cutoff', y='# connected components', c='nodes', ax=ax)
ax.set_xlabel('partial correlation cutoff')

In [ ]:
# '# cc with multiple species' against 'cutoff', points coloured by
# '# connected components'; the figure is written to a 600 dpi PNG in the
# working directory.
fig, ax = plt.subplots(figsize=(5, 2.5))
dbc.summary.plot.scatter(
    x='cutoff',
    y='# cc with multiple species',
    c='# connected components',
    ax=ax)
ax.set_xlabel('partial correlation cutoff')
ax.set_ylabel('# of connected components\nwith multiple species')
fig.tight_layout()
fig.savefig('161213_cc_vs_pcor_cutoff.png', dpi=600)

In [ ]:
# 'frac cc with multiple species' against 'cutoff',
# points coloured by '# connected components'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='cutoff',
    y='frac cc with multiple species',
    c='# connected components',
    ax=ax)
ax.set_xlabel('partial correlation cutoff')

In [ ]:
# Call histogram_of_nodes() on the connected components of every database
# held by the comparison. The key `n` is not used in this loop.
for n, d in dbc.databases.items():
    d.connected_components.histogram_of_nodes()

In [ ]:
# Call histogram_of_species() on the connected components of every database
# held by the comparison. The key `n` is not used in this loop.
for n, d in dbc.databases.items():
    d.connected_components.histogram_of_species()

In [ ]:
# Look up the connected components for cutoff 0.0125 via cc_by_cutoff_num()
# and call histogram_of_species() on them; the return value is kept in `p1`.
p1 = dbc.cc_by_cutoff_num(0.0125).histogram_of_species()

In [ ]:
# Same call for cutoff 0.02.
# NOTE(review): this rebinds `p1`, replacing the value stored by the 0.0125 cell.
p1 = dbc.cc_by_cutoff_num(0.02).histogram_of_species()

In [ ]:
# Same call for cutoff 0.025; the return value is kept in `p2`.
p2 = dbc.cc_by_cutoff_num(0.025).histogram_of_species()

In [ ]:
# Call heatmap_of_organisms_appearance_in_components() on the connected
# components of every database. This is best-effort: a failure for one database
# is reported and the loop moves on to the next one.
for n, d in dbc.databases.items():
    try:
        d.connected_components.heatmap_of_organisms_appearance_in_components()
    except Exception as err:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
        # SystemExit still stop the loop; the error itself is shown so the
        # cause of the failure is not lost.
        print(d.cutoff)
        print('failed for {}'.format(n))
        print('  reason: {!r}'.format(err))

In [ ]: