In [ ]:
import os
import pandas as pd
import re
import subprocess
import sys
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline
In [ ]:
sys.path.append('../code/')
from database_comparisons import Database, DatabaseComparison
In [ ]:
# Java executable name/path.
# NOTE(review): `java` is not referenced in any of the visible cells; presumably
# it is consumed by code outside this view -- confirm before removing.
# On Waffle (presumably a host name), until the default Java there is set to 8, use:
#java="/usr/lib/jvm/java-8-oracle/jre/bin/java"
# On badger (presumably another host), set java = 'java':
java='java'
In [ ]:
# Show the working directory; the relative '../' paths used below depend on it.
! pwd
In [ ]:
# List the jar files under ../jars/.
! ls -l ../jars/*.jar
In [ ]:
# List the databases directory that is scanned for 'db_50M_*.tsv' files below.
! ls -l ../data_mining_Neo4j_v2_3_2/databases/
In [ ]:
# Collect the names in the databases directory that contain both 'db_50M_'
# and '.tsv', then display them.
files = [
    name
    for name in os.listdir('../data_mining_Neo4j_v2_3_2/databases')
    if 'db_50M_' in name and '.tsv' in name
]
files
In [ ]:
# Parse the numeric cutoff that follows '50M_' in each file name and sort the
# values ascending. The pattern is a raw string (a plain '\d' is an invalid
# escape sequence) and the decimal point is escaped so it only matches a
# literal '.', not any character.
cutoffs = sorted(float(re.search(r'50M_(\d+\.\d*)', f).group(1)) for f in files)
In [ ]:
# Display the sorted cutoff values parsed from the file names.
cutoffs
In [ ]:
# Create a DatabaseComparison labelled '50M' and call make_dbs with the cutoffs.
# NOTE(review): make_dbs is defined in database_comparisons (not visible here);
# presumably it builds or loads one database per cutoff -- confirm.
dbc = DatabaseComparison(desc_string='50M')
dbc.make_dbs(cutoffs)
In [ ]:
# Inspect the summary table; later cells use it like a pandas DataFrame
# (.columns, .head, .plot.scatter).
dbc.summary
In [ ]:
# Inspect the databases collection; later cells iterate it with .items().
dbc.databases
In [ ]:
# The plotting helpers below are defined in database_comparisons (not visible
# here); the descriptions are read off the method names and arguments.
# Construction time vs. number of nodes, with logx enabled.
p = dbc.plot_db_construction_time_vs_n_nodes(logx=True)
In [ ]:
# Construction time vs. cutoff.
p = dbc.plot_db_construction_time_vs_cutoff()
In [ ]:
# NOTE(review): identical to the call two cells above -- presumably a leftover re-run.
p = dbc.plot_db_construction_time_vs_n_nodes(logx=True)
In [ ]:
# Density vs. number of nodes, with logx enabled.
p = dbc.plot_density_vs_nodes(logx=True)
In [ ]:
# Generic helper: 'nodes' against 'cutoff', colour 'k', with logy enabled.
p = dbc.plot_base(x='cutoff', y='nodes', color='k', logy=True, figsize=(4.5, 2.5))
In [ ]:
# Show the summary column names that the plotting cells pass as x / y / c.
dbc.summary.columns
In [ ]:
# 'construction seconds' against 'cutoff', with logy enabled.
p = dbc.plot_base(x='cutoff', y='construction seconds', color='k', logy=True, figsize=(4.5, 2.5))
In [ ]:
# 'construction minutes' against 'cutoff', with logy disabled.
p = dbc.plot_base(x='cutoff', y='construction minutes', color='k', logy=False, figsize=(4.5, 2.5))
In [ ]:
# Peek at the first two rows of the summary table.
dbc.summary.head(2)
In [ ]:
# Scatter 'nodes' against 'cutoff', coloured by 'construction minutes'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(ax=ax, x='cutoff', y='nodes', c='construction minutes')
ax.set_xlabel('partial correlation cutoff')
In [ ]:
# Scatter 'construction minutes' against 'edges', coloured by 'nodes',
# and save the figure as a 600 dpi PNG.
fig, ax = plt.subplots(figsize=(5, 2))
dbc.summary.plot.scatter(ax=ax, x='edges', y='construction minutes', c='nodes')
ax.set_ylabel('database construction\ntime (minutes)')
fig.tight_layout()
fig.savefig('161213_db_construction_linear_with_edges.png', dpi=600)
In [ ]:
# Scatter 'nodes' against 'construction minutes', coloured by 'cutoff'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(ax=ax, x='construction minutes', y='nodes', c='cutoff')
ax.set_xlabel('database construction time (minutes)')
In [ ]:
# Re-derive and display the sorted cutoffs parsed from the file names.
# Raw-string pattern with an escaped decimal point, so '.' only matches a
# literal dot and '\d' is not treated as an invalid string escape.
sorted(float(re.search(r'50M_(\d+\.\d*)', f).group(1)) for f in files)
In [ ]:
# Scatter '# connected components' against 'cutoff', coloured by 'density'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(ax=ax, x='cutoff', y='# connected components', c='density')
ax.set_xlabel('partial correlation cutoff')
In [ ]:
# Scatter 'total # genes in connected components' against 'cutoff',
# coloured by '# connected components'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    ax=ax,
    x='cutoff',
    y='total # genes in connected components',
    c='# connected components',
)
ax.set_xlabel('partial correlation cutoff')
In [ ]:
# Scatter '# connected components' against 'nodes', coloured by 'density'.
# No x label is set explicitly in this cell.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(ax=ax, x='nodes', y='# connected components', c='density')
In [ ]:
# Scatter '# connected components' against 'nodes', coloured by 'cutoff'.
# No x label is set explicitly in this cell.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(ax=ax, x='nodes', y='# connected components', c='cutoff')
In [ ]:
# Scatter '# connected components' against 'cutoff', coloured by 'nodes'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(ax=ax, x='cutoff', y='# connected components', c='nodes')
ax.set_xlabel('partial correlation cutoff')
In [ ]:
# Scatter '# cc with multiple species' against 'cutoff', coloured by
# '# connected components', and save the figure as a 600 dpi PNG.
fig, ax = plt.subplots(figsize=(5, 2.5))
dbc.summary.plot.scatter(
    ax=ax,
    x='cutoff',
    y='# cc with multiple species',
    c='# connected components',
)
ax.set_xlabel('partial correlation cutoff')
ax.set_ylabel('# of connected components\nwith multiple species')
fig.tight_layout()
fig.savefig('161213_cc_vs_pcor_cutoff.png', dpi=600)
In [ ]:
# Scatter 'frac cc with multiple species' against 'cutoff', coloured by
# '# connected components'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    ax=ax,
    x='cutoff',
    y='frac cc with multiple species',
    c='# connected components',
)
ax.set_xlabel('partial correlation cutoff')
In [ ]:
# Call histogram_of_nodes() on the connected components of every database.
# The loop body is indented here; at column 0 it is an IndentationError.
for n, d in dbc.databases.items():
    d.connected_components.histogram_of_nodes()
In [ ]:
# Call histogram_of_species() on the connected components of every database.
# The loop body is indented here; at column 0 it is an IndentationError.
for n, d in dbc.databases.items():
    d.connected_components.histogram_of_species()
In [ ]:
# Species histograms for the connected components at three specific cutoffs.
# NOTE(review): cc_by_cutoff_num is defined outside this view; presumably it
# looks up the connected components of the database built with that cutoff.
p1 = dbc.cc_by_cutoff_num(0.0125).histogram_of_species()
In [ ]:
# NOTE(review): this reassigns p1, discarding the 0.0125 result -- confirm intent.
p1 = dbc.cc_by_cutoff_num(0.02).histogram_of_species()
In [ ]:
p2 = dbc.cc_by_cutoff_num(0.025).histogram_of_species()
In [ ]:
# Best-effort: draw the organism-appearance heatmap for every database and
# keep going when one of them fails.
for n, d in dbc.databases.items():
    try:
        d.connected_components.heatmap_of_organisms_appearance_in_components()
    except Exception as err:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
        # SystemExit still stop the loop instead of being reported as a failure.
        print(d.cutoff)
        print('failed for {}'.format(n))
        # Report why it failed so the problem can be diagnosed.
        print('reason: {!r}'.format(err))
In [ ]: