In [ ]:
import os
import pandas as pd
import re
import subprocess
import sys
import matplotlib as mpl
mpl.use('Agg') 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

%matplotlib inline

In [ ]:
# Add ../code/ (relative to the working directory) to the import search path.
# The two modules imported below presumably live there — confirm.
sys.path.append('../code/')

from database_comparisons import Database, DatabaseComparison
from connected_component import ConnectedComponentsDB

In [ ]:
# Java executable setting. Two alternatives are recorded, one per machine:
# for Waffle, until the default Java is set to 8:
#java="/usr/lib/jvm/java-8-oracle/jre/bin/java"
# for badger, set java = 'java'
# NOTE(review): `java` is not referenced again in the visible cells of this
# notebook; presumably it is consumed elsewhere — confirm it is still needed.
java='java'

In [ ]:
# Show the working directory; the relative '../' paths used in this notebook depend on it.
! pwd

In [ ]:
# List the jar files and the contents of the databases directory.
! ls -l ../jars/*.jar
! ls -l ../data_mining_Neo4j_v2_3_2/databases/

In [ ]:
# Collect the directory entries whose names contain both 'db_50M_' and '.tsv'.
db_dir_entries = os.listdir('../data_mining_Neo4j_v2_3_2/databases')
files = [
    entry
    for entry in db_dir_entries
    if 'db_50M_' in entry and '.tsv' in entry
]
files

In [ ]:
# Three largest numeric values embedded after '50M_' in the selected file names
# (presumably the cutoffs the databases were built with — confirm).
# The pattern is a raw string so that `\d` is a regex escape rather than an
# invalid Python string escape, and the decimal point is escaped so it only
# matches a literal '.' instead of any character.
sorted(
    (float(re.search(r'50M_(\d+\.\d*)', f).group(1)) for f in files),
    reverse=True,
)[:3]

In [ ]:
# Build the connected-components object for the '50M' description string at cutoff 0.015.
# NOTE(review): what the constructor loads or computes is not visible here — confirm cost.
ccdb = ConnectedComponentsDB(cutoff=0.015, desc_string='50M')

In [ ]:
# Display node_df ordered by its 'ConnectedComponents' column.
ccdb.node_df.sort_values(by='ConnectedComponents')

In [ ]:
# Number of distinct 'organism' values within each 'ConnectedComponents' group.
ccdb.node_df.groupby('ConnectedComponents')['organism'].nunique()

In [ ]:
# Peek at the first two rows of node_df.
ccdb.node_df.head(2)

In [ ]:
# Histogram of the number of distinct organisms found in each connected component.
fig, ax = plt.subplots(1, 1, figsize=(5,3))
#plt.yscale('log', nonposy='clip')
plot_series = ccdb.node_df.groupby('ConnectedComponents')['organism'].nunique()
# Positional access: the series is indexed by the group key, so `plot_series[0]`
# would be a label lookup that fails whenever no component is labelled 0.
print(type(plot_series.iloc[0]))
# One bin per possible organism count; cast so the bin count is a plain int.
n_bins = int(plot_series.max())
plot_series.plot.hist(bins=n_bins, ax=ax)

In [ ]:
# Histogram of connected-component sizes (rows of node_df per component).
fig, ax = plt.subplots(1, 1, figsize=(5,3))
#plt.yscale('log', nonposy='clip')
plot_series=ccdb.node_df.groupby('ConnectedComponents')['ConnectedComponents'].count()
n_bins = int(plot_series.max())
# Positional access: the series is indexed by the group key, so `plot_series[0]`
# would be a label lookup that fails whenever no component is labelled 0.
print(type(plot_series.iloc[0]))
# The bin count must be an integer: `n_bins/2` is a float under true division.
# Use floor division and keep at least one bin when the largest component is tiny.
plot_series.plot.hist(bins=max(1, n_bins // 2), ax=ax)

In [ ]:
# For every component, print the sum of the values held in its Counter.
for component in ccdb.components.values():
    counter_total = sum(component.Counter.values())
    print(counter_total)

In [ ]:
def histogram_of_species(self):
    """Placeholder that accepts one argument, does nothing and returns None.

    NOTE(review): a method with the same name is called on
    `d.connected_components` later in this notebook; presumably this stub
    was a scratch prototype for it — confirm before removing.
    """
    return None
    
# Call the notebook-level stub defined in this cell; its body is `pass`, so this has no effect.
histogram_of_species(ccdb)

In [ ]:
# Print the Counter of the component stored under key 0.
print(ccdb.components[0].Counter)

In [ ]:
# Create a DatabaseComparison for the '50M' description string and build
# databases for the two listed values.
# NOTE(review): the values are presumably cutoffs, and make_dbs may be
# long-running — confirm against the DatabaseComparison implementation.
dbc = DatabaseComparison(desc_string='50M')
dbc.make_dbs([0.06, 0.0125])

In [ ]:
# Display the container of databases held by the comparison object.
dbc.databases

In [ ]:
#dbc.databases[2].connected_components.components

In [ ]:
# For each database: print its number of connected components, then print
# whatever print_cross_species_connected_component_summaries() returns.
for n, d in dbc.databases.items():
    components = d.connected_components
    print(components.num_components)
    print(components.print_cross_species_connected_component_summaries())

In [ ]:
# Call histogram_of_species() on the connected components of every database.
# The key `n` is not used inside the loop.
for n, d in dbc.databases.items():
    d.connected_components.histogram_of_species()

In [ ]:
# NOTE(review): unconditional failure. Under "Run All" execution stops here, so
# the cells below only run when executed manually. Presumably a deliberate
# stop marker — confirm, and remove it if the notebook should run end to end.
assert False

In [ ]:
# Display the comparison summary (used as a DataFrame by the plotting cells below).
dbc.summary

In [ ]:
# Display the container of databases again (same expression as an earlier cell).
dbc.databases

In [ ]:
# Plot database construction time against number of nodes, passing logx=True.
p = dbc.plot_db_construction_time_vs_n_nodes(logx=True)

In [ ]:
# Plot database construction time against cutoff.
p = dbc.plot_db_construction_time_vs_cutoff()

In [ ]:
# NOTE(review): identical to the construction-time-vs-nodes call two cells earlier;
# presumably a leftover duplicate — confirm and drop one of them.
p = dbc.plot_db_construction_time_vs_n_nodes(logx=True)

In [ ]:
# Plot density against nodes, passing logx=True.
p = dbc.plot_density_vs_nodes(logx=True)

In [ ]:
# Generic plot of 'nodes' against 'cutoff' in black, with logy=True and a small figure size.
p = dbc.plot_base(x='cutoff', y='nodes', color='k', logy=True, figsize=(4.5, 2.5))

In [ ]:
# List the summary's column names; the plotting cells refer to columns by these names.
dbc.summary.columns

In [ ]:
# Generic plot of 'construction seconds' against 'cutoff' in black, with logy=True.
p = dbc.plot_base(x='cutoff', y='construction seconds', color='k', logy=True, figsize=(4.5, 2.5))

In [ ]:
# Generic plot of 'construction minutes' against 'cutoff' in black, with logy=False.
p = dbc.plot_base(x='cutoff', y='construction minutes', color='k', logy=False, figsize=(4.5, 2.5))

In [ ]:
# Peek at the first two rows of the summary.
dbc.summary.head(2)

In [ ]:
# Scatter of 'nodes' against 'cutoff', coloured by 'construction minutes'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='cutoff',
    y='nodes',
    c='construction minutes',
    ax=ax,
)
ax.set_xlabel('partial correlation cutoff')

In [ ]:
# Scatter of 'nodes' against 'construction minutes', coloured by 'cutoff'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='construction minutes',
    y='nodes',
    c='cutoff',
    ax=ax,
)
ax.set_xlabel('database construction time (minutes)')

In [ ]:
# Scatter of 'connected components' against 'cutoff', coloured by 'density'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='cutoff',
    y='connected components',
    c='density',
    ax=ax,
)
ax.set_xlabel('partial correlation cutoff')

In [ ]:
# Scatter of 'connected components' against 'nodes', coloured by 'density'.
# The x-axis label is left at whatever the plotting call sets.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='nodes',
    y='connected components',
    c='density',
    ax=ax,
)

In [ ]:
# Scatter of 'connected components' against 'nodes', coloured by 'cutoff'.
# The x-axis label is left at whatever the plotting call sets.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='nodes',
    y='connected components',
    c='cutoff',
    ax=ax,
)

In [ ]:
# Scatter of 'connected components' against 'cutoff', coloured by 'nodes'.
fig, ax = plt.subplots(figsize=(5, 3))
dbc.summary.plot.scatter(
    x='cutoff',
    y='connected components',
    c='nodes',
    ax=ax,
)
ax.set_xlabel('partial correlation cutoff')

In [ ]: