In [ ]:
import os
import pandas as pd
import re
import subprocess
import sys
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline
In [ ]:
sys.path.append('../code/')
from database_comparisons import Database, DatabaseComparison
from connected_component import ConnectedComponentsDB
In [ ]:
# Java executable to use.
# On Waffle, until the default Java is version 8, point at the Java 8 binary:
#   java = "/usr/lib/jvm/java-8-oracle/jre/bin/java"
# On badger the plain `java` command is enough.
java = 'java'
In [ ]:
# Shell escape: print the notebook's current working directory.
! pwd
In [ ]:
# Shell escape: long listing of the .jar files under ../jars.
! ls -l ../jars/*.jar
In [ ]:
# Keep only directory entries whose names contain both 'db_50M_' and '.tsv'.
files = [
    entry
    for entry in os.listdir('../data_mining_Neo4j_v2_3_2/databases')
    if 'db_50M_' in entry and '.tsv' in entry
]
files
In [ ]:
# Seaborn demo: cluster the example "flights" dataset as a month x year matrix.
import seaborn as sns

sns.set()
flights_orig = sns.load_dataset("flights")
# DataFrame.pivot only accepts keyword arguments since pandas 2.0, so the
# index/columns/values roles are spelled out instead of passed positionally.
flights = flights_orig.pivot(index="month", columns="year", values="passengers")
g = sns.clustermap(flights)
In [ ]:
# Show the first three rows of the flights table as loaded (before pivoting).
flights_orig.head(3)
In [ ]:
# Preview the pivoted table: months as rows, years as columns, passenger
# counts as values. Keyword arguments are required by pandas >= 2.0.
flights_orig.pivot(index="month", columns="year", values="passengers").head(3)
In [ ]:
# Build the project's ConnectedComponentsDB with cutoff 0.02 and the '50M'
# description string; the cells below call its plotting methods.
ccdb = ConnectedComponentsDB(cutoff=0.02, desc_string='50M')
In [ ]:
def heatmap_of_organisms_appearance_in_components(self):
    """Count rows per (connected component, organism) pair.

    Despite the name, nothing is plotted here: the function only builds the
    count table.

    Reads ``self.node_df``, which must provide the columns
    ``ConnectedComponents`` and ``organism``.

    Returns:
        A DataFrame indexed by organism name, with any `` (UID<digits>)``
        tag removed from the name, and two columns:
        ``ConnectedComponents`` and ``count(organism)``.
    """
    counts = (
        self.node_df
        .groupby(['ConnectedComponents', 'organism'])['organism']
        .count()
        .to_frame('count(organism)')
        .reset_index('ConnectedComponents')
    )
    # Raw string avoids the invalid "\(" escape, and regex=True is explicit
    # because pandas >= 2.0 treats the pattern as a literal by default, which
    # would silently leave the "(UID...)" tags in place.
    counts.index = counts.index.str.replace(
        r'[ ]+\(UID[0-9]+\)', '', regex=True)
    return counts
In [ ]:
#heatmap_of_organisms_appearance_in_components(ccdb) #.index.str.replace('[ ]+\(UID[0-9]+\)', '')
In [ ]:
# Preview a cubehelix palette with start=2.8 and rot=.1, light=1 to dark=0.
sns.palplot(sns.cubehelix_palette(start=2.8, rot=.1, light=1, dark=0))
In [ ]:
# Same palette preview with a much smaller rotation (rot=.01) for comparison.
sns.palplot(sns.cubehelix_palette(start=2.8, rot=.01, light=1, dark=0))
In [ ]:
# Calls the ConnectedComponentsDB method, not the notebook-level function of
# the same name defined above. The result is later saved with p.savefig, so it
# is presumably a figure-like object -- confirm against the class.
p = ccdb.heatmap_of_organisms_appearance_in_components()
In [ ]:
# Save the heatmap as a PDF next to the notebook; the PNG export at the given
# dpi is left disabled.
dpi = 300
fname = './161213_heatmap_cutof_0.02'
p.savefig(fname + '.pdf')
#p.savefig(fname + '_{}_dpi.png'.format(dpi), dpi=dpi)
In [ ]:
# Draw the species histogram at a 4x2 figure size, then replace the y-axis
# label of pyplot's current axes with a two-line version.
hs = ccdb.histogram_of_species(figsize=(4, 2))
plt.ylabel('# of connected\ncomponents')
# Disabled: export of the histogram as a 600-dpi PNG.
#hs.savefig('161213_hist_species_0.02.png', dpi=600)
In [ ]:
def histogram_of_species(self, figsize=(5, 3)):
    """Plot how many distinct organisms each connected component contains.

    Reads ``self.node_df`` (columns ``ConnectedComponents`` and ``organism``)
    and ``self.cutoff``, which is only used in the printed message.

    Args:
        figsize: ``(width, height)`` forwarded to ``plt.subplots``. The
            default is the previously hard-coded ``(5, 3)``.

    Returns:
        The matplotlib figure holding the histogram. If ``self.node_df``
        yields no components, the axes are labelled but left empty.
    """
    print('Cutoff {}: plot # of different species for each connected component.'
          ''.format(self.cutoff))
    fig, ax = plt.subplots(1, 1, figsize=figsize)
    species_per_component = (
        self.node_df.groupby('ConnectedComponents')['organism'].nunique())
    # An empty series has a NaN maximum, which is not a valid bin count.
    if not species_per_component.empty:
        # One bin per possible species count, halved once there are many.
        # Cast to a plain int and keep at least one bin so the histogram
        # call always receives a valid value.
        n_bins = int(species_per_component.max())
        if n_bins > 50:
            n_bins //= 2
        species_per_component.plot.hist(bins=max(n_bins, 1), ax=ax)
    ax.set_xlabel('# species in connected component')
    ax.set_ylabel('# of connected components')
    # Lay out this figure explicitly rather than pyplot's current figure.
    fig.tight_layout()
    return fig
In [ ]:
# Demo: stacked bar charts of random non-negative data. The first call passes
# no colours; the second uses ten colours sampled from the GnBu colormap.
df = pd.DataFrame(
    np.abs(np.random.randn(10, 10)),
    index=range(10),
    columns=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
)
df.plot(figsize=(10, 5), stacked=True, kind='bar')
colors = plt.cm.GnBu(np.linspace(0, 1, 10))
df.plot(
    kind='bar',
    stacked=True,
    figsize=(10, 5),
    color=colors,
)
In [ ]: