In [ ]:
import os
import pandas as pd
import re
import subprocess
import sys
import matplotlib as mpl
mpl.use('Agg') 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

%matplotlib inline

In [ ]:
sys.path.append('../code/')

from database_comparisons import Database, DatabaseComparison
from connected_component import ConnectedComponentsDB

In [ ]:
# Java command used by this notebook.
# On Waffle, until the default Java is set to 8, use this instead:
#java="/usr/lib/jvm/java-8-oracle/jre/bin/java"
# On badger, the plain `java` command is used.
java = 'java'

In [ ]:
# Print the working directory; the relative paths used in this notebook
# ('../code/', '../jars/', '../data_mining_Neo4j_v2_3_2/') are resolved against it.
! pwd

In [ ]:
# List the available jar files and the contents of the Neo4j databases directory.
! ls -l ../jars/*.jar
! ls -l ../data_mining_Neo4j_v2_3_2/databases/

In [ ]:
# Keep only directory entries whose name contains both 'db_50M_' and '.tsv'.
files = [
    name
    for name in os.listdir('../data_mining_Neo4j_v2_3_2/databases')
    if 'db_50M_' in name and '.tsv' in name
]
files

In [ ]:
# Seaborn clustermap demo on the "flights" example dataset.
# (seaborn is already imported as `sns` in the notebook's import cell.)
sns.set()
flights_orig = sns.load_dataset("flights")
# pandas >= 2.0 only accepts pivot() arguments by keyword; the keyword form
# also works on older pandas releases.
flights = flights_orig.pivot(index="month", columns="year", values="passengers")
g = sns.clustermap(flights)

In [ ]:
# Preview the first rows of the un-pivoted flights table.
flights_orig.head(3)

In [ ]:
# Preview the month-by-year passenger table. pivot() arguments are passed by
# keyword because pandas >= 2.0 no longer accepts them positionally.
flights_orig.pivot(index="month", columns="year", values="passengers").head(3)

In [ ]:
# Connected-components database object used by the cells below
# (cutoff 0.02, description string '50M').
ccdb = ConnectedComponentsDB(
    cutoff=0.02,
    desc_string='50M',
)

In [ ]:
def heatmap_of_organisms_appearance_in_components(self):
    """Count how often each organism occurs in each connected component.

    Reads ``self.node_df``, which must provide the columns
    ``ConnectedComponents`` and ``organism``.

    Returns
    -------
    pandas.DataFrame
        One row per (component, organism) pair, indexed by organism name
        with every `` (UID<digits>)`` tag removed from the name.  The
        columns are ``ConnectedComponents`` and ``count(organism)``.
    """
    counts = (
        self.node_df
        .groupby(['ConnectedComponents', 'organism'])['organism']
        .count()
        .rename('count(organism)')
        # Move only the component level into a column; organism stays the index.
        .reset_index(level='ConnectedComponents')
    )
    # regex=True is required: pandas >= 2.0 treats the pattern as a literal
    # string by default, which would leave the UID tags in place.  The raw
    # string avoids the invalid "\(" escape sequence.
    counts.index = counts.index.str.replace(
        r'[ ]+\(UID[0-9]+\)', '', regex=True)
    return counts

In [ ]:
#heatmap_of_organisms_appearance_in_components(ccdb) #.index.str.replace('[ ]+\(UID[0-9]+\)', '')

In [ ]:
# Preview a cubehelix palette (start=2.8, rot=.1) spanning light=1 to dark=0.
sns.palplot(sns.cubehelix_palette(start=2.8, rot=.1, light=1, dark=0))

In [ ]:
sns.palplot(sns.cubehelix_palette(start=2.8, rot=.01, light=1, dark=0))

# The statements below were collapsed onto a single line (a syntax error, with
# everything after the first '#' swallowed as a comment); they are restored
# one per line here.
d = pd.DataFrame(heatmap_of_organisms_appearance_in_components(ccdb))
# Wide table: one row per organism, one column per connected component,
# with 0 where an organism does not occur in a component.
dp = d.pivot(columns='ConnectedComponents', values='count(organism)').fillna(0)
#dp #.min()
pal = sns.cubehelix_palette(start=2.8, rot=.1, light=1, dark=0, as_cmap=True)
#sns.dark_palette("palegreen", as_cmap=True)
g = sns.clustermap(dp, cmap=pal) #.reset_index()
# Keep row labels horizontal and column labels vertical.
plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0)
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90)
g
#plt.setp(dp.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

In [ ]:
# Call the ConnectedComponentsDB method that shares its name with the function
# defined earlier in this notebook. Presumably returns a figure-like object
# (savefig is called on `p` in the next cell) -- TODO confirm.
p = ccdb.heatmap_of_organisms_appearance_in_components()

In [ ]:
# Write the heatmap to '<fname>.pdf'; the PNG export at `dpi` stays disabled.
fname = './161213_heatmap_cutof_0.02'
p.savefig('{}.pdf'.format(fname))
dpi = 300
#p.savefig(fname + '_{}_dpi.png'.format(dpi), dpi=dpi)

In [ ]:
# Call the ConnectedComponentsDB histogram method with a 4x2 figsize.
# Presumably it returns a figure (see the disabled savefig below) -- TODO confirm.
hs = ccdb.histogram_of_species(figsize=(4, 2))
# Set the y-axis label of the current axes to a two-line string.
plt.ylabel('# of connected\ncomponents')
#hs.savefig('161213_hist_species_0.02.png', dpi=600)

In [ ]:
def histogram_of_species(self, figsize=(5, 3)):
    """Plot a histogram of the number of distinct species per connected component.

    Reads ``self.cutoff`` (only for the printed message) and ``self.node_df``,
    which must provide the columns ``ConnectedComponents`` and ``organism``.

    Parameters
    ----------
    figsize : tuple, optional
        Figure size passed to ``plt.subplots``.  Defaults to ``(5, 3)``,
        the size that was previously hard-coded.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the histogram.  If ``node_df`` yields no
        components, the axes are labelled but left empty.
    """
    print('Cutoff {}: plot # of different species for each connected component.'
          ''.format(self.cutoff))
    fig, ax = plt.subplots(1, 1, figsize=figsize)
    species_per_component = (
        self.node_df.groupby('ConnectedComponents')['organism'].nunique()
    )
    # With no components there is no maximum to derive a bin count from
    # (max() would be NaN), so the histogram is skipped.
    if not species_per_component.empty:
        # One bin per possible species count; at least one bin even when
        # every organism value is missing (nunique() == 0).
        n_bins = max(int(species_per_component.max()), 1)
        if n_bins > 50:
            # Halve the bin count for very wide distributions.
            n_bins = n_bins // 2
        species_per_component.plot.hist(bins=n_bins, ax=ax)
    ax.set_xlabel('# species in connected component')
    ax.set_ylabel('# of connected components')
    # Lay out this figure explicitly rather than whichever figure is current.
    fig.tight_layout()
    return fig

In [ ]:
# Stacked-bar demo on a 10x10 table of |N(0, 1)| samples: first with the
# default colours, then with ten colours sampled evenly from the GnBu colormap.
df = pd.DataFrame(
    np.abs(np.random.randn(10, 10)),
    columns=list('ABCDEFGHIJ'),
    index=range(10),
)
df.plot.bar(stacked=True, figsize=(10, 5))
colors = plt.cm.GnBu(np.linspace(0, 1, 10))
df.plot.bar(stacked=True, figsize=(10, 5), color=colors)

In [ ]: