In [ ]:
from collections import Counter
import glob
import pandas as pd
import re
import subprocess
import matplotlib as mpl
mpl.use('Agg') 
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

Analyze one TSV (tab-separated) file:


In [ ]:
# Collect the connected-component TSV exports.
# glob yields paths in arbitrary, filesystem-dependent order; sort them so that
# `tsv_files[0]` (read in a later cell) refers to the same file on every run.
tsv_files = sorted(glob.glob('../data_mining_Neo4j_v2_3_2/databases/*.tsv'))
for filename in tsv_files:
    print(filename)

In [ ]:
# Load the first discovered TSV, keeping only the columns at positions 1-4
# (the column at position 0 is not read).
cc = pd.read_csv(
    tsv_files[0],
    sep='\t',
    usecols=[1, 2, 3, 4],
)

In [ ]:
# Preview the first rows of the table loaded into `cc`.
cc.head()

In [ ]:
def assess_connected_components_tsv(tsv):
    """Summarize one connected-components table.

    Parameters
    ----------
    tsv : pandas.DataFrame
        Table that has an ``organism`` column and a ``ConnectedComponents``
        column.

    Returns
    -------
    dict
        ``'# nodes in cc'``     -- number of rows in ``tsv``;
        ``'# organisms in cc'`` -- number of distinct ``organism`` values;
        ``'# of components'``   -- number of distinct ``ConnectedComponents``
        values;
        ``'organism counts'``   -- dict mapping each organism to its row count.

    The number of distinct components is also printed.
    """
    organism_column = tsv['organism']
    distinct_components = tsv['ConnectedComponents'].unique()

    component_total = len(distinct_components)
    print("num unique connected components: {}".format(component_total))

    # Key order is kept stable: it determines column order when the dict is
    # turned into a DataFrame.
    return {
        '# nodes in cc': tsv.shape[0],
        '# organisms in cc': len(organism_column.unique()),
        '# of components': component_total,
        'organism counts': dict(Counter(organism_column)),
    }

In [ ]:
def assess_connected_components_tsvs(cc_files):
    """Build a one-row-per-file summary of connected-component TSV files.

    Each file is read with ``pd.read_csv`` (tab-separated, columns at positions
    1-4), summarized with ``assess_connected_components_tsv``, and tagged with
    its file name in a ``'file'`` column.

    Parameters
    ----------
    cc_files : iterable of path-like
        Paths of the TSV files to summarize.

    Returns
    -------
    pandas.DataFrame
        One row per input file (every row keeps index label 0, as produced by
        concatenating one-row frames). An empty DataFrame if ``cc_files`` is
        empty.

    Progress information and the final summary are printed as a side effect.
    """
    rows = []
    for cc_path in cc_files:
        tsv = pd.read_csv(cc_path, usecols=[1, 2, 3, 4], sep='\t')
        print(tsv.shape)
        info_dict = assess_connected_components_tsv(tsv)

        # get file name; if the path does not follow the
        # `/db_binary_<number>.tsv` pattern, fall back to the last path
        # component instead of crashing on a failed match.
        path_text = str(cc_path)
        m = re.search(r'/(db_binary_[.0-9]+\.tsv)', path_text)
        if m:
            info_dict['file'] = m.group(1)
        else:
            info_dict['file'] = path_text.replace('\\', '/').rsplit('/', 1)[-1]

        # get ready for Pandas: wrap every value in a list so that non-scalar
        # values (the organism-count dict) land in a single cell.
        info_dict = {k: [v] for k, v in info_dict.items()}
        print(info_dict)
        rows.append(pd.DataFrame(info_dict))
        # Every row has the same keys, so this equals the shape of the
        # summary accumulated so far.
        print('summary shape: {}'.format((len(rows), len(info_dict))))

    # Concatenate once at the end rather than growing a DataFrame per file.
    summary = pd.concat(rows, axis=0) if rows else pd.DataFrame()
    print(summary)
    print(summary.shape[0])
    return summary

In [ ]:
# Summarize every discovered TSV file; the result has one row per file.
assess_connected_components_tsvs(tsv_files)

In [ ]:
def assess_sub_graphs(cc_files):
    """Build a one-row-per-component summary across connected-component TSVs.

    Each file is read with ``pd.read_csv`` (tab-separated, columns at positions
    1-4) and must provide ``ConnectedComponents`` and ``organism`` columns.

    Parameters
    ----------
    cc_files : iterable of path-like
        Paths of the TSV files to analyze.

    Returns
    -------
    pandas.DataFrame
        One row per (file, component) with the columns
        ``'ConnectedComponent'`` (component value),
        ``'Cutoff'`` (number parsed from a ``db_binary_<number>.tsv`` file
        name, or ``None`` when it cannot be parsed -- presumably the cutoff
        used to build the file; TODO confirm),
        ``'nodes'`` (rows belonging to the component),
        ``'species counts'`` (dict organism -> row count within the component),
        ``'cross-species'`` (True when more than one organism is present) and
        ``'file'`` (file name).
        Every row keeps index label 0. An empty DataFrame is returned when
        there is nothing to summarize.

    Per-component details and the final summary are printed as a side effect.
    """
    rows = []
    for cc_file in cc_files:

        tsv = pd.read_csv(cc_file, usecols=[1, 2, 3, 4], sep='\t')

        # File-level values are computed once per file, not once per component.
        # If the path does not follow `/db_binary_<number>.tsv`, fall back to
        # the last path component instead of crashing on a failed match.
        path_text = str(cc_file)
        m = re.search(r'/(db_binary_([.0-9]+)\.tsv)', path_text)
        if m:
            file_name = m.group(1)
            try:
                cutoff = float(m.group(2))
            except ValueError:
                # e.g. "1.2.3" -- not a single number
                cutoff = None
        else:
            file_name = path_text.replace('\\', '/').rsplit('/', 1)[-1]
            cutoff = None

        # Counter keeps components in order of first appearance.
        for c in Counter(tsv['ConnectedComponents']):
            print(c)
            nodes = tsv[tsv['ConnectedComponents'] == c]
            # Count organisms among this component's rows only; counting over
            # the whole table would give every component identical counts.
            species_counts = dict(Counter(nodes['organism']))

            c_info = {
                'ConnectedComponent': [c],
                'Cutoff': [cutoff],
                'nodes': [nodes.shape[0]],
                'species counts': [species_counts],
                'cross-species': [len(species_counts) > 1],
                'file': [file_name],
            }
            print(c_info)
            rows.append(pd.DataFrame(c_info))

        n_cols = len(rows[0].columns) if rows else 0
        print('summary shape: {}'.format((len(rows), n_cols)))

    # Concatenate once at the end rather than growing a DataFrame per row.
    summary = pd.concat(rows, axis=0) if rows else pd.DataFrame()
    print(summary)
    print(summary.shape[0])
    return summary

In [ ]:
# Per-component summary across all discovered TSV files.
connected_components = assess_sub_graphs(tsv_files)

In [ ]:
# Preview the first rows of the per-component summary.
connected_components.head()

In [ ]:
# NOTE(review): the original call was left unfinished (`x=` with no value), which
# is a syntax error. Component id vs. node count is a guess at the intended axes,
# chosen from the columns assess_sub_graphs produces -- TODO confirm. Both
# columns must be numeric for a scatter plot.
connected_components.plot.scatter(x='ConnectedComponent', y='nodes')

In [ ]:
# Summary dict for the single table loaded into `cc`.
assess_connected_components_tsv(cc)

In [ ]:
# Render the same summary dict as a DataFrame.
pd.DataFrame.from_dict(assess_connected_components_tsv(cc))

In [ ]:
def assess_component(cc):
    """Placeholder for metrics on a single connected component.

    Planned output: counts of each organism, and possibly an entropy measure.
    Not implemented yet; ``cc`` is ignored and ``None`` is returned.
    """
    return None

In [ ]: