In [ ]:
from collections import Counter
import glob
import pandas as pd
import re
import subprocess
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [ ]:
# Collect the connected-component TSV exports.  glob yields matches in
# arbitrary (filesystem-dependent) order, so the list is sorted to make
# positional access such as tsv_files[0] refer to the same file on every run.
tsv_files = sorted(glob.glob('../data_mining_Neo4j_v2_3_2/databases/*.tsv'))
for filename in tsv_files:
    print(filename)
In [ ]:
# Load the first discovered file as a tab-separated table, keeping only the
# columns at positions 1-4 (column 0 is skipped).
cc = pd.read_csv(tsv_files[0], sep='\t', usecols=[1, 2, 3, 4])
In [ ]:
# Preview the first rows of the table loaded into `cc`.
cc.head()
In [ ]:
def assess_connected_components_tsv(tsv):
    """Summarise one connected-components table.

    Args:
        tsv: pandas DataFrame with at least the columns ``organism`` and
            ``ConnectedComponents``; every row is counted as one node.

    Returns:
        dict with four entries:
            ``'# nodes in cc'``      number of rows in ``tsv``
            ``'# organisms in cc'``  number of distinct ``organism`` values
            ``'# of components'``    number of distinct
                                     ``ConnectedComponents`` values
            ``'organism counts'``    plain dict mapping organism -> row count

    Side effect: prints the number of distinct components.
    """
    organisms = tsv['organism']
    n_components = len(tsv['ConnectedComponents'].unique())
    print("num unique connected components: {}".format(n_components))

    return {
        '# nodes in cc': len(tsv),
        '# organisms in cc': len(organisms.unique()),
        '# of components': n_components,
        'organism counts': dict(Counter(organisms)),
    }
In [ ]:
def assess_connected_components_tsvs(cc_files):
    """Build a one-row-per-file summary of connected-component TSV files.

    Each file is read as a tab-separated table (columns at positions 1-4
    only), summarised with ``assess_connected_components_tsv`` and tagged
    with its file name in a ``'file'`` column.

    Args:
        cc_files: iterable of path strings to the TSV files.

    Returns:
        pandas DataFrame with one row per input file (an empty DataFrame when
        ``cc_files`` is empty).  Rows are concatenated without re-indexing,
        so every row keeps index label 0.

    Side effects: prints progress/debug information for every file.
    """
    # Raw string, with the dot before "tsv" escaped so it only matches a
    # literal '.'.
    name_pattern = re.compile(r'/(db_binary_[.0-9]+\.tsv)')

    summary = pd.DataFrame()
    for cc in cc_files:
        tsv = pd.read_csv(cc, usecols=[1, 2, 3, 4], sep='\t')
        print(tsv.shape)
        info_dict = assess_connected_components_tsv(tsv)

        # get file name
        m = name_pattern.search(cc)
        if m is not None:
            info_dict['file'] = m.group(1)
        else:
            # A path that does not follow the db_binary_<number>.tsv naming
            # used to crash on m.group(1); fall back to the last path
            # component instead.
            info_dict['file'] = re.split(r'[/\\]', cc)[-1]

        # get ready for Pandas: wrap each value in a list so the dict
        # describes exactly one row.
        info_dict = {k: [v] for k, v in info_dict.items()}
        print(info_dict)

        info_df_row = pd.DataFrame(info_dict)
        summary = pd.concat([summary, info_df_row], axis=0)
        print('summary shape: {}'.format(summary.shape))
        print(summary)
        print(summary.shape[0])
    return summary
In [ ]:
# Summarise every discovered TSV file and display the resulting table.
assess_connected_components_tsvs(tsv_files)
In [ ]:
def assess_sub_graphs(cc_files):
    """Build a one-row-per-component summary across TSV files.

    Each file is read as a tab-separated table (columns at positions 1-4
    only).  For every distinct value in its ``ConnectedComponents`` column a
    row is produced describing the rows (nodes) that carry that value.

    Args:
        cc_files: iterable of path strings to the TSV files.

    Returns:
        pandas DataFrame with the columns
            ``ConnectedComponent``  the component label
            ``Cutoff``              always None for now (see TODO below)
            ``nodes``               number of rows in the component
            ``species counts``      dict mapping organism -> row count
                                    within the component
            ``cross-species``       True when the component contains more
                                    than one organism
            ``file``                name of the file the component came from
        An empty DataFrame is returned when there is nothing to summarise.
        Rows are concatenated without re-indexing, so every row keeps index
        label 0.

    Side effects: prints progress/debug information for every component.
    """
    # Raw string, with the dot before "tsv" escaped so it only matches a
    # literal '.'.
    name_pattern = re.compile(r'/(db_binary_[.0-9]+\.tsv)')

    summary = pd.DataFrame()
    for cc_file in cc_files:
        tsv = pd.read_csv(cc_file, usecols=[1, 2, 3, 4], sep='\t')

        # get file name -- identical for every component of this file, so it
        # is resolved once per file.  Paths that do not follow the
        # db_binary_<number>.tsv naming fall back to the last path component
        # instead of crashing on m.group(1).
        m = name_pattern.search(cc_file)
        if m is not None:
            file_name = m.group(1)
        else:
            file_name = re.split(r'[/\\]', cc_file)[-1]

        components = dict(Counter(tsv['ConnectedComponents']))
        for c in components:
            print(c)
            nodes = tsv[tsv['ConnectedComponents'] == c]
            # Count organisms among this component's rows only; counting over
            # the whole table made every component report identical species
            # counts and cross-species flags.
            species_counts = dict(Counter(nodes['organism']))

            # Every value is wrapped in a list so the dict describes exactly
            # one DataFrame row.
            c_info = {
                'ConnectedComponent': [c],
                'Cutoff': [None],  # TODO: parse from file name.
                'nodes': [nodes.shape[0]],
                'species counts': [species_counts],
                'cross-species': [len(species_counts) > 1],
                'file': [file_name],
            }
            print(c_info)

            summary = pd.concat([summary, pd.DataFrame(c_info)], axis=0)
            print('summary shape: {}'.format(summary.shape))
            print(summary)
            print(summary.shape[0])
    return summary
In [ ]:
# Build the per-component summary table across all discovered TSV files.
connected_components = assess_sub_graphs(tsv_files)
In [ ]:
# Preview the first rows of the per-component summary.
connected_components.head()
In [ ]:
# Scatter plot of component size per component label.
# NOTE(review): the original call was left unfinished (`scatter(x=)`), which
# is a syntax error.  The x/y columns below are an assumption -- both exist in
# the table returned by assess_sub_graphs, but confirm they are the intended
# axes (and that 'ConnectedComponent' is numeric) before relying on the plot.
connected_components.plot.scatter(x='ConnectedComponent', y='nodes')
In [ ]:
# Summarise the single table held in `cc` and display the resulting dict.
assess_connected_components_tsv(cc)
In [ ]:
# Render the summary dict for `cc` as a DataFrame.
# NOTE(review): the dict mixes scalar values with a nested dict
# ('organism counts'); check that the resulting layout is what is wanted.
pd.DataFrame.from_dict(assess_connected_components_tsv(cc))
In [ ]:
def assess_component(cc):
    """Placeholder for metrics describing a single connected component.

    Planned outputs (not implemented yet): counts of each organism and,
    possibly, an entropy measure.  Currently the function does nothing and
    returns None.
    """
    return None
In [ ]: