In [1]:
from collections import defaultdict
from itertools import takewhile
from pprint import pprint
# Path of the text file opened by the loading cell; it is relative, so it is
# resolved against the kernel's current working directory.
DATA_PATH = '../data/chart_tabs.txt'
In [2]:
def is_tab(char):
    """Return True when ``char`` equals the tab character, otherwise False.

    This used to be the bound ``__eq__`` method of a tab string. Calling a
    bound ``__eq__`` directly returns the ``NotImplemented`` sentinel for
    argument types it does not handle, and that sentinel is truthy, so a
    predicate built that way reports "is a tab" for such inputs. Going
    through the ``==`` operator always yields a proper boolean here.
    """
    return char == '\t'
def tree():
    """Create an empty auto-vivifying tree.

    The result is a ``defaultdict`` whose default factory is this same
    function, so indexing a missing key creates another empty tree at any
    depth.
    """
    return defaultdict(tree)
def add(t, path):
    """Descend from ``t`` through each key in ``path``, one index per step.

    Nothing is returned. The call is only useful for its side effect when
    ``t`` creates missing keys on lookup (as the trees from ``tree()`` do);
    with a plain mapping a missing key raises ``KeyError``.
    """
    cursor = t
    for key in path:
        cursor = cursor[key]
def dicts(t):
    """Return a plain ``dict`` copy of the nested mapping ``t``.

    Every value is converted recursively, so the result contains only
    built-in dicts regardless of the mapping type that was passed in.
    """
    plain = {}
    for key in t:
        plain[key] = dicts(t[key])
    return plain
def build_tree(lines):
    """Build a nested tree from the lines of a tab-indented outline.

    The depth of a line is the number of tab characters it starts with; the
    entry name is the line with surrounding whitespace removed.

    Args:
        lines: iterable of outline lines (any iterable of strings).

    Returns:
        The auto-vivifying tree produced by ``tree()``, with one nested key
        per outline entry.
    """
    path = []
    ret = tree()
    for line in lines:
        # Strip both ends: trailing blanks in the data otherwise end up in
        # the key itself (e.g. 'Hydromedusae_Haliscera ').
        entry = line.strip()
        if not entry:
            # A blank line carries no node; without this guard it would
            # insert an empty-string key at the root and reset the path.
            continue
        # Count of leading tabs = nesting depth of this entry.
        indent = len(line) - len(line.lstrip('\t'))
        # Cut the running path back to this depth, then append the entry.
        path[indent:] = [entry]
        add(ret, path)
    return ret
def build_trace(lines):
    """Collect the ancestry of every leaf entry in a tab-indented outline.

    The depth of a line is the number of tab characters it starts with. An
    entry is recorded when its first character is unchanged by ``lower()``;
    in the outline shown in this notebook that distinguishes the lowercase
    leaf names from the capitalised group headings.

    Args:
        lines: iterable of outline lines (any iterable of strings).

    Returns:
        A list with one tuple per recorded entry, in input order. Each tuple
        runs from the entry itself up to the top-level ancestor:
        ``(entry, parent, ..., root)``.
    """
    path = []
    ret = []
    for line in lines:
        # Strip both ends: trailing blanks in the data otherwise end up in
        # the recorded names (e.g. 'Hydromedusae_Haliscera ').
        entry = line.strip()
        if not entry:
            # Blank lines used to raise IndexError on entry[0] below.
            continue
        # Count of leading tabs = nesting depth of this entry.
        indent = len(line) - len(line.lstrip('\t'))
        # Cut the running path back to this depth, then append the entry.
        path[indent:] = [entry]
        if entry[0].lower() == entry[0]:
            # Stored reversed so index 0 is always the entry itself.
            ret.append(tuple(path[::-1]))
    return ret
In [52]:
# Read the outline once, removing only the newline so that the leading tabs
# that encode nesting depth are preserved.
with open(DATA_PATH, 'r') as fh:
    lines = [raw.rstrip('\n') for raw in fh]

# Nested tree of all entries, and the entry-to-root paths of the leaves.
t = build_tree(lines)
trace = build_trace(lines)
In [4]:
# Number of paths that build_trace recorded.
len(trace)
Out[4]:
121
In [36]:
# Convert the tree to plain dicts first so pprint renders ordinary nested dicts.
pprint(dicts(t))
{'Omega': {'Artifact': {'artifacts': {}, 'artifacts_edge': {}},
'Plankton': {'Chaetognath': {'chaetognath_non_sagitta': {},
'chaetognath_other': {},
'chaetognath_sagitta': {}},
'Crustacean': {'Copepod': {'Copepod_Calnoid': {'Copepod_Calanoid_Large': {'copepod_calanoid_large': {},
'copepod_calanoid_large_side_antennatucked': {}},
'copepod_calanoid': {},
'copepod_calanoid_eggs': {},
'copepod_calanoid_eucalanus': {},
'copepod_calanoid_flatheads': {},
'copepod_calanoid_frillyAntennae': {},
'copepod_calanoid_octomoms': {},
'copepod_calanoid_small_longantennae': {},
'copepod_other': {}},
'Cyclopoid_Copepod': {'Cyclopoid_Copepod_Oithona': {'copepod_cyclopoid_oithona': {},
'copepod_cyclopoid_oithona_eggs': {}},
'copepod_cyclopoid_copilia': {}}},
'Shrimp-like': {'Decapod': {'decapods': {},
'shrimp_caridean': {},
'shrimp_sergestidae': {},
'shrimp_zoea': {}},
'Euphauslid': {'euphausiids': {},
'euphausiids_young': {}},
'shrimp-like_other': {}},
'amphipods': {},
'crustacean_other': {},
'stomatopod': {}},
'Detritus': {'detritus_blob': {},
'detritus_filamentous': {},
'detritus_other': {},
'fecal_pellet': {}},
'Diatom': {'diatom_chain_string': {},
'diatom_chain_tube': {}},
'Fish': {'fish_larvae_deep_body': {},
'fish_larvae_leptocephali': {},
'fish_larvae_medium_body': {},
'fish_larvae_myctophids': {},
'fish_larvae_thin_body': {},
'fish_larvae_very_thin_body': {}},
'Gastropod': {'Pteropod': {'pteropod_butterfly': {},
'pteropod_theco_dev_seq': {},
'pteropod_triangle': {}},
'heteropod': {}},
'Gelatinous_Zooplankton': {'Ctenophore': {'ctenophore_cestid': {},
'ctenophore_cydippid_no_tentacles': {},
'ctenophore_cydippid_tentacles': {},
'ctenophore_lobate': {}},
'Hydromedusae': {'Hydromedusae_Narcomedusae': {'Hydromedusae_Haliscera ': {'hydromedusae_haliscera': {},
'hydromedusae_haliscera_small_sideview': {}},
'Hydromedusae_Solmaris': {'hydromedusae_narco_young': {},
'hydromedusae_solmaris': {}},
'Other_Hydromedusae': {'Hydromedusae_Shape_A_Sideview': {'hydromedusae_shapeA': {},
'hydromedusae_shapeA_sideview_small': {},
'hydromedusae_sideview_big': {}},
'Hydromedusae_Type_D_Bell_And_Tentacles': {'hydromedusae_bell_and_tentacles': {},
'hydromedusae_typeD': {},
'hydromedusae_typeD_bell_and_tentacles': {}},
'hydromedusae_h15': {},
'hydromedusae_other': {},
'hydromedusae_partial_dark': {},
'hydromedusae_shapeB': {},
'hydromedusae_typeE': {},
'hydromedusae_typeF': {}},
'hydromedusae_aglaura': {},
'hydromedusae_liriope': {},
'hydromedusae_narco_dark': {},
'hydromedusae_narcomedusae': {},
'hydromedusae_solmundella': {}}},
'Pelagic_Tunicate': {'Appendicularian': {'appendicularian_fritillaridae': {},
'appendicularian_s_shape': {},
'appendicularian_slight_curve': {},
'appendicularian_straight': {}},
'Tunicate': {'Tunicate_Doliolid': {'tunicate_doliolid': {},
'tunicate_doliolid_nurse': {}},
'Tunicate_Salp': {'tunicate_salp': {},
'tunicate_salp_chains': {}},
'tunicate_partial': {}}},
'Siphonophore': {'Calycophoran_Siphonophore': {'Calycophoran_Siphonophore_Rocketship': {'siphonophore_calycophoran_rocketship_adult': {},
'siphonophore_calycophoran_rocketship_young': {}},
'Calycophoran_Siphonophore_Sphaeronectes': {'siphonophore_calycophoran_sphaeronectes': {},
'siphonophore_calycophoran_sphaeronectes_stem': {},
'siphonophore_calycophoran_sphaeronectes_young': {}},
'siphonophore_calycophoran_abylidae': {}},
'Siphonophore_Physonect': {'siphonophore_physonect': {},
'siphonophore_physonect_young': {}},
'siphonophore_other_parts': {},
'siphonophore_partial': {}},
'ephyra': {},
'jellies_tentacles': {}},
'Other_Invert_Larvae': {'Echinoderm': {'Echinoderm_Larva_Seastar': {'echinoderm_larva_seastar_bipinnaria': {},
'echinoderm_larva_seastar_brachiolaria': {}},
'echinoderm_larva_pluteus_brittlestar': {},
'echinoderm_larva_pluteus_early': {},
'echinoderm_larva_pluteus_typeC': {},
'echinoderm_larva_pluteus_urchin': {},
'echinoderm_seacucumber_auricularia_larva': {},
'echinopluteus': {}},
'invertebrate_larvae_other_A': {},
'invertebrate_larvae_other_B': {},
'tornaria_acorn_worm_larvae': {},
'trochophore_larvae': {}},
'Protist': {'Acantharia_Protist': {'acantharia_protist': {},
'acantharia_protist_big_center': {},
'acantharia_protist_halo': {}},
'Protist_Other': {'protist_dark_center': {},
'protist_fuzzy_olive': {},
'protist_other': {},
'protist_star': {}},
'Radiolarian_Colony': {'radiolarian_chain': {},
'radiolarian_colony': {}},
'protist_noctiluca': {}},
'Trichodesmium': {'trichodesmium_bowtie': {},
'trichodesmium_multiple': {},
'trichodesmium_puff': {},
'trichodesmium_tuft': {}},
'Unknown': {'unknown_blobs_and_smudges': {},
'unknown_sticks': {},
'unknown_unclassified': {}},
'chordate_type1': {},
'polychaete': {}}}}
In [14]:
# Pad every path to the length of the longest one by repeating its first
# element (the entry's own name) at the front, so that a given index refers
# to the same distance from the root in every path.
max_len = max(map(len, trace))
trace_ext = []
for path in trace:
    missing = max_len - len(path)
    if missing > 0:
        path = (path[0],) * missing + path
    trace_ext.append(path)
In [23]:
# Number of distinct names at each position of the padded paths
# (index 0 is the entry itself, the last index is the top-level ancestor).
for n in range(max_len):
    # A single pre-formatted argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print('%d %d' % (n, len({p[n] for p in trace_ext})))

# Map each entry name to the remainder of its padded path.
trace_d = {p[0]: list(p[1:]) for p in trace_ext}
0 121
1 117
2 101
3 72
4 48
5 15
6 2
7 1
In [38]:
from sklearn.preprocessing import LabelEncoder
# One encoder per path position, each fitted on the sorted distinct names
# found at that position across all padded paths.
depth_le = {
    n: LabelEncoder().fit(sorted({p[n] for p in trace_ext}))
    for n in range(max_len)
}
In [51]:
# Inspect the names the encoder for path position 5 was fitted on.
q = depth_le[5]
q.classes_
Out[51]:
array(['Chaetognath', 'Crustacean', 'Detritus', 'Diatom', 'Fish',
'Gastropod', 'Gelatinous_Zooplankton', 'Other_Invert_Larvae',
'Protist', 'Trichodesmium', 'Unknown', 'artifacts',
'artifacts_edge', 'chordate_type1', 'polychaete'],
dtype='|S22')
In [13]:
import taxonomy as tax
# Print the distinct names at each of the first 8 positions of the paths
# exposed by the imported taxonomy module.
# NOTE(review): 8 matches the path length seen earlier in this notebook;
# presumably tax.trace_ext holds the same padded paths - confirm.
for n in range(0, 8):
    qq = {p[n] for p in tax.trace_ext}
    # A single pre-formatted argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print('%d %d %s' % (n, len(qq), qq))
    # Blank separator line; print('') is used because a bare print() would
    # output "()" on Python 2.
    print('')
0 121 set(['echinoderm_larva_seastar_bipinnaria', 'unknown_sticks', 'tornaria_acorn_worm_larvae', 'echinoderm_seacucumber_auricularia_larva', 'ctenophore_lobate', 'pteropod_triangle', 'ctenophore_cestid', 'appendicularian_slight_curve', 'detritus_blob', 'chaetognath_non_sagitta', 'tunicate_doliolid', 'copepod_calanoid_octomoms', 'shrimp_caridean', 'hydromedusae_typeE', 'hydromedusae_typeD', 'hydromedusae_narco_young', 'siphonophore_calycophoran_rocketship_young', 'hydromedusae_haliscera_small_sideview', 'chaetognath_sagitta', 'hydromedusae_narco_dark', 'shrimp_zoea', 'ctenophore_cydippid_tentacles', 'hydromedusae_h15', 'acantharia_protist_halo', 'copepod_other', 'copepod_calanoid_large_side_antennatucked', 'copepod_calanoid_eggs', 'copepod_calanoid_eucalanus', 'copepod_calanoid_flatheads', 'chordate_type1', 'hydromedusae_solmundella', 'heteropod', 'hydromedusae_aglaura', 'radiolarian_colony', 'stomatopod', 'trichodesmium_multiple', 'copepod_cyclopoid_oithona', 'pteropod_butterfly', 'fish_larvae_leptocephali', 'tunicate_salp', 'protist_other', 'detritus_other', 'echinoderm_larva_pluteus_urchin', 'radiolarian_chain', 'unknown_blobs_and_smudges', 'crustacean_other', 'invertebrate_larvae_other_B', 'tunicate_salp_chains', 'fecal_pellet', 'siphonophore_other_parts', 'siphonophore_calycophoran_sphaeronectes_stem', 'trochophore_larvae', 'acantharia_protist', 'hydromedusae_shapeB', 'hydromedusae_shapeA', 'fish_larvae_thin_body', 'fish_larvae_very_thin_body', 'protist_dark_center', 'hydromedusae_other', 'copepod_calanoid_large', 'fish_larvae_myctophids', 'amphipods', 'siphonophore_calycophoran_sphaeronectes_young', 'siphonophore_calycophoran_sphaeronectes', 'hydromedusae_partial_dark', 'trichodesmium_tuft', 'pteropod_theco_dev_seq', 'hydromedusae_sideview_big', 'appendicularian_fritillaridae', 'hydromedusae_haliscera', 'appendicularian_straight', 'hydromedusae_narcomedusae', 'shrimp-like_other', 'siphonophore_partial', 'hydromedusae_solmaris', 'ephyra', 'artifacts', 
'trichodesmium_puff', 'protist_star', 'echinoderm_larva_seastar_brachiolaria', 'trichodesmium_bowtie', 'hydromedusae_typeF', 'echinoderm_larva_pluteus_early', 'copepod_calanoid_small_longantennae', 'unknown_unclassified', 'artifacts_edge', 'detritus_filamentous', 'siphonophore_physonect_young', 'copepod_calanoid', 'hydromedusae_shapeA_sideview_small', 'siphonophore_calycophoran_abylidae', 'fish_larvae_deep_body', 'echinopluteus', 'hydromedusae_liriope', 'siphonophore_calycophoran_rocketship_adult', 'siphonophore_physonect', 'appendicularian_s_shape', 'echinoderm_larva_pluteus_typeC', 'copepod_calanoid_frillyAntennae', 'invertebrate_larvae_other_A', 'hydromedusae_typeD_bell_and_tentacles', 'jellies_tentacles', 'decapods', 'protist_fuzzy_olive', 'copepod_cyclopoid_oithona_eggs', 'shrimp_sergestidae', 'hydromedusae_bell_and_tentacles', 'fish_larvae_medium_body', 'chaetognath_other', 'euphausiids_young', 'ctenophore_cydippid_no_tentacles', 'diatom_chain_string', 'copepod_cyclopoid_copilia', 'diatom_chain_tube', 'euphausiids', 'tunicate_partial', 'acantharia_protist_big_center', 'echinoderm_larva_pluteus_brittlestar', 'tunicate_doliolid_nurse', 'polychaete', 'protist_noctiluca'])
1 117 set(['echinoderm_larva_seastar_bipinnaria', 'unknown_sticks', 'tornaria_acorn_worm_larvae', 'echinoderm_seacucumber_auricularia_larva', 'shrimp_zoea', 'pteropod_triangle', 'ctenophore_cestid', 'appendicularian_slight_curve', 'detritus_blob', 'chaetognath_non_sagitta', 'tunicate_doliolid', 'copepod_calanoid_octomoms', 'hydromedusae_typeE', 'hydromedusae_narco_young', 'siphonophore_calycophoran_rocketship_young', 'hydromedusae_haliscera_small_sideview', 'chaetognath_sagitta', 'hydromedusae_narco_dark', 'ctenophore_lobate', 'ctenophore_cydippid_tentacles', 'hydromedusae_h15', 'acantharia_protist_halo', 'copepod_other', 'copepod_calanoid_large_side_antennatucked', 'copepod_calanoid_eggs', 'copepod_calanoid_eucalanus', 'copepod_calanoid_flatheads', 'chordate_type1', 'hydromedusae_solmundella', 'heteropod', 'hydromedusae_aglaura', 'radiolarian_colony', 'stomatopod', 'trichodesmium_multiple', 'copepod_cyclopoid_oithona', 'pteropod_butterfly', 'fish_larvae_leptocephali', 'tunicate_salp', 'protist_other', 'detritus_other', 'echinoderm_larva_pluteus_urchin', 'radiolarian_chain', 'unknown_blobs_and_smudges', 'crustacean_other', 'invertebrate_larvae_other_B', 'tunicate_salp_chains', 'fecal_pellet', 'siphonophore_other_parts', 'siphonophore_calycophoran_sphaeronectes_stem', 'trochophore_larvae', 'acantharia_protist', 'hydromedusae_shapeB', 'fish_larvae_thin_body', 'fish_larvae_very_thin_body', 'protist_dark_center', 'hydromedusae_other', 'copepod_calanoid_large', 'fish_larvae_myctophids', 'amphipods', 'siphonophore_calycophoran_sphaeronectes_young', 'siphonophore_calycophoran_sphaeronectes', 'hydromedusae_partial_dark', 'trichodesmium_tuft', 'pteropod_theco_dev_seq', 'appendicularian_fritillaridae', 'hydromedusae_haliscera', 'appendicularian_straight', 'hydromedusae_narcomedusae', 'shrimp-like_other', 'siphonophore_partial', 'hydromedusae_solmaris', 'ephyra', 'artifacts', 'trichodesmium_puff', 'protist_star', 'echinoderm_larva_seastar_brachiolaria', 
'trichodesmium_bowtie', 'hydromedusae_typeF', 'echinoderm_larva_pluteus_early', 'copepod_calanoid_small_longantennae', 'unknown_unclassified', 'Hydromedusae_Shape_A_Sideview', 'artifacts_edge', 'detritus_filamentous', 'siphonophore_physonect_young', 'copepod_calanoid', 'shrimp_caridean', 'siphonophore_calycophoran_abylidae', 'fish_larvae_deep_body', 'echinopluteus', 'hydromedusae_liriope', 'siphonophore_calycophoran_rocketship_adult', 'siphonophore_physonect', 'appendicularian_s_shape', 'Hydromedusae_Type_D_Bell_And_Tentacles', 'echinoderm_larva_pluteus_typeC', 'copepod_calanoid_frillyAntennae', 'invertebrate_larvae_other_A', 'jellies_tentacles', 'decapods', 'protist_fuzzy_olive', 'copepod_cyclopoid_oithona_eggs', 'shrimp_sergestidae', 'fish_larvae_medium_body', 'chaetognath_other', 'euphausiids_young', 'ctenophore_cydippid_no_tentacles', 'diatom_chain_string', 'copepod_cyclopoid_copilia', 'diatom_chain_tube', 'euphausiids', 'tunicate_partial', 'acantharia_protist_big_center', 'echinoderm_larva_pluteus_brittlestar', 'tunicate_doliolid_nurse', 'polychaete', 'protist_noctiluca'])
2 101 set(['echinoderm_larva_seastar_bipinnaria', 'unknown_sticks', 'tornaria_acorn_worm_larvae', 'echinoderm_seacucumber_auricularia_larva', 'ctenophore_lobate', 'pteropod_triangle', 'ctenophore_cestid', 'appendicularian_slight_curve', 'invertebrate_larvae_other_A', 'chaetognath_non_sagitta', 'copepod_calanoid_octomoms', 'chaetognath_sagitta', 'hydromedusae_narco_dark', 'shrimp_zoea', 'Cyclopoid_Copepod_Oithona', 'ctenophore_cydippid_tentacles', 'acantharia_protist_halo', 'copepod_other', 'siphonophore_physonect_young', 'copepod_calanoid_eggs', 'copepod_calanoid_eucalanus', 'radiolarian_colony', 'chordate_type1', 'hydromedusae_solmundella', 'heteropod', 'hydromedusae_aglaura', 'copepod_calanoid_flatheads', 'stomatopod', 'trichodesmium_multiple', 'pteropod_butterfly', 'Copepod_Calanoid_Large', 'fish_larvae_leptocephali', 'protist_other', 'detritus_other', 'echinoderm_larva_pluteus_urchin', 'radiolarian_chain', 'unknown_blobs_and_smudges', 'fish_larvae_medium_body', 'invertebrate_larvae_other_B', 'fecal_pellet', 'siphonophore_other_parts', 'echinoderm_larva_pluteus_brittlestar', 'trochophore_larvae', 'acantharia_protist', 'trichodesmium_puff', 'fish_larvae_thin_body', 'fish_larvae_very_thin_body', 'protist_dark_center', 'fish_larvae_myctophids', 'amphipods', 'Calycophoran_Siphonophore_Rocketship', 'trichodesmium_tuft', 'pteropod_theco_dev_seq', 'appendicularian_fritillaridae', 'appendicularian_straight', 'hydromedusae_narcomedusae', 'shrimp-like_other', 'siphonophore_partial', 'Hydromedusae_Solmaris', 'ephyra', 'artifacts', 'echinopluteus', 'protist_star', 'echinoderm_larva_seastar_brachiolaria', 'trichodesmium_bowtie', 'echinoderm_larva_pluteus_early', 'copepod_calanoid_small_longantennae', 'unknown_unclassified', 'artifacts_edge', 'detritus_filamentous', 'copepod_calanoid', 'shrimp_caridean', 'siphonophore_calycophoran_abylidae', 'fish_larvae_deep_body', 'hydromedusae_liriope', 'siphonophore_physonect', 'Tunicate_Salp', 'appendicularian_s_shape', 
'echinoderm_larva_pluteus_typeC', 'copepod_calanoid_frillyAntennae', 'Other_Hydromedusae', 'Hydromedusae_Haliscera ', 'jellies_tentacles', 'decapods', 'protist_fuzzy_olive', 'shrimp_sergestidae', 'crustacean_other', 'chaetognath_other', 'euphausiids_young', 'ctenophore_cydippid_no_tentacles', 'diatom_chain_string', 'copepod_cyclopoid_copilia', 'diatom_chain_tube', 'Calycophoran_Siphonophore_Sphaeronectes', 'euphausiids', 'tunicate_partial', 'Tunicate_Doliolid', 'acantharia_protist_big_center', 'detritus_blob', 'polychaete', 'protist_noctiluca'])
3 72 set(['trichodesmium_multiple', 'Calycophoran_Siphonophore', 'unknown_sticks', 'echinoderm_seacucumber_auricularia_larva', 'polychaete', 'trichodesmium_tuft', 'ctenophore_lobate', 'pteropod_triangle', 'ctenophore_cestid', 'fish_larvae_leptocephali', 'pteropod_theco_dev_seq', 'echinoderm_larva_pluteus_typeC', 'protist_other', 'invertebrate_larvae_other_A', 'Tunicate', 'detritus_other', 'Hydromedusae_Narcomedusae', 'shrimp-like_other', 'siphonophore_partial', 'jellies_tentacles', 'fish_larvae_very_thin_body', 'ephyra', 'artifacts', 'acantharia_protist', 'tornaria_acorn_worm_larvae', 'radiolarian_chain', 'echinopluteus', 'protist_star', 'crustacean_other', 'invertebrate_larvae_other_B', 'protist_fuzzy_olive', 'trichodesmium_bowtie', 'chaetognath_non_sagitta', 'Euphauslid', 'Appendicularian', 'echinoderm_larva_pluteus_early', 'chaetognath_sagitta', 'Siphonophore_Physonect', 'pteropod_butterfly', 'unknown_unclassified', 'echinoderm_larva_pluteus_urchin', 'fish_larvae_medium_body', 'Copepod_Calnoid', 'chaetognath_other', 'artifacts_edge', 'siphonophore_other_parts', 'ctenophore_cydippid_tentacles', 'ctenophore_cydippid_no_tentacles', 'echinoderm_larva_pluteus_brittlestar', 'diatom_chain_string', 'unknown_blobs_and_smudges', 'trochophore_larvae', 'acantharia_protist_halo', 'diatom_chain_tube', 'trichodesmium_puff', 'fish_larvae_deep_body', 'detritus_filamentous', 'fish_larvae_thin_body', 'detritus_blob', 'Decapod', 'acantharia_protist_big_center', 'protist_dark_center', 'Cyclopoid_Copepod', 'radiolarian_colony', 'chordate_type1', 'heteropod', 'Echinoderm_Larva_Seastar', 'fecal_pellet', 'fish_larvae_myctophids', 'stomatopod', 'amphipods', 'protist_noctiluca'])
4 48 set(['trichodesmium_multiple', 'polychaete', 'unknown_sticks', 'tornaria_acorn_worm_larvae', 'Shrimp-like', 'trichodesmium_tuft', 'Pelagic_Tunicate', 'Copepod', 'fish_larvae_leptocephali', 'Protist_Other', 'invertebrate_larvae_other_A', 'Acantharia_Protist', 'detritus_other', 'chaetognath_non_sagitta', 'jellies_tentacles', 'fish_larvae_very_thin_body', 'ephyra', 'artifacts', 'Radiolarian_Colony', 'Pteropod', 'unknown_blobs_and_smudges', 'crustacean_other', 'invertebrate_larvae_other_B', 'trichodesmium_bowtie', 'chaetognath_sagitta', 'unknown_unclassified', 'Hydromedusae', 'fish_larvae_medium_body', 'trochophore_larvae', 'chaetognath_other', 'artifacts_edge', 'detritus_filamentous', 'Echinoderm', 'diatom_chain_string', 'detritus_blob', 'Ctenophore', 'diatom_chain_tube', 'trichodesmium_puff', 'fish_larvae_deep_body', 'fish_larvae_thin_body', 'Siphonophore', 'chordate_type1', 'heteropod', 'fecal_pellet', 'fish_larvae_myctophids', 'stomatopod', 'amphipods', 'protist_noctiluca'])
5 15 set(['Gelatinous_Zooplankton', 'Chaetognath', 'Other_Invert_Larvae', 'Detritus', 'Fish', 'chordate_type1', 'artifacts_edge', 'Protist', 'artifacts', 'Crustacean', 'Trichodesmium', 'Diatom', 'Unknown', 'polychaete', 'Gastropod'])
6 2 set(['Artifact', 'Plankton'])
7 1 set(['Omega'])