In [1]:
# Standard-library and third-party imports used throughout the notebook.
import itertools
import os
import numpy as np
import pandas as pd
# Get the root_path for this jupyter notebook repo.
# (Assumes the notebook runs inside a sub-directory of the repo, so the
# parent of the current working directory is the repo root — TODO confirm.)
repo_path = os.path.dirname(os.path.abspath(os.getcwd()))
# Path to the file - locus index CSV supplied by the Tell Dor team.
# NOTE: this same path is later used as an OUTPUT (the index is written
# back with an added 'Locus ID' column).
path_files_locus_index = os.path.join(
repo_path, 'files', 'tell-dor', 'tell-dor-area-g-locus-image-index.csv'
)
# Path to the Tell Dor file metadata CSV
path_files = os.path.join(
repo_path, 'files', 'tell-dor', 'tell-dor-files.csv'
)
# Path to the Tell Dor locus metadata CSV
path_loci = os.path.join(
repo_path, 'files', 'tell-dor', 'tell-dor-loci.csv'
)
# Output path for associations between the files and the loci.
path_files_contexts = os.path.join(
repo_path, 'files', 'tell-dor', 'tell-dor-files-contexts.csv'
)
# Read the file - locus index supplied by the Tell Dor team.
fl_df = pd.read_csv(path_files_locus_index)
# Read the file metadata CSV into dataframe f_df.
f_df = pd.read_csv(path_files)
# Read the locus (and wall) CSV into dataframe l_df.
l_df = pd.read_csv(path_loci)
# Normalize the raw Locus_Wall identifiers to strings so they can be
# concatenated into 'Wall <id>' / 'Locus <id>' lookup keys.
fl_df['Locus_Wall'] = fl_df['Locus_Wall'].astype(str)
fl_df['Locus ID'] = np.nan
# Iterate over the UNIQUE Locus_Wall values. The previous row-wise
# iterrows() loop repeated the identical l_df lookup and the identical
# bulk .loc update once per duplicate row; looping over unique values
# does each lookup/update exactly once and yields the same final state.
for locus_wall in fl_df['Locus_Wall'].unique():
    wall_id = 'Wall ' + locus_wall
    locus_id = 'Locus ' + locus_wall
    print('Look for {} or {}'.format(wall_id, locus_id))
    # A raw value may refer to either a wall or a locus record in l_df.
    id_indx = ((l_df['Locus ID'] == wall_id) | (l_df['Locus ID'] == locus_id))
    if l_df[id_indx].empty:
        # No corresponding locus/wall record; leave 'Locus ID' as NaN.
        continue
    matched_id = l_df[id_indx]['Locus ID'].iloc[0]
    # Update every index row sharing this Locus_Wall value at once.
    fl_df.loc[fl_df['Locus_Wall'] == locus_wall, 'Locus ID'] = matched_id
    print('Update {} with {}'.format(locus_wall, matched_id))
# NOTE: this overwrites the INPUT index CSV, adding the resolved
# 'Locus ID' column to it.
fl_df.to_csv(path_files_locus_index, index=False)
# Columnar accumulator for File <-> Locus (and Wall) association records.
file_locus_data = {key: [] for key in ('File ID', 'Locus ID')}

# Columnar accumulator for File <-> Area association records.
# NOTE: An "Area" aggregates multiple squares from the locus/wall data
# file. Eric grouped these purely to make search / browsing easier; they
# carry no interpretive purpose or value.
file_square_data = {key: [] for key in ('File ID', 'Area')}
def add_to_file_context_data(
    file_ids,
    context_ids,
    data,
    context_id_col='Locus ID'
):
    """Append file-to-context association records to a columnar dict.

    :param file_ids: list of file identifiers.
    :param context_ids: a single context identifier or a list of them.
    :param data: dict with a 'File ID' list and a context_id_col list;
        mutated in place.
    :param context_id_col: name of the context column in data.
    :return: the same data dict, with one record appended per
        (file_id, context_id) pairing.
    """
    # Accept a single (non-list) context identifier as a convenience.
    if not isinstance(context_ids, list):
        context_ids = [context_ids]
    # One record for every pairing of a file with a context.
    for file_id, context_id in itertools.product(file_ids, context_ids):
        data['File ID'].append(file_id)
        data[context_id_col].append(context_id)
    return data
In [ ]:
# Preview the first few rows of the file metadata.
f_df.head(3)
In [ ]:
# Preview the first few rows of the locus (and wall) metadata.
l_df.head(3)
In [ ]:
# Find matching Loci (including Wall Loci) by matching their IDs
# with text in the file metadata 'Caption' column.
for locus_wall_id in l_df['Locus ID'].unique().tolist():
    # 'Locus 18347' -> 'L18347', 'Wall 18347' -> 'W18347', matching the
    # abbreviated forms used in image captions.
    l_w_id = locus_wall_id.replace('Locus ', 'L').replace('Wall ', 'W')
    # l_w_num_id is for locus or wall IDs that are long enough to be
    # unlikely false positives, even when the caption lacks an "L" or
    # "W" prefix. (The substitution leaves a leading space, which keeps
    # the number from matching inside a longer number.)
    l_w_num_id = l_w_id.replace('L', ' ').replace('W', ' ')
    if len(l_w_num_id) >= 6:
        # Catch cases where the Locus / Wall ID is long like '18347'.
        # regex=False: IDs are literal strings, not regex patterns.
        # na=False: a missing Caption counts as "no match" instead of
        # putting NaN into the boolean mask (which breaks indexing).
        l_w_indx = (
            f_df['Caption'].str.contains(l_w_id, regex=False, na=False)
            | f_df['Caption'].str.contains(l_w_num_id, regex=False, na=False)
        )
    else:
        # The locus / wall id is too short to trust without a
        # "L" or "W" prefix.
        l_w_indx = f_df['Caption'].str.contains(l_w_id, regex=False, na=False)
    if f_df[l_w_indx].empty:
        # We didn't find a match, so continue.
        continue
    print('Found: {} for {} as {}'.format(
        len(f_df[l_w_indx]),
        locus_wall_id,
        l_w_id,
        )
    )
    # Record every matching file against this locus / wall.
    file_ids = f_df[l_w_indx]['File ID'].unique().tolist()
    file_locus_data = add_to_file_context_data(
        file_ids,
        locus_wall_id,
        file_locus_data
    )
# Now make a dataframe of the file - locus associations
file_locus_df = pd.DataFrame(data=file_locus_data)
print('File and Locus Associations (Found: {})'.format(
    len(file_locus_df.index))
)
In [ ]:
# Find matching Loci (including Wall Loci) by matching their Squares
# with text in the file metadata 'Caption' column.
l_df_sq = l_df[~l_df['Square'].isnull()]
for square in l_df_sq['Square'].astype(str).unique().tolist():
    # regex=False: square IDs are literal text, not regex patterns.
    # na=False: treat a missing Caption as a non-match instead of
    # letting NaN into the boolean mask (which breaks indexing).
    sq_indx = f_df['Caption'].str.contains(square, regex=False, na=False)
    if len(square) < 3 or f_df[sq_indx].empty:
        # Not enough characters for a secure match (or no match found).
        continue
    # Get all file_ids that have this square in their captions
    file_ids = f_df[sq_indx]['File ID'].unique().tolist()
    # Get all the area ids that are associated with this square
    area_ids = l_df[
        l_df['Square'] == square
    ]['Area'].unique().tolist()
    print('Found: {} files with square {} and {} areas'.format(
        len(f_df[sq_indx]),
        square,
        len(area_ids)
        )
    )
    # Now add to the file_square_data.
    file_square_data = add_to_file_context_data(
        file_ids,
        area_ids,
        file_square_data,
        context_id_col='Area'
    )
# Now make a dataframe of the file - area associations
file_area_df = pd.DataFrame(data=file_square_data)
print('File and Area Associations (Found: {})'.format(
    len(file_area_df.index))
)
In [ ]:
# Combine the locus and area associations into one table keyed on
# 'File ID'; an outer merge keeps files that matched only one of the two.
context_df = pd.merge(file_locus_df, file_area_df, on='File ID', how='outer')
context_linked_files = context_df['File ID'].unique().tolist()
print('Found File and Context Associations for {} unique files (total rows: {})'.format(
    len(context_linked_files),
    len(context_df.index))
)
# Get a list of files that do NOT have context associations
no_context_files = f_df[
    ~f_df['File ID'].isin(context_linked_files)
]['File ID'].unique().tolist()
# Fall back to a site-wide 'Area G' association for files that matched
# neither a locus nor an area.
file_site_data = {
    'File ID': [],
    'Site Area': [],
}
file_site_data = add_to_file_context_data(
    no_context_files,
    'Area G',
    file_site_data,
    context_id_col='Site Area'
)
site_df = pd.DataFrame(data=file_site_data)
context_df = pd.concat([context_df, site_df], sort=False)
# Set the column order for nice aesthetics
context_df = context_df[['File ID', 'Site Area', 'Area', 'Locus ID']]
# Reassign rather than sort_values(..., inplace=True): inplace has no
# performance benefit and hides the mutation; output ordering is the same.
context_df = context_df.sort_values(by=['File ID', 'Locus ID', 'Area'])
context_df.to_csv(path_files_contexts, index=False)
context_df.head(3)