In [1]:
# Standard-library and third-party imports used throughout the notebook.
import itertools
import os
import numpy as np
import pandas as pd
# Get the root_path for this jupyter notebook repo.
# (Assumes the notebook runs inside a sub-directory of the repo, so the
# parent of the current working directory is the repo root — TODO confirm.)
repo_path = os.path.dirname(os.path.abspath(os.getcwd()))
# Path to the file - locus index CSV supplied by the Tell Dor team.
# NOTE: this same path is later used as an OUTPUT (the index is written
# back with an added 'Locus ID' column).
path_files_locus_index = os.path.join(
repo_path, 'files', 'tell-dor', 'tell-dor-area-g-locus-image-index.csv'
)
# Path to the Tell Dor file metadata CSV
path_files = os.path.join(
repo_path, 'files', 'tell-dor', 'tell-dor-files.csv'
)
# Path to the Tell Dor locus metadata CSV
path_loci = os.path.join(
repo_path, 'files', 'tell-dor', 'tell-dor-loci.csv'
)
# Output path for associations between the files and the loci.
path_files_contexts = os.path.join(
repo_path, 'files', 'tell-dor', 'tell-dor-files-contexts.csv'
)
# Read the file - locus index supplied by the Tell Dor team.
fl_df = pd.read_csv(path_files_locus_index)
# Read the file metadata CSV into dataframe f_df.
f_df = pd.read_csv(path_files)
# Read the locus (and wall) CSV into dataframe l_df.
l_df = pd.read_csv(path_loci)
# Normalize the raw Locus_Wall identifiers to strings so they can be
# concatenated into 'Wall <id>' / 'Locus <id>' lookup keys.
fl_df['Locus_Wall'] = fl_df['Locus_Wall'].astype(str)
fl_df['Locus ID'] = np.nan
# Iterate over the UNIQUE Locus_Wall values. The previous row-wise
# iterrows() loop repeated the identical l_df lookup and the identical
# bulk .loc update once per duplicate row; looping over unique values
# does each lookup/update exactly once and yields the same final state.
for locus_wall in fl_df['Locus_Wall'].unique():
    wall_id = 'Wall ' + locus_wall
    locus_id = 'Locus ' + locus_wall
    print('Look for {} or {}'.format(wall_id, locus_id))
    # A raw value may refer to either a wall or a locus record in l_df.
    id_indx = ((l_df['Locus ID'] == wall_id) | (l_df['Locus ID'] == locus_id))
    if l_df[id_indx].empty:
        # No corresponding locus/wall record; leave 'Locus ID' as NaN.
        continue
    matched_id = l_df[id_indx]['Locus ID'].iloc[0]
    # Update every index row sharing this Locus_Wall value at once.
    fl_df.loc[fl_df['Locus_Wall'] == locus_wall, 'Locus ID'] = matched_id
    print('Update {} with {}'.format(locus_wall, matched_id))
# NOTE: this overwrites the INPUT index CSV, adding the resolved
# 'Locus ID' column to it.
fl_df.to_csv(path_files_locus_index, index=False)
# Columnar accumulator for File <-> Locus (and Wall) association records.
file_locus_data = {key: [] for key in ('File ID', 'Locus ID')}

# Columnar accumulator for File <-> Area association records.
# NOTE: An "Area" aggregates multiple squares from the locus/wall data
# file. Eric grouped these purely to make search / browsing easier; they
# carry no interpretive purpose or value.
file_square_data = {key: [] for key in ('File ID', 'Area')}
def add_to_file_context_data(
    file_ids,
    context_ids,
    data,
    context_id_col='Locus ID'
):
    """Append file-to-context association records to a columnar dict.

    :param file_ids: list of file identifiers.
    :param context_ids: a single context identifier or a list of them.
    :param data: dict with a 'File ID' list and a context_id_col list;
        mutated in place.
    :param context_id_col: name of the context column in data.
    :return: the same data dict, with one record appended per
        (file_id, context_id) pairing.
    """
    # Accept a single (non-list) context identifier as a convenience.
    if not isinstance(context_ids, list):
        context_ids = [context_ids]
    # One record for every pairing of a file with a context.
    for file_id, context_id in itertools.product(file_ids, context_ids):
        data['File ID'].append(file_id)
        data[context_id_col].append(context_id)
    return data
In [ ]:
# Preview the first few rows of the file metadata.
f_df.head(3)
In [ ]:
# Preview the first few rows of the locus (and wall) metadata.
l_df.head(3)
In [ ]:
# Find matching Loci (including Wall Loci) by matching their IDs
# with text in the file metadata 'Caption' column.
for locus_wall_id in l_df['Locus ID'].unique().tolist():
    # 'Locus 18347' -> 'L18347', 'Wall 18347' -> 'W18347', matching the
    # abbreviated forms used in image captions.
    l_w_id = locus_wall_id.replace('Locus ', 'L').replace('Wall ', 'W')
    # l_w_num_id is for locus or wall IDs that are long enough to be
    # unlikely false positives, even when the caption lacks an "L" or
    # "W" prefix. (The substitution leaves a leading space, which keeps
    # the number from matching inside a longer number.)
    l_w_num_id = l_w_id.replace('L', ' ').replace('W', ' ')
    if len(l_w_num_id) >= 6:
        # Catch cases where the Locus / Wall ID is long like '18347'.
        # regex=False: IDs are literal strings, not regex patterns.
        # na=False: a missing Caption counts as "no match" instead of
        # putting NaN into the boolean mask (which breaks indexing).
        l_w_indx = (
            f_df['Caption'].str.contains(l_w_id, regex=False, na=False)
            | f_df['Caption'].str.contains(l_w_num_id, regex=False, na=False)
        )
    else:
        # The locus / wall id is too short to trust without a
        # "L" or "W" prefix.
        l_w_indx = f_df['Caption'].str.contains(l_w_id, regex=False, na=False)
    if f_df[l_w_indx].empty:
        # We didn't find a match, so continue.
        continue
    print('Found: {} for {} as {}'.format(
        len(f_df[l_w_indx]),
        locus_wall_id,
        l_w_id,
        )
    )
    # Record every matching file against this locus / wall.
    file_ids = f_df[l_w_indx]['File ID'].unique().tolist()
    file_locus_data = add_to_file_context_data(
        file_ids,
        locus_wall_id,
        file_locus_data
    )
# Now make a dataframe of the file - locus associations
file_locus_df = pd.DataFrame(data=file_locus_data)
print('File and Locus Associations (Found: {})'.format(
    len(file_locus_df.index))
)
In [ ]:
# Find matching Loci (including Wall Loci) by matching their Squares
# with text in the file metadata 'Caption' column.
l_df_sq = l_df[~l_df['Square'].isnull()]
for square in l_df_sq['Square'].astype(str).unique().tolist():
    # regex=False: square IDs are literal text, not regex patterns.
    # na=False: treat a missing Caption as a non-match instead of
    # letting NaN into the boolean mask (which breaks indexing).
    sq_indx = f_df['Caption'].str.contains(square, regex=False, na=False)
    if len(square) < 3 or f_df[sq_indx].empty:
        # Not enough characters for a secure match (or no match found).
        continue
    # Get all file_ids that have this square in their captions
    file_ids = f_df[sq_indx]['File ID'].unique().tolist()
    # Get all the area ids that are associated with this square
    area_ids = l_df[
        l_df['Square'] == square
    ]['Area'].unique().tolist()
    print('Found: {} files with square {} and {} areas'.format(
        len(f_df[sq_indx]),
        square,
        len(area_ids)
        )
    )
    # Now add to the file_square_data.
    file_square_data = add_to_file_context_data(
        file_ids,
        area_ids,
        file_square_data,
        context_id_col='Area'
    )
# Now make a dataframe of the file - area associations
file_area_df = pd.DataFrame(data=file_square_data)
print('File and Area Associations (Found: {})'.format(
    len(file_area_df.index))
)
In [ ]:
# Combine the locus and area associations into one table keyed on
# 'File ID'; an outer merge keeps files that matched only one of the two.
context_df = pd.merge(file_locus_df, file_area_df, on='File ID', how='outer')
context_linked_files = context_df['File ID'].unique().tolist()
print('Found File and Context Associations for {} unique files (total rows: {})'.format(
    len(context_linked_files),
    len(context_df.index))
)
# Get a list of files that do NOT have context associations
no_context_files = f_df[
    ~f_df['File ID'].isin(context_linked_files)
]['File ID'].unique().tolist()
# Fall back to a site-wide 'Area G' association for files that matched
# neither a locus nor an area.
file_site_data = {
    'File ID': [],
    'Site Area': [],
}
file_site_data = add_to_file_context_data(
    no_context_files,
    'Area G',
    file_site_data,
    context_id_col='Site Area'
)
site_df = pd.DataFrame(data=file_site_data)
context_df = pd.concat([context_df, site_df], sort=False)
# Set the column order for nice aesthetics
context_df = context_df[['File ID', 'Site Area', 'Area', 'Locus ID']]
# Reassign rather than sort_values(..., inplace=True): inplace has no
# performance benefit and hides the mutation; output ordering is the same.
context_df = context_df.sort_values(by=['File ID', 'Locus ID', 'Area'])
context_df.to_csv(path_files_contexts, index=False)
context_df.head(3)