In [ ]:
import pandas as pd
import bisect
import numpy as np
import pickle
import csv

Version submitted to ChaLearn Qualitative Stage

Helper functions


In [ ]:
def get_percentile(current_observation, distribution_df):
    """Return the empirical percentile of an observation within a reference distribution.

    Parameters
    ----------
    current_observation : number
        The value to rank.
    distribution_df : array-like
        Reference distribution (e.g. a training-data feature column).

    Returns
    -------
    float in [0, 1]
        Fraction of reference values that are <= the observation.
    """
    # BUG FIX: bisect requires a sorted sequence. The original code bisected the
    # raw (potentially unsorted) training column, producing meaningless ranks.
    data_distribution = np.sort(np.asarray(distribution_df, dtype=float))
    rank = bisect.bisect(data_distribution, float(current_observation))
    return float(rank) / float(len(data_distribution))

In [ ]:
def get_strongest_factors(feature_influence_scores, feature_category, variable='interview'):
    """Return {feature_name: signed contribution} for one category/variable.

    Each entry in the influence table is indexed as (weight, value, name, ...);
    the contribution is weight * value.
    """
    relevant_scores = feature_influence_scores[feature_category][variable]
    return {entry[2]: entry[0] * entry[1] for entry in relevant_scores}

In [ ]:
def mention_if_strong_factor(feature_key, strongest_factors):
    """Return a newline-terminated sentence describing the sign of a feature's
    model contribution, or '' when the feature is absent or its contribution
    is exactly zero."""
    contribution = strongest_factors.get(feature_key, 0)
    if contribution > 0:
        return 'In our model, a higher score on this feature typically leads to a higher overall assessment score.\n'
    if contribution < 0:
        return 'In our model, a higher score on this feature typically leads to a lower overall assessment score.\n'
    return ''

In [ ]:
def describe_individual_feature(scores, training_data, strongest_factors, feature_key, description_function, feature_human_explanations):
    """Build the report block for one feature: its human-readable explanation,
    typical range, this video's score and percentile, an optional note on its
    model influence, and a feature-specific interpretation line."""
    feature_score = scores[feature_key]
    feature_training_data = training_data[feature_key]
    feature_percentile = get_percentile(feature_score, feature_training_data)

    parts = [
        '*** %s ***\n' % feature_human_explanations[feature_key],
        'This feature typically ranges between %f and %f. ' % (np.min(feature_training_data), np.max(feature_training_data)),
        'The score for this video is %f (percentile: %d).\n' % (feature_score, 100 * feature_percentile),
        mention_if_strong_factor(feature_key, strongest_factors),
        description_function(feature_score, feature_percentile),
        '\n',
    ]
    return ''.join(parts)

In [ ]:
def describe_entropy(score, percentile):
    """Interpret the motion-energy entropy score: a low value suggests face
    detection problems, a high value suggests the person consistently faced
    the camera. Mid-range scores yield no comment."""
    if score < 1.0:
        return "I have some problems detecting the person's face. Probably the person is not always looking into the camera, or the video is of low quality.\n"
    if score > 2.0:
        return "It looks like the person is consistently facing the camera.\n"
    return ''

In [ ]:
def describe_mei_median(score, percentile):
    """Comment on movement when the median motion-energy percentile is extreme.

    NOTE(review): both branches report a high amount of movement ('a lot' vs
    'quite a lot') for opposite percentile extremes — possibly a copy-paste
    slip; confirm intended wording against the MEI feature semantics.
    """
    if percentile < 0.25:
        return 'When taking the median of the degree of movement, this person moves a lot.\n'
    if percentile > 0.75:
        return 'When taking the median of the degree of movement, this person moves quite a lot.\n'
    return ''

In [ ]:
def describe_mei_mean(score, percentile):
    """Comment on movement when the mean motion-energy percentile is extreme.

    NOTE(review): as with describe_mei_median, both branches describe a high
    amount of movement for opposite extremes — verify the intended wording.
    """
    if percentile < 0.25:
        return 'When averaging the degree of movement, this person moves a lot.\n'
    if percentile > 0.75:
        return 'When averaging the degree of movement, this person moves quite a lot.\n'
    return ''

In [ ]:
def describe_nothing(score, percentile):
    """No-op interpreter for features that get no bespoke commentary."""
    return ''

In [ ]:
def describe_mei(mei_scores, training_data, feature_influence, feature_human_explanations):
    """Compose the motion-energy (MEI) section of the visual report."""
    strongest_factors = get_strongest_factors(feature_influence, 'mei')

    parts = [
        '** FEATURES OBTAINED FROM MOTION ENERGY ANALYSIS **\n',
        'I focused on the person\'s face and verified how much movement was detected over time.\n\n',
    ]
    # One feature block per MEI statistic, each with its own interpreter.
    for feature_key, interpreter in (('Entropy', describe_entropy),
                                     ('Median', describe_mei_median),
                                     ('Mean', describe_mei_mean)):
        parts.append(describe_individual_feature(mei_scores, training_data, strongest_factors, feature_key, interpreter, feature_human_explanations))

    return ''.join(parts)

In [ ]:
def describe_openface(openface_scores, training_data, feature_influence, feature_human_explanations):
    """Compose the facial Action Unit (OpenFace) section of the visual report.

    The original implementation repeated the same call ~60 times; this version
    drives the identical output from a table of facial regions and their
    Action Units. Output is byte-for-byte the same.
    """

    def _au_feature_keys(au):
        # Each AU contributes presence + max/mean intensity features, except
        # AU28, which only has a presence ('_c') feature in the data.
        keys = ['%% presence (%s_c)' % au]
        if au != 'AU28':
            # NB: 'max intensity' is followed by TWO spaces in the feature names.
            keys.append('max intensity  (%s_r)' % au)
            keys.append('mean intensity (%s_r)' % au)
        return keys

    # Facial regions and the Action Units located there, in report order.
    regions = [
        ('FEATURES FROM THE EYES', ['AU01', 'AU02', 'AU04', 'AU05', 'AU07']),
        ('FEATURES FROM THE MOUTH', ['AU10', 'AU12', 'AU15', 'AU20', 'AU23', 'AU25', 'AU28']),
        ('FEATURES FROM THE CHIN', ['AU17', 'AU14', 'AU26']),
        ('FEATURES FROM OTHER AREAS', ['AU06', 'AU09', 'AU45']),
    ]

    openface_description = '** FEATURES OBTAINED FROM FACIAL ACTION UNIT ANALYSIS **\n'
    openface_description += 'I focused on Action Units in the person\'s face: activity of dedicated face muscles.\n'
    openface_description += 'These values may say something about how expressive the person is.\n\n'
    strongest_factors = get_strongest_factors(feature_influence, 'openface')

    sections = []
    for region_title, action_units in regions:
        section = region_title + '\n'
        for au in action_units:
            for feature_key in _au_feature_keys(au):
                section += describe_individual_feature(openface_scores, training_data, strongest_factors, feature_key, describe_nothing, feature_human_explanations)
        sections.append(section)

    # Sections are separated by a blank line, matching the original layout.
    openface_description += '\n'.join(sections)
    return openface_description

In [ ]:
def describe_visual_aspects(mei_scores, openface_scores, training_data, feature_influence, feature_human_explanations):
    """Compose the full visual-features report (MEI + OpenFace sections).

    Aborts early with an apology when MEI entropy is below 0.5, which marks a
    known detection bug: the tracker locked onto a background poster instead
    of the person, so no visual assessment is possible."""
    visual_report = '*******************\n* VISUAL FEATURES *\n*******************\n'
    visual_report += 'Here is the report on what I could \'see\':\n\n'

    # Catch for detection bug
    if mei_scores['Entropy'] < 0.5:
        return visual_report + '...oops. It looks like the person had posters in the background, and I actually focused on a poster instead of the person. Nothing more to assess visually...'

    visual_report += describe_mei(mei_scores, training_data, feature_influence, feature_human_explanations)
    visual_report += '---------------------------------------------------\n\n'
    visual_report += describe_openface(openface_scores, training_data, feature_influence, feature_human_explanations)
    visual_report += '\n'
    return visual_report

In [ ]:
def describe_word_count(score, percentile):
    """Comment on speech quantity when the word-count percentile is in an
    extreme quartile; mid-range counts yield no comment."""
    if percentile < 0.25:
        return 'This person does not speak a lot.\n'
    if percentile > 0.75:
        return 'This person speaks a lot.\n'
    return ''

In [ ]:
def describe_unique_words(score, percentile):
    """Comment on vocabulary size when the unique-word-count percentile is in
    an extreme quartile; mid-range counts yield no comment.

    Returns a newline-terminated sentence, or '' for mid-range percentiles.
    """
    unique_word_description = ''

    # BUG FIX: the original messages were swapped. A LOW percentile of unique
    # word count means a SMALL vocabulary; a HIGH percentile means a LARGE one.
    if percentile < 0.25:
        unique_word_description = 'This person has an unusually small vocabulary.'
    elif percentile > 0.75:
        unique_word_description = 'This person has an unusually large vocabulary.'

    if len(unique_word_description) > 0:
        unique_word_description += '\n'

    return unique_word_description

In [ ]:
def describe_txt(txt_scores, training_data, feature_influence, feature_human_explanations):
    """Compose the simple text-statistics section of the language report."""
    strongest_factors = get_strongest_factors(feature_influence, 'txt')

    parts = [
        '** FEATURES OBTAINED FROM SIMPLE TEXT ANALYSIS **\n',
        'Cognitive capability may be important for the job. I looked at a few very simple text statistics first.\n\n',
        describe_individual_feature(txt_scores, training_data, strongest_factors, 'word_count', describe_word_count, feature_human_explanations),
        describe_individual_feature(txt_scores, training_data, strongest_factors, 'word_count_unique', describe_unique_words, feature_human_explanations),
    ]
    return ''.join(parts)

In [ ]:
def describe_ari(score, percentile):
    """Map an Automated Readability Index score to a US grade-level sentence.

    Grade mapping taken from
    https://en.wikipedia.org/wiki/Automated_readability_index
    (`percentile` is accepted for interface uniformity but unused).
    """
    # grade_names[k] is the level for a (ceiled) ARI score of k + 1.
    grade_names = [
        'Kindergarten', 'First Grade', 'Second Grade', 'Third Grade',
        'Fourth Grade', 'Fifth Grade', 'Sixth Grade', 'Seventh Grade',
        'Eighth Grade', 'Ninth Grade', 'Tenth Grade', 'Eleventh Grade',
        'Twelfth Grade',
    ]
    rounded = int(np.ceil(score))
    if rounded <= 1:
        score_interpretation = grade_names[0]
    elif rounded <= 13:
        score_interpretation = grade_names[rounded - 1]
    else:
        score_interpretation = 'College'

    return 'According to the ARI score, the estimated educational level needed to understand this person is %s.\n' % score_interpretation

In [ ]:
def describe_gunning_fog(score, percentile):
    """Map a Gunning Fog index to the US reading level needed to understand the text.

    Mapping taken from https://en.wikipedia.org/wiki/Gunning_fog_index
    (`percentile` is accepted for interface uniformity but unused).
    """
    score = np.ceil(score)
    if score < 6:
        score_interpretation = 'Under Sixth Grade'
    elif score == 6:
        score_interpretation = 'Sixth Grade'
    elif score == 7:
        score_interpretation = 'Seventh Grade'
    elif score == 8:
        score_interpretation = 'Eighth Grade'
    elif score == 9:
        score_interpretation = 'High School Freshman'
    elif score == 10:
        score_interpretation = 'High School Sophomore'
    elif score == 11:
        score_interpretation = 'High School Junior'
    elif score == 12:
        score_interpretation = 'High School Senior'
    elif score == 13:
        score_interpretation = 'College Freshman'
    elif score == 14:
        score_interpretation = 'College Sophomore'
    elif score == 15:
        # BUG FIX: score 15 ('College Junior') was missing from the original
        # chain and incorrectly fell through to 'College Graduate'.
        score_interpretation = 'College Junior'
    elif score == 16:
        score_interpretation = 'College Senior'
    else:
        score_interpretation = 'College Graduate'

    gunning_fog_description = 'According to the Gunning Fog Index, the estimated educational level needed to understand this person is %s.\n' % score_interpretation

    return gunning_fog_description

In [ ]:
def describe_flesch_kincaid_ease(score, percentile):
    """Map a Flesch-Kincaid reading-ease score to a readability sentence.

    Band mapping taken from
    https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
    (`percentile` is accepted for interface uniformity but unused).
    """
    # (exclusive lower bound, interpretation) — checked highest band first.
    bands = (
        (90, 'very easy. Easily understood by an average 11-year-old student.'),
        (80, 'easy. Conversational English for consumers.'),
        (70, 'fairly easy to understand.'),
        (60, 'plain English. Easily understood by 13- to 15-year-old students.'),
        (50, 'fairly difficult.'),
        (30, 'difficult.'),
    )
    score_interpretation = 'very difficult to understand. Best understood by university graduates.'
    for lower_bound, interpretation in bands:
        if score > lower_bound:
            score_interpretation = interpretation
            break

    return 'According to the Flesch-Kincaid reading ease score, this person\'s text is %s' % score_interpretation

In [ ]:
def describe_lix(score, percentile):
    """Map a LIX readability score to a US grade-level sentence.

    Grade boundaries taken from (Anderson, 1983).
    (`percentile` is accepted for interface uniformity but unused.)
    """
    # upper_bounds[k] is the inclusive maximum (ceiled) LIX score for labels[k];
    # anything above the last boundary maps to 'College'.
    upper_bounds = [9, 14, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55]
    labels = [
        'First Grade', 'Second Grade', 'Third Grade', 'Fourth Grade',
        'Fifth Grade', 'Sixth Grade', 'Seventh Grade', 'Eighth Grade',
        'Ninth Grade', 'Tenth Grade', 'Eleventh Grade', 'Twelfth Grade',
        'College',
    ]
    rounded = int(np.ceil(score))
    score_interpretation = labels[bisect.bisect_left(upper_bounds, rounded)]

    return 'According to the LIX score, the estimated educational level needed to understand this person is %s.\n' % score_interpretation

In [ ]:
def describe_rix(score, percentile):
    """Map a RIX readability score to a US grade-level sentence.

    Grade boundaries taken from (Anderson, 1983).
    (`percentile` is accepted for interface uniformity but unused.)
    """
    # thresholds[k] is the exclusive upper bound of the RIX range for labels[k];
    # scores at or above the last threshold map to 'College'.
    thresholds = [0.2, 0.5, 0.8, 1.3, 1.8, 2.4, 3.0, 3.7, 4.5, 5.3, 6.2, 7.2]
    labels = [
        'First Grade', 'Second Grade', 'Third Grade', 'Fourth Grade',
        'Fifth Grade', 'Sixth Grade', 'Seventh Grade', 'Eighth Grade',
        'Ninth Grade', 'Tenth Grade', 'Eleventh Grade', 'Twelfth Grade',
        'College',
    ]
    score_interpretation = labels[bisect.bisect_right(thresholds, score)]

    return 'According to the RIX score, the estimated educational level needed to understand this person is %s.\n' % score_interpretation

In [ ]:
def describe_readability(readability_scores, training_data, feature_influence, feature_human_explanations):
    """Compose the readability-metrics section of the language report."""
    strongest_factors = get_strongest_factors(feature_influence, 'readability')

    # Metrics in report order, paired with their interpreter functions.
    metric_describers = (
        ('ARI', describe_ari),
        ('Coleman Liau Index', describe_nothing),
        ('Flesch-Kincaid Grade Level', describe_nothing),
        ('SMOG Index', describe_nothing),
        ('LIX', describe_lix),
        ('RIX', describe_rix),
        ('Gunning Fog Index', describe_gunning_fog),
        ('Flesch Reading Ease', describe_flesch_kincaid_ease),
    )

    readability_description = '** FEATURES OBTAINED FROM READABILITY ANALYSIS **\n'
    readability_description += 'As slightly more sophisticated measure, I ran several readability metrics.\n'
    readability_description += 'Note that several of these were originally designed for larger, written texts. This is why metrics may disagree.\n\n'
    for feature_key, interpreter in metric_describers:
        readability_description += describe_individual_feature(readability_scores, training_data, strongest_factors, feature_key, interpreter, feature_human_explanations)

    return readability_description

In [ ]:
def describe_linguistic_use(txt_scores, readability_scores, training_data, feature_influence, feature_human_explanations):
    """Compose the full 'use of language' report: simple text statistics
    followed by readability metrics."""
    sections = [
        '*******************\n* USE OF LANGUAGE *\n*******************\n',
        'Here is the report on the person\'s language use:\n\n',
        describe_txt(txt_scores, training_data, feature_influence, feature_human_explanations),
        '---------------------------------------------------\n\n',
        describe_readability(readability_scores, training_data, feature_influence, feature_human_explanations),
    ]
    return ''.join(sections)

In [ ]:
def assess_video(txt_scores, readability_scores, mei_scores, openface_scores, training_data, feature_influence, feature_human_explanations):
    """Assemble the complete qualitative assessment: the language report
    followed by the visual report, each terminated by a newline."""
    linguistic_report = describe_linguistic_use(txt_scores, readability_scores, training_data, feature_influence, feature_human_explanations)
    visual_report = describe_visual_aspects(mei_scores, openface_scores, training_data, feature_influence, feature_human_explanations)
    return '%s\n%s\n' % (linguistic_report, visual_report)

In [ ]:
def feature_csv_to_dict(path):
    """Load a feature CSV into a nested dict: {video_id: {feature_name: float}}.

    The CSV's unnamed first column (header '') is treated as the video id;
    every other column is parsed as a float feature value.

    Parameters
    ----------
    path : str
        Path to the CSV file.

    Raises
    ------
    ValueError if a feature cell cannot be parsed as a float.
    """
    feature_dict = {}
    # BUG FIX: the original opened the file without ever closing it (resource
    # leak). Use a context manager; newline='' is the documented mode for csv.
    with open(path, newline='') as csv_file:
        for entry in csv.DictReader(csv_file):
            video_id = entry['']
            feature_dict[video_id] = {key: float(value)
                                      for key, value in entry.items()
                                      if key != ''}
    return feature_dict

In [ ]:
def generate_qualitative_descriptions(predictions, test_data_txt, test_data_readability, test_data_mei, test_data_openface, train_data, feature_influence, feature_human_explanations):
    """Render a textual assessment report for every predicted video.

    Returns a dict mapping video_id -> (report_text, None); the None slot
    mirrors the expected submission format.
    """
    banner = '****************************************************\n'
    descriptions = {}
    for video_id, predicted_score in predictions.items():
        # Collect the report pieces in order, then join once.
        parts = [banner]
        parts.append('* ASSESSMENT REPORT FOR VIDEO %s: *\n' % video_id)
        parts.append(banner + '\n')
        parts.append('On a scale from 0.0 to 1.0, I would rate this person\'s interviewability as %f.\n' % predicted_score)
        parts.append('Below, I will report on linguistic and visual assessment of the person.\nPercentiles are obtained by comparing the person against scores of 6000 earlier assessed people.\n\n')
        parts.append('---------------------------------------------------\n\n')
        parts.append(assess_video(test_data_txt[video_id], test_data_readability[video_id], test_data_mei[video_id], test_data_openface[video_id], train_data, feature_influence, feature_human_explanations))
        descriptions[video_id] = (''.join(parts), None)
    return descriptions

Mappings from features to human-understandable descriptions


In [ ]:
# Map of human descriptions
# Maps raw feature-column names (exactly as they appear in the feature CSVs)
# to human-readable explanations used when rendering the assessment reports.
# We do not use the Centers of Mass, as our motion energy images are on segmented faces.
# CoM would likely be a useful features on 'full' videos though.
#
# Key groups:
#   '% presence (AUxx_c)'         — how often a facial Action Unit was detected
#                                   (presumably OpenFace '_c' presence outputs;
#                                   verify against OpenFace.csv headers)
#   'max/mean intensity (AUxx_r)' — peak / average Action Unit intensity
#   readability scores            — ARI, LIX, RIX, Flesch, SMOG, etc.
#   motion-energy statistics      — Mean, Median, Entropy
#   word counts                   — word_count, word_count_unique
#
# NOTE(review): the 'max intensity' keys contain a DOUBLE space before the
# parenthesis — presumably mirroring the CSV column headers; do not "fix" it.
human_descriptions = \
{'% presence (AU01_c)': 'Action Unit 1: how often was the inner brow raised?',
 '% presence (AU02_c)': 'Action Unit 2: how often was the outer brow raised?',
 '% presence (AU04_c)': 'Action Unit 4: how often was the brow lowered?',
 '% presence (AU05_c)': 'Action Unit 5: how often was the upper lid raised?',
 '% presence (AU06_c)': 'Action Unit 6: how often was the cheek raised?',
 '% presence (AU07_c)': 'Action Unit 7: how often was the eyelid tightened?',
 '% presence (AU09_c)': 'Action Unit 9: how often did the nose wrinkle?',
 '% presence (AU10_c)': 'Action Unit 10: how often was the upper lip raised?',
 '% presence (AU12_c)': 'Action Unit 12: how often was the lip corner pulled?',
 '% presence (AU14_c)': 'Action Unit 14: how often was the dimple present?',
 '% presence (AU15_c)': 'Action Unit 15: how often was the lip corner depressed?',
 '% presence (AU17_c)': 'Action Unit 17: how often was the chin raised?',
 '% presence (AU20_c)': 'Action Unit 20: how often was the lip stretched?',
 '% presence (AU23_c)': 'Action Unit 23: how often was the lip tightened?',
 '% presence (AU25_c)': 'Action Unit 25: how often did the lips part?',
 '% presence (AU26_c)': 'Action Unit 26: how often did the jaw drop?',
 '% presence (AU28_c)': 'Action Unit 28: how often was the lip sucked?',
 '% presence (AU45_c)': 'Action Unit 45: how often did the person blink?',
 'ARI': 'US grade level required for comprehension according to the ARI score',
 'Coleman Liau Index': 'US grade level required for comprehension according to the Coleman Liau score',
 'Entropy': 'Motion energy entropy: how varied is the degree of movement across the person\'s face?',
 'Flesch Reading Ease': 'Reading ease according to the Flesch score',
 'Flesch-Kincaid Grade Level': 'US grade level required for comprehension according to the Flesch-Kincaid score',
 'Gunning Fog Index': 'US grade level required for comprehension according to the Gunning-Fog score',
 #'Horizontal CoM': 'Horizontal center of face',
 'LIX': 'Readability assessment according to the Lix score',
 'Mean': 'Mean motion energy: how much does the person move on average?',
 'Median': 'Median motion energy: what is the typical degree of movement of this person?',
 'RIX': 'Readability assessment according to the RIX score',
 'SMOG Index': 'Years of reading required to understand the text according to the SMOG score',
 #'Vertical CoM': 'Vertical center of face',
 'max intensity  (AU01_r)': 'Action Unit 1: how much was the inner brow raised at most?',
 'max intensity  (AU02_r)': 'Action Unit 2: how much was the outer brow raised at most?',
 'max intensity  (AU04_r)': 'Action Unit 4: how much was the brow lowered at most?',
 'max intensity  (AU05_r)': 'Action Unit 5: how much was the upper lid raised at most?',
 'max intensity  (AU06_r)': 'Action Unit 6: how much was the cheek raised at most?',
 'max intensity  (AU07_r)': 'Action Unit 7: how much was the eyelid tightened at most?',
 'max intensity  (AU09_r)': 'Action Unit 9: how much did the nose wrinkle at most?',
 'max intensity  (AU10_r)': 'Action Unit 10: how much was the upper lip raised at most?',
 'max intensity  (AU12_r)': 'Action Unit 12: how much was the lip corner pulled at most?',
 'max intensity  (AU14_r)': 'Action Unit 14: how much was the dimple present at most?',
 'max intensity  (AU15_r)': 'Action Unit 15: how much was the lip corner depressed at most?',
 'max intensity  (AU17_r)': 'Action Unit 17: how much was the chin raised at most?',
 'max intensity  (AU20_r)': 'Action Unit 20: how much was the lip stretched at most?',
 'max intensity  (AU23_r)': 'Action Unit 23: how much was the lip tightened at most?',
 'max intensity  (AU25_r)': 'Action Unit 25: how much did the lips part at most?',
 'max intensity  (AU26_r)': 'Action Unit 26: how much did the jaw drop at most?',
 'max intensity  (AU28_r)': 'Action Unit 28: how much was the lip sucked at most?',
 'max intensity  (AU45_r)': 'Action Unit 45: how much did the person blink at most?',
 'mean intensity (AU01_r)': 'Action Unit 1: how much was the inner brow raised on average?',
 'mean intensity (AU02_r)': 'Action Unit 2: how much was the outer brow raised on average?',
 'mean intensity (AU04_r)': 'Action Unit 4: how much was the brow lowered on average?',
 'mean intensity (AU05_r)': 'Action Unit 5: how much was the upper lid raised on average?',
 'mean intensity (AU06_r)': 'Action Unit 6: how much was the cheek raised on average?',
 'mean intensity (AU07_r)': 'Action Unit 7: how much was the eyelid tightened on average?',
 'mean intensity (AU09_r)': 'Action Unit 9: how much did the nose wrinkle on average?',
 'mean intensity (AU10_r)': 'Action Unit 10: how much was the upper lip raised on average?',
 'mean intensity (AU12_r)': 'Action Unit 12: how much was the lip corner pulled on average?',
 'mean intensity (AU14_r)': 'Action Unit 14: how much was the dimple present on average?',
 'mean intensity (AU15_r)': 'Action Unit 15: how much was the lip corner depressed on average?',
 'mean intensity (AU17_r)': 'Action Unit 17: how much was the chin raised on average?',
 'mean intensity (AU20_r)': 'Action Unit 20: how much was the lip stretched on average?',
 'mean intensity (AU23_r)': 'Action Unit 23: how much was the lip tightened on average?',
 'mean intensity (AU25_r)': 'Action Unit 25: how much did the lips part on average?',
 'mean intensity (AU26_r)': 'Action Unit 26: how much did the jaw drop on average?',
 'mean intensity (AU28_r)': 'Action Unit 28: how much was the lip sucked on average?',
 'mean intensity (AU45_r)': 'Action Unit 45: how much did the person blink on average?',
 'word_count': 'Amount of spoken words',
 'word_count_unique': 'Amount of unique words'}

Generating the actual descriptions


In [ ]:
# Read training data from combined csv: we will use this as comparison data for assessing percentiles
# Path is relative to this notebook's location — TODO confirm the
# 'feature extraction' directory layout before a fresh run.
train_data = pd.read_csv('../feature extraction/features/Train/combined.csv')

In [ ]:
# Feature-influence scores exported by the regression stage.
# Pickle files must be opened in binary mode ('rb') — required under
# Python 3 — and the context manager closes the handle, which the
# original bare open() leaked.
with open('../regression/feature_influence.pkl', 'rb') as f:
    feature_influence = pickle.load(f)

Validation set


In [ ]:
# Per-modality feature dicts for the validation set, keyed by video id:
# text statistics, readability scores, OpenFace action units, and
# motion-energy-image statistics.
val_data_txt = feature_csv_to_dict('../feature extraction/features/Validation/Txt.csv')
val_data_readability = feature_csv_to_dict('../feature extraction/features/Validation/Readability.csv')
val_data_openface = feature_csv_to_dict('../feature extraction/features/Validation/OpenFace.csv')
val_data_mei = feature_csv_to_dict('../feature extraction/features/Validation/MEI.csv')

In [ ]:
# Hand-picked validation videos to include in the qualitative submission.
# (Backslash continuations are unnecessary inside brackets.)
validation_subset = [
    'cT3oyHhUznw.000.mp4',
    'sHVXhr7_EOs.000.mp4',
    'ax8wm9K41og.002.mp4',
    'B2riMsP8LD8.002.mp4',
    'kSk-rf7a1Ig.004.mp4',
    'o2wtRccAgjE.005.mp4',
    'DVh_7dO2cWY.001.mp4',
    '2SzC9dm4Yy4.001.mp4',
    '7fOxteINSUg.002.mp4',
    'EvZ0esZgPK4.005.mp4',
]

In [ ]:
# Predictions for the full validation set (one dict per target variable).
# 'rb' is required for pickle under Python 3; the context manager closes
# the handle, which the original bare open() leaked.
with open('../submission/Validation/prediction_all.pkl', 'rb') as f:
    all_predictions = pickle.load(f)

In [ ]:
# Render a textual report for every validation video; the training data
# supplies the reference distributions used for the percentile statements.
all_descriptions = generate_qualitative_descriptions(all_predictions['interview'], val_data_txt, val_data_readability, val_data_mei, val_data_openface, train_data, feature_influence, human_descriptions)

In [ ]:
# Keep only the hand-picked validation videos, carrying over both their
# descriptions and their per-variable predictions.
predictions = {}
descriptions = {}
for video_id in all_descriptions:
    if video_id not in validation_subset:
        continue
    descriptions[video_id] = all_descriptions[video_id]
    for variable, variable_predictions in all_predictions.items():
        predictions.setdefault(variable, {})[video_id] = variable_predictions[video_id]

In [ ]:
# Persist the filtered predictions and descriptions for the validation
# submission. Context managers guarantee the handles are flushed and
# closed even if pickle.dump raises mid-write.
with open('../submission/Validation/prediction.pkl', 'wb') as f:
    pickle.dump(predictions, f)
with open('../submission/Validation/description.pkl', 'wb') as f:
    pickle.dump(descriptions, f)

Test set


In [ ]:
# Per-modality feature dicts for the test set, keyed by video id —
# mirrors the validation loading cell above.
test_data_txt = feature_csv_to_dict('../feature extraction/features/Test/Txt.csv')
test_data_readability = feature_csv_to_dict('../feature extraction/features/Test/Readability.csv')
test_data_openface = feature_csv_to_dict('../feature extraction/features/Test/OpenFace.csv')
test_data_mei = feature_csv_to_dict('../feature extraction/features/Test/MEI.csv')

In [ ]:
# Predictions for the test set, produced by the regression stage.
# 'rb' is required for pickle under Python 3; the context manager closes
# the handle, which the original bare open() leaked.
with open('../submission/Test/prediction.pkl', 'rb') as f:
    predictions = pickle.load(f)

In [ ]:
# Render the qualitative report for every test-set video.
descriptions = generate_qualitative_descriptions(predictions['interview'], test_data_txt, test_data_readability, test_data_mei, test_data_openface, train_data, feature_influence, human_descriptions)

In [ ]:
# Persist the test-set descriptions for submission; the context manager
# guarantees the handle is flushed and closed even on error.
with open('../submission/Test/description.pkl', 'wb') as f:
    pickle.dump(descriptions, f)

In [ ]:
# Reload the validation artifacts to spot-check what was written out.
with open('../submission/Validation/prediction.pkl', 'rb') as f:
    prediction_val = pickle.load(f)
with open('../submission/Validation/description.pkl', 'rb') as f:
    description_val = pickle.load(f)

In [ ]:
# Display the reloaded validation predictions — the bare expression at the
# end of a cell renders its value via the notebook's rich repr.
prediction_val