In [2]:
# Render our plots inline
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
# Make the graphs a bit prettier.
# The 'display.mpl_style' option only exists in older pandas releases; where it
# has been removed, set_option raises an OptionError (a KeyError subclass) and
# we fall back to matplotlib's 'ggplot' style sheet, the replacement suggested
# by the pandas deprecation note.
try:
    pd.set_option('display.mpl_style', 'default')
except KeyError:
    plt.style.use('ggplot')
# Wide default figure size; set after the style so the style cannot override it.
plt.rcParams['figure.figsize'] = (16, 6)
Please specify the data directories:
- `embem_data_dir` is where the Embodied Emotions data is stored.
- `data_dir` is where the data generated with notebook 03_AnalysisCorpora was saved.
In [3]:
# Directory holding the Embodied Emotions data; the 'dict/*_emotion2bodyparts*.csv'
# tables are read from here.
embem_data_dir = '/home/jvdzwaan/data/embem/'
# Working directory: the per-corpus metadata CSVs (annotation.csv, corpus_big.csv, ...)
# are read from here and the combined em2body_*.csv tables are written back to it.
data_dir = '/home/jvdzwaan/data/tmp/'
In [5]:
# load and combine data (and save it again)
import pandas as pd
import os
def _read_indexed_csv(directory, filename):
    """Load `filename` from `directory`, using its first column as the row index."""
    return pd.read_csv(os.path.join(directory, filename), index_col=0)


def _sum_tables(*tables):
    """Add tables cell by cell; a cell missing from one operand counts as 0.

    The tables are folded from the right, i.e. a.add(b.add(c, ...), ...), using
    DataFrame.add with fill_value=0. The inputs are not modified.
    """
    result = tables[-1]
    for table in reversed(tables[:-1]):
        result = table.add(result, fill_value=0)
    return result


# metadata
annotation = _read_indexed_csv(data_dir, 'annotation.csv')
corpus_big = _read_indexed_csv(data_dir, 'corpus_big.csv')
ceneton = _read_indexed_csv(data_dir, 'ceneton.csv')
edbo = _read_indexed_csv(data_dir, 'edbo.csv')
combined = _read_indexed_csv(data_dir, 'combined.csv')
complete = _read_indexed_csv(data_dir, 'complete.csv')

# emotions2bodyparts (all periods together)
em2body_annotation = _read_indexed_csv(embem_data_dir, 'dict/annotation_emotion2bodyparts.csv')
em2body_corpus_big = _read_indexed_csv(embem_data_dir, 'dict/corpus_big_emotion2bodyparts.csv')
em2body_ceneton = _read_indexed_csv(embem_data_dir, 'dict/ceneton_emotion2bodyparts.csv')
em2body_edbo = _read_indexed_csv(embem_data_dir, 'dict/edbo_emotion2bodyparts.csv')

# emotions2bodyparts per period (cl = classicism, en = enlightenment, re = renaissance)
em2body_annotation_cl = _read_indexed_csv(embem_data_dir, 'dict/annotation_emotion2bodyparts_classicism.csv')
em2body_annotation_en = _read_indexed_csv(embem_data_dir, 'dict/annotation_emotion2bodyparts_enlightenment.csv')
em2body_annotation_re = _read_indexed_csv(embem_data_dir, 'dict/annotation_emotion2bodyparts_renaissance.csv')
em2body_corpus_big_cl = _read_indexed_csv(embem_data_dir, 'dict/corpus_big_emotion2bodyparts_classicism.csv')
em2body_corpus_big_en = _read_indexed_csv(embem_data_dir, 'dict/corpus_big_emotion2bodyparts_enlightenment.csv')
em2body_corpus_big_re = _read_indexed_csv(embem_data_dir, 'dict/corpus_big_emotion2bodyparts_renaissance.csv')
em2body_ceneton_en = _read_indexed_csv(embem_data_dir, 'dict/ceneton_emotion2bodyparts_enlightenment.csv')
em2body_edbo_en = _read_indexed_csv(embem_data_dir, 'dict/edbo_emotion2bodyparts_enlightenment.csv')

# combine datasets:
#   combined = corpus_big + ceneton + edbo
#   complete = annotation + corpus_big + ceneton + edbo
em2body_complete = _sum_tables(em2body_annotation, em2body_corpus_big, em2body_ceneton, em2body_edbo)
em2body_combined = _sum_tables(em2body_corpus_big, em2body_ceneton, em2body_edbo)
# For renaissance and classicism only annotation and corpus_big tables are loaded.
# The combined tables are copies, so that adding 'total' below does not also
# modify the loaded corpus_big tables.
em2body_complete_re = _sum_tables(em2body_annotation_re, em2body_corpus_big_re)
em2body_combined_re = em2body_corpus_big_re.copy()
em2body_complete_cl = _sum_tables(em2body_annotation_cl, em2body_corpus_big_cl)
em2body_combined_cl = em2body_corpus_big_cl.copy()
em2body_complete_en = _sum_tables(em2body_annotation_en, em2body_corpus_big_en, em2body_ceneton_en, em2body_edbo_en)
em2body_combined_en = _sum_tables(em2body_corpus_big_en, em2body_ceneton_en, em2body_edbo_en)

# Each output file name paired with the table stored in it. Every table gets
# its own file; in particular the enlightenment 'combined' table goes to
# em2body_combined_en.csv and must not overwrite the classicism file.
em2body_outputs = [
    ('em2body_complete.csv', em2body_complete),
    ('em2body_combined.csv', em2body_combined),
    ('em2body_complete_re.csv', em2body_complete_re),
    ('em2body_combined_re.csv', em2body_combined_re),
    ('em2body_complete_cl.csv', em2body_complete_cl),
    ('em2body_combined_cl.csv', em2body_combined_cl),
    ('em2body_complete_en.csv', em2body_complete_en),
    ('em2body_combined_en.csv', em2body_combined_en),
]

# add normalizing constant (sum of pairs per emotion label)
for _filename, table in em2body_outputs:
    table.loc[:, 'total'] = table.sum(axis=1)

# save combined datasets
for filename, table in em2body_outputs:
    table.to_csv(os.path.join(data_dir, filename))
In [6]:
# Per row: '#body_parts' divided by '#emotional', stored as 'frac_embodied'
# on each of the six metadata frames (the frames are modified in place).
frames = [complete, combined, annotation, corpus_big, ceneton, edbo]
labels = ['Complete', 'Combined', 'Annotation', 'Corpus big', 'Ceneton', 'EDBO']
for frame in frames:
    # "+ 0.0" forces float division for integer counts
    frame.loc[:, 'frac_embodied'] = frame.apply(
        lambda row: (row['#body_parts'] + 0.0) / row['#emotional'], axis=1)

# One box per dataset, in the same order as the tick labels
data = [frame['frac_embodied'] for frame in frames]
plt.boxplot(data)
plt.xticks(list(range(1, len(labels) + 1)), labels)
plt.title('Fraction of emotional sentences assigned the label ``Lichaamsdeel" in the different datasets');
In [7]:
# Per row: '#replaced' divided by '#emotional', stored as 'frac_embodied_bp'
# on each of the six metadata frames (the frames are modified in place).
frames = [complete, combined, annotation, corpus_big, ceneton, edbo]
labels = ['Complete', 'Combined', 'Annotation', 'Corpus big', 'Ceneton', 'EDBO']
for frame in frames:
    # "+ 0.0" forces float division for integer counts
    frame.loc[:, 'frac_embodied_bp'] = frame.apply(
        lambda row: (row['#replaced'] + 0.0) / row['#emotional'], axis=1)

# One box per dataset, in the same order as the tick labels
data = [frame['frac_embodied_bp'] for frame in frames]
plt.boxplot(data)
plt.xticks(list(range(1, len(labels) + 1)), labels)
plt.title('Fraction of emotional sentences assigned the label ``Lichaamsdeel" containing a known body part in the different datasets');
In [8]:
import numpy as np
# Per row: '#replaced' divided by '#body_parts', stored as 'frac_bp_found'.
frames = [complete, combined, annotation, corpus_big, ceneton, edbo]
labels = ['Complete', 'Combined', 'Annotation', 'Corpus big', 'Ceneton', 'EDBO']
for frame in frames:
    # presumably np.float64 is used so that a zero denominator gives inf/nan
    # instead of raising -- the infinities are replaced right below
    frame.loc[:, 'frac_bp_found'] = frame.apply(
        lambda row: np.float64(row['#replaced']) / row['#body_parts'], axis=1)

# Replace +/-inf by 0 in every column; this creates new frames, so the
# notebook-level names are rebound to the cleaned copies.
# NOTE(review): NaN values (e.g. from 0/0) are not touched by this replace.
frames = [frame.replace([np.inf, -np.inf], 0) for frame in frames]
complete, combined, annotation, corpus_big, ceneton, edbo = frames

# One box per dataset, in the same order as the tick labels
data = [frame['frac_bp_found'] for frame in frames]
plt.boxplot(data)
plt.xticks(list(range(1, len(labels) + 1)), labels)
plt.title('Fraction of sentences assigned the label ``Lichaamsdeel" and containing a known body part in the different datasets');
In [9]:
def get_percentage_recognized_body_parts(df):
    """Return the mean of the 'frac_bp_found' column of `df`.

    The value is the plain column mean (a fraction, not multiplied by 100).
    """
    fractions = df['frac_bp_found']
    return fractions.mean()
# Print the mean 'frac_bp_found' of every dataset, one line per dataset.
# print(...) with a single pre-formatted string behaves the same on Python 2
# and Python 3. The tabs in the labels align the values; label and value are
# joined without an extra space, which matches what the Python 2 print
# statement emits after a string ending in a tab.
print('Percentages of recognized body parts')
for label, frame in [('Complete\t', complete),
                     ('Combined\t', combined),
                     ('Annotation\t', annotation),
                     ('Corpus big\t', corpus_big),
                     ('Ceneton\t\t', ceneton),
                     ('EDBO\t\t', edbo)]:
    print('%s%s' % (label, get_percentage_recognized_body_parts(frame)))
In [10]:
def get_embodied_per_period(df, name):
    """Average the fraction of emotional sentences containing body parts per period.

    This is the fraction of emotional sentences containing embodied emotions.
    For every row, '#body_parts' / '#emotional' is stored in a new column called
    `name`; the rows are then grouped by 'period' and averaged. The input frame
    is left untouched (a copy is modified).

    Returns a DataFrame of per-period column means, indexed in the fixed order
    renaissance, classicism, enlightenment (a period without rows yields NaN).
    """
    period_order = ['renaissance', 'classicism', 'enlightenment']
    with_ratio = df.copy()
    with_ratio.loc[:, name] = with_ratio['#body_parts'] / with_ratio['#emotional']
    return with_ratio.groupby('period').mean().reindex(period_order)
# Per-period means for the three main datasets, placed side by side; each
# dataset contributes a column named after it.
labelled_frames = [('Complete', complete), ('Combined', combined), ('Annotation', annotation)]
df = pd.concat(
    [get_embodied_per_period(frame, label) for label, frame in labelled_frames],
    axis=1)
df[['Complete', 'Combined', 'Annotation']].plot();
# print(...) with a single argument gives the same output on Python 2 and 3
print(df[['Complete', 'Combined', 'Annotation']])
In [11]:
from scipy import stats
import statsmodels.api as sm
def do_anova(df):
    """Run a one-way ANOVA on 'frac_embodied' across the three periods and print it.

    `df` needs a 'period' column containing the groups 'renaissance',
    'classicism' and 'enlightenment' (get_group raises KeyError if one is
    missing) and a 'frac_embodied' column. The p-value is always printed; when
    it is below 0.05 a pairwise Tukey HSD comparison is printed as well.
    Returns nothing.
    """
    # group once and pull out the three samples in a fixed order
    per_period = df.groupby('period')
    samples = [per_period.get_group(period)['frac_embodied']
               for period in ('renaissance', 'classicism', 'enlightenment')]
    _f_val, p_val = stats.f_oneway(*samples)
    # single-argument print(...) behaves the same on Python 2 and Python 3
    print("P value ANOVA: {:10.10f}\n".format(p_val))
    if p_val < 0.05:
        # post-hoc test to see which pairs of periods differ
        result = sm.stats.multicomp.pairwise_tukeyhsd(df.frac_embodied, df.period)
        print(result.summary())
# Run the ANOVA for the three main datasets, separated by a blank line.
# print('') emits an empty line on both Python 2 and Python 3.
for position, (label, frame) in enumerate([('Complete', complete),
                                           ('Combined', combined),
                                           ('Annotation', annotation)]):
    if position > 0:
        print('')
    print(label)
    do_anova(frame)
Conclusie: Het aandeel embodied emotions in het totaal van gevonden emoties neemt significant af van renaissance naar classicism. Het verschil tussen enlightenment en renaissance is ook significant. Het verschil tussen classicism en enlightenment niet. Dit geldt voor de datasets complete en combined.
Opvallend: dit geldt niet voor de annotatieset, hoewel de verschillen groter lijken. Maar dat komt dan waarschijnlijk doordat er relatief weinig stukken in de annotatieset zitten.
Dus: we vinden relatief minder embodied emotions in classicism en enlightenment vergeleken met renaissance.
Ook vind ik het percentage emoties dat embodied is best wel laag (<25%). Opvallend detail.
In [12]:
def _normalize_by_total(table):
    """Divide every row of `table` by its 'total' entry and drop the 'total' column.

    Returns a new frame; `table` itself is not modified.
    """
    return table.div(table['total'], axis='index').drop('total', axis=1)


# normalize data: every table is divided by its OWN totals (the combined
# tables must not be derived from the complete ones), and the normalizing
# constant is removed afterwards because there is no need to plot it.
# NOTE: the names are rebound to tables without a 'total' column, so this cell
# can only be re-run after re-running the cell that adds 'total'.
em2body_complete = _normalize_by_total(em2body_complete)
em2body_combined = _normalize_by_total(em2body_combined)
em2body_complete_re = _normalize_by_total(em2body_complete_re)
em2body_combined_re = _normalize_by_total(em2body_combined_re)
em2body_complete_cl = _normalize_by_total(em2body_complete_cl)
em2body_combined_cl = _normalize_by_total(em2body_combined_cl)
em2body_complete_en = _normalize_by_total(em2body_complete_en)
em2body_combined_en = _normalize_by_total(em2body_combined_en)
In [13]:
# Display the em2body_complete table (rows were divided by their 'total' and the
# 'total' column was dropped in the normalization cell).
em2body_complete
Out[13]:
In [14]:
# Bar chart of the 'Liefde' row of the normalized complete tables, with one
# column (bar colour) per period. The renaissance row defines the index; the
# other periods are aligned to it.
liefde = em2body_complete_re.loc['Liefde'].to_frame(name='renaissance')
for period, table in [('classicism', em2body_complete_cl),
                      ('enlightenment', em2body_complete_en)]:
    liefde[period] = table.loc['Liefde']
liefde.plot(kind='bar')
Out[14]: