In [1]:
# Render our plots inline
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
# Plot styling for the whole notebook.
# NOTE(review): the 'display.mpl_style' option appears to exist only in older pandas
# releases (deprecated, later removed) -- confirm the pandas version this runs on.
pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
# Default size (width, height) applied to every matplotlib figure created below.
plt.rcParams['figure.figsize'] = (16, 6)
In [5]:
# Data locations. Set the EMBEM_DATA_DIR / EMBEM_OUTPUT_DIR environment variables
# to point the notebook at your own directories without editing it; when they are
# not set, the original hard-coded directories are used.
embem_data_dir = os.environ.get('EMBEM_DATA_DIR', '/home/jvdzwaan/data/embem/')
output_dir = os.environ.get('EMBEM_OUTPUT_DIR', '/home/jvdzwaan/data/tmp/')
In [8]:
# load data
def load_data(corpus, column_names, corpus_metadata, label_counts, body_parts, emotion_bodypart_pairs):
    """Read one corpus and its dictionary files into a single DataFrame.

    The corpus file is read as a headerless, tab-separated table whose columns
    are named by `column_names`; its first column becomes the index. The other
    four files are read with pandas' default CSV settings (header row present),
    again using the first column as index.

    Returns the five tables concatenated column-wise (axis=1), in the order
    corpus, metadata, label counts, body parts, emotion/body-part pairs.
    """
    texts = pd.read_csv(corpus, sep='\t', header=None, names=column_names, index_col=0)
    dictionary_files = (corpus_metadata, label_counts, body_parts, emotion_bodypart_pairs)
    extras = [pd.read_csv(path, index_col=0) for path in dictionary_files]
    return pd.concat([texts] + extras, axis=1)
def _load_corpus(corpus_file, dict_prefix, column_names):
    """Load one corpus with load_data, deriving its four dict/ file names from `dict_prefix`.

    `corpus_file` is the file name inside `<embem_data_dir>/corpus/`; the
    dictionary files are `<embem_data_dir>/dict/<dict_prefix>_<kind>.csv`.
    """
    def dict_file(kind):
        return os.path.join(embem_data_dir, 'dict/{}_{}.csv'.format(dict_prefix, kind))

    return load_data(os.path.join(embem_data_dir, 'corpus/{}'.format(corpus_file)),
                     column_names,
                     dict_file('additional_metadata'),
                     dict_file('label_counts'),
                     dict_file('heem_expanded_body_parts'),
                     dict_file('emotion_bodypart_pairs'))


corpus_big = _load_corpus('corpus_big.csv', 'corpus_big', ['id', 'year', 'genre', 'title', 'authors'])
# The annotation corpus file is called annotation_corpus.csv, but its dict files use the prefix 'annotation'.
annotation = _load_corpus('annotation_corpus.csv', 'annotation', ['id', 'year', 'genre', 'title', 'authors'])
ceneton = _load_corpus('ceneton.csv', 'ceneton', ['id', 'year', 'genre', 'title', 'authors'])
# EDBO has a single combined 'title+author' column instead of separate title/authors columns.
edbo = _load_corpus('edbo.csv', 'edbo', ['id', 'year', 'genre', 'title+author'])

# complete = all four corpora; combined = everything except the annotation corpus.
# fillna(0) replaces every missing value with 0, in all columns.
complete = pd.concat([annotation, corpus_big, ceneton, edbo]).fillna(0)
combined = pd.concat([corpus_big, ceneton, edbo]).fillna(0)
In [9]:
# Basic statistics: the number of texts (rows) in each dataset.
# Written with print(...) and str.format so the cell produces the same output
# on Python 2 and Python 3.
print('# texts')
for label, frame in [('Corpus big', corpus_big), ('Annotation', annotation),
                     ('Ceneton', ceneton), ('EDBO', edbo),
                     ('Combined', combined), ('Complete', complete)]:
    print('{}: {}'.format(label, len(frame)))
In [10]:
# Number of texts per genre (bar chart plus the underlying counts).
print('Number of texts per genre')
genres = complete.groupby('genre')
texts_per_genre = genres.size()  # computed once, reused for the plot and the printout
texts_per_genre.plot(kind='bar')
print(texts_per_genre)
In [11]:
# Number of texts per period, shown in a fixed order.
print('Number of texts per period')
periods = complete.groupby('period')
# reindex both orders the periods and restricts the view to these three labels;
# any other value in the `period` column is not shown.
texts_per_period = periods.size().reindex(['renaissance', 'classicism', 'enlightenment'])
texts_per_period.plot(kind='bar')
print(texts_per_period)
In [12]:
# Cross-table of text counts: one row per period, one column per genre.
print('Number of texts per genre per period')
df = (
    pd.DataFrame({'count': complete.groupby(['period', 'genre']).size()})
    .reset_index()
    .pivot(index='period', columns='genre', values='count')
    .fillna(0)  # period/genre combinations without texts count as 0
    .reindex(['renaissance', 'classicism', 'enlightenment'])
)
print(df)
df.plot(kind='bar')
Out[12]:
In [13]:
print('Number of texts per year')
years = complete.groupby('year')
texts_per_year = years.size()
# groupby only produces groups for years that occur in the data, so testing
# `size() == 0` always gives 0. Count the years missing inside the covered range.
# NOTE(review): assumes `year` holds integer-like values. `complete` was built with
# fillna(0), so a text without a year would show up as year 0 and stretch the range
# -- confirm against the data.
observed_years = set(int(year) for year in texts_per_year.index)
if observed_years:
    n_years_without_texts = (max(observed_years) - min(observed_years) + 1) - len(observed_years)
else:
    n_years_without_texts = 0
print('Number of years for which 0 texts are available: {}'.format(n_years_without_texts))
texts_per_year.plot(marker='o')
Out[13]:
In [14]:
# Text counts per year (rows) and genre (columns), plotted as one line per genre.
print('Number of texts per genre per year')
year2genre = (
    pd.DataFrame({'count': complete.groupby(['year', 'genre']).size()})
    .reset_index()
    .pivot(index='year', columns='genre', values='count')
    .fillna(0)  # year/genre combinations without texts count as 0
)
year2genre.plot()
Out[14]:
In [15]:
# Add `frac_emotional` = #emotional / #lines to every dataset.
# The division is done on whole columns instead of row by row; `+ 0.0` keeps it
# floating-point for integer counts. A text with 0 lines yields inf/NaN here
# rather than raising an error.
for frame in (complete, combined, annotation, corpus_big, ceneton, edbo):
    frame['frac_emotional'] = (frame['#emotional'] + 0.0) / frame['#lines']

data = [complete['frac_emotional'], combined['frac_emotional'], annotation['frac_emotional'],
        corpus_big['frac_emotional'], ceneton['frac_emotional'], edbo['frac_emotional']]
plt.boxplot(data)
plt.xticks([1, 2, 3, 4, 5, 6], ['Complete', 'Combined', 'Annotation', 'Corpus big', 'Ceneton', 'EDBO'])
plt.title('Fraction of emotional sentences in the different datasets');
In [16]:
from scipy import stats
import statsmodels.api as sm

# One-way ANOVA: does the fraction of emotional lines differ between the four corpora?
f_val, p_val = stats.f_oneway(annotation['frac_emotional'], corpus_big['frac_emotional'],
                              ceneton['frac_emotional'], edbo['frac_emotional'])
print("P value ANOVA: {:10.10f}\n".format(p_val))

# Tag every text with the corpus it came from; a scalar is broadcast to all rows.
annotation['corpus'] = 'annotation'
corpus_big['corpus'] = 'corpus_big'
ceneton['corpus'] = 'ceneton'
edbo['corpus'] = 'edbo'

# `df` (all four corpora with their `corpus` tag) is reused by the later Tukey HSD cells.
df = pd.concat([annotation, corpus_big, ceneton, edbo])
result = sm.stats.multicomp.pairwise_tukeyhsd(df.frac_emotional, df.corpus)
print(result.summary())
In [17]:
# Box plot of the number of lines per text, one box per dataset.
datasets = [complete, combined, annotation, corpus_big, ceneton, edbo]
data = [dataset['#lines'] for dataset in datasets]
plt.boxplot(data)
tick_labels = ['Complete', 'Combined', 'Annotation', 'Corpus big', 'Ceneton', 'EDBO']
plt.xticks(list(range(1, 7)), tick_labels)
plt.title('The number of lines per text in different datasets');
In [18]:
# One-way ANOVA on the number of lines per text across the four corpora,
# followed by Tukey HSD pairwise comparisons on the same column.
f_val, p_val = stats.f_oneway(annotation['#lines'], corpus_big['#lines'], ceneton['#lines'], edbo['#lines'])
print("P value ANOVA: {:10.10f}\n".format(p_val))
# df['#lines'] raises a clear KeyError if the column is missing; df.get would pass None on.
result = sm.stats.multicomp.pairwise_tukeyhsd(df['#lines'], df.corpus)
print(result.summary())
In [19]:
# Box plot of the `avg_labels` column, one box per dataset.
datasets = [complete, combined, annotation, corpus_big, ceneton, edbo]
data = [dataset['avg_labels'] for dataset in datasets]
plt.boxplot(data)
tick_labels = ['Complete', 'Combined', 'Annotation', 'Corpus big', 'Ceneton', 'EDBO']
plt.xticks(list(range(1, 7)), tick_labels);
In [20]:
# One-way ANOVA on `avg_labels` across the four source corpora.
# `complete` and `combined` are concatenations of these corpora, so they are not
# passed as extra groups: that would count the same texts more than once and
# would not match the groups compared by the Tukey HSD test below.
f_val, p_val = stats.f_oneway(annotation['avg_labels'], corpus_big['avg_labels'],
                              ceneton['avg_labels'], edbo['avg_labels'])
print("P value ANOVA: {:10.10f}\n".format(p_val))
result = sm.stats.multicomp.pairwise_tukeyhsd(df['avg_labels'], df.corpus)
print(result.summary())
In [21]:
# Box plot of the `#emotional` column, one box per dataset.
datasets = [complete, combined, annotation, corpus_big, ceneton, edbo]
data = [dataset['#emotional'] for dataset in datasets]
plt.boxplot(data)
tick_labels = ['Complete', 'Combined', 'Annotation', 'Corpus big', 'Ceneton', 'EDBO']
plt.xticks(list(range(1, 7)), tick_labels);
In [22]:
# One-way ANOVA on `#emotional` across the four corpora, followed by Tukey HSD
# pairwise comparisons on the same column (the post-hoc test must use the
# variable that the ANOVA tested).
f_val, p_val = stats.f_oneway(annotation['#emotional'], corpus_big['#emotional'],
                              ceneton['#emotional'], edbo['#emotional'])
print("P value ANOVA: {:10.10f}\n".format(p_val))
result = sm.stats.multicomp.pairwise_tukeyhsd(df['#emotional'], df.corpus)
print(result.summary())
In [24]:
# load label names
import itertools
from embem.emotools.heem_utils import heem_emotion_labels, heem_body_part_labels
# One column name per (emotion label, body part label) combination: '<emotion>_<body part>'.
ebp_labels = ['{}_{}'.format(emotion, part)
              for emotion, part in itertools.product(heem_emotion_labels, heem_body_part_labels)]


def count_pairs(row):
    """Sum the values of all emotion/body-part pair columns present in `row`.

    Labels from `ebp_labels` that are not in the row's index are skipped.
    """
    present = [label for label in ebp_labels if label in row.index]
    return np.sum([row[label] for label in present])


# Store the per-text total in a '#pairs' column of both merged datasets.
for merged in (complete, combined):
    merged.loc[:, '#pairs'] = merged.apply(count_pairs, axis=1)
In [25]:
# Write every dataset to its own CSV file in output_dir so later notebooks can load them directly.
outputs = [
    ('annotation.csv', annotation),
    ('corpus_big.csv', corpus_big),
    ('ceneton.csv', ceneton),
    ('edbo.csv', edbo),
    ('combined.csv', combined),
    ('complete.csv', complete),
]
for file_name, dataset in outputs:
    dataset.to_csv(os.path.join(output_dir, file_name))