In [1]:
# coding: utf-8
import pandas as pd
from math import log
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
%matplotlib inline
In [2]:
# Corpus: five French novels; these slugs key every metadata CSV file name.
books = ['aubonheurdesdames', 'candide', 'lassommoir', 'letourdumondeen80jours', 'madamebovary']
# Gender-prediction result frames -- NOTE(review): '_w' presumably marks weighted variants; confirm.
gender_names = ['gender_nosolo', 'gender_solo', 'gender_nosolo_w', 'gender_solo_w']
# Sentiment result frames (with/without solo mentions).
sentiment_names = ['sentiment_nosolo', 'sentiment_solo']
# Job-predictor frames: predictor (count/proximity/job) x window (full/expo) x weighting (const/decr).
job_names = ['count_full_const', 'count_full_decr', 'count_expo_const', 'count_expo_decr',
'proximity_full_const', 'proximity_full_decr', 'proximity_expo_const', 'proximity_expo_decr',
'job_full_const', 'job_full_decr', 'job_expo_const', 'job_expo_decr']
frame_names = job_names + gender_names + sentiment_names
# File naming: path + <book> + '_' + <frame_name> + suffix
suffix = '.csv'
path = 'metadata/'
# Frame-name prefixes that get a 'Predictor' column attached on load.
pred_col = ['count', 'proximity', 'job']
# Global cache of every loaded DataFrame, keyed '<book>_<frame_name>'.
df_dict = {}
def load_frames():
    """Load every <book>_<frame_name> CSV from `path` into the global df_dict.

    For each frame the serialized 'Unnamed: 0' index column is dropped, and a
    'Predictor' column (the frame-name prefix) is added to job-related frames
    so they can be distinguished after stacking.  Re-running this resets any
    frames that later cells mutated in place.
    """
    for book in books:
        for name in frame_names:
            key = book + '_' + name
            frame = pd.read_csv(path + key + suffix)
            # Drop the serialized index column; no inplace, keeps the load idempotent.
            frame = frame.drop('Unnamed: 0', axis=1)
            predictor = name.split('_')[0]
            if predictor in pred_col:
                frame['Predictor'] = predictor
            df_dict[key] = frame
load_frames()
To save the gender name scores to CSV files, execute the cell below.
In [11]:
# Persist each book's character name-gender scores for reuse elsewhere.
for book in books:
df_dict[book + '_gender_nosolo'][['Character', 'Name_score']].to_csv('metadata/' + book+ '_char_name_scores.csv')
Plot data for job predictor
In [39]:
# Shared column layout of the job-predictor frames (includes the 'Predictor'
# column added by load_frames).
job_cols = df_dict['aubonheurdesdames_count_full_const'].columns
# sentiment_cols = df_dict['letourdumondeen80jours_sentiment_nosolo'].columns
In [40]:
# Plot defaults: seaborn palette + whitegrid style for all figures below.
palette = sns.color_palette()
sns.set_style('whitegrid')
In [41]:
def char_similarity(df, name='', character=''):
    """Mean similarity per rank for a results frame.

    If `name` is given, the frame is taken from the global df_dict instead of
    `df`, with NaNs imputed by column means.  If `character` is given, rows are
    first restricted to that character.  Returns a one-column DataFrame indexed
    by Rank.
    """
    if name:
        # Use mean of col to fill gaps in the cached frame.
        df = df_dict[name].pipe(lambda f: f.fillna(f.mean()))
    if character:
        df = df.loc[df.Character == character]
    return df.groupby(['Rank'])['Similarity'].mean().to_frame()
In [42]:
# Stack each book's results for the three "full_decr" predictors and compare
# their similarity by rank in one pointplot.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
plot_names = ['_count_full_decr', '_proximity_full_decr', '_job_full_decr']
df1 = pd.concat([pd.DataFrame(columns=job_cols), *(df_dict[book + plot_names[0]] for book in books)])
df2 = pd.concat([pd.DataFrame(columns=job_cols), *(df_dict[book + plot_names[1]] for book in books)])
df3 = pd.concat([pd.DataFrame(columns=job_cols), *(df_dict[book + plot_names[2]] for book in books)])
# Mean-impute missing numeric values per predictor before stacking them all.
df1 = df1.fillna(df1.mean())
df2 = df2.fillna(df2.mean())
df3 = df3.fillna(df3.mean())
df = pd.concat([df1, df2, df3])
fig, ax = plt.subplots(figsize=(12, 8))
sns.pointplot(x='Rank', y="Similarity", hue="Predictor", linestyles='--', data=df, dodge=True)
ax.legend(fontsize=20)
plt.xlabel('Rank', fontsize=20)
plt.ylabel('Similarity', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18);
In [43]:
len(df.Character.unique())
Out[43]:
In [44]:
# Compare the four job-predictor weighting schemes (full/exposition window x
# constant/decreasing weights) across all books.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
plot_names = ['_job_full_const', '_job_full_decr', '_job_expo_const', '_job_expo_decr']
df1 = pd.concat([pd.DataFrame(columns=job_cols), *(df_dict[book + plot_names[0]] for book in books)])
df2 = pd.concat([pd.DataFrame(columns=job_cols), *(df_dict[book + plot_names[1]] for book in books)])
df3 = pd.concat([pd.DataFrame(columns=job_cols), *(df_dict[book + plot_names[2]] for book in books)])
df4 = pd.concat([pd.DataFrame(columns=job_cols), *(df_dict[book + plot_names[3]] for book in books)])
# Mean-impute missing values, then relabel each variant for the legend.
df1 = df1.fillna(df1.mean())
df2 = df2.fillna(df2.mean())
df3 = df3.fillna(df3.mean())
df4 = df4.fillna(df4.mean())
df1['Predictor'] = 'full-const'
df2['Predictor'] = 'full-decreasing'
df3['Predictor'] = 'exposition-const'
df4['Predictor'] = 'exposition-decreasing'
df = pd.concat([df1, df2, df3, df4])
fig, ax = plt.subplots(figsize=(12, 8))
sns.pointplot(x='Rank', y="Similarity", hue="Predictor", data=df, linestyles='--', dodge=True)
ax.legend(fontsize=20)
plt.xlabel('Rank', fontsize=20)
plt.ylabel('Similarity', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18);
In [45]:
# Restrict the full-const frame to top ranks, then count characters that ever
# reach a perfect similarity score.
df1 = df1[df1.Rank < 5]
len(df1[df1.Similarity == 1.0].Character.unique())
Out[45]:
In [46]:
# Per-character swarm of similarity scores (top two ranks), comparing the
# count and proximity predictors within one book.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
book = 'aubonheurdesdames'
name1 = book + '_count_full_const'
name2 = book + '_proximity_full_const'
plotter = pd.concat([df_dict[name1], df_dict[name2]])
plotter = plotter[plotter.Rank < 3]
fig, ax = plt.subplots(figsize=(14, 10))
sns.swarmplot(
    x='Character',
    y='Similarity',
    hue='Predictor',
    palette={'count': palette[0], 'proximity': palette[2]},
    data=plotter,
    ax=ax,
    size=8)
plt.xticks(rotation=90)
plt.suptitle(book, fontsize=18)
ax.legend(
    loc='best',
    fontsize=14)
ax.tick_params(labelsize=14)
plt.xlabel('Character', fontsize=16)
plt.ylabel('Similarity score', fontsize=16)
plt.show()
In [47]:
# Per-character similarity for a single book, coloured by rank.
fig, ax = plt.subplots(figsize=(15,10))
book = 'aubonheurdesdames'
name = book + '_count_full_const'
sns.swarmplot(
x='Character',
y='Similarity',
data=df_dict[name],
hue='Rank',
ax=ax)
ax.legend(
loc='best',
fontsize=14)
ax.tick_params(labelsize=14)
plt.xlabel('Character', fontsize=16)
plt.ylabel('Similarity score', fontsize=16)
plt.xticks(rotation=90)
plt.show()
In [48]:
# Reload the cached frames -- NOTE(review): presumably to reset df_dict before
# the gender analysis; confirm what the preceding cells mutated.
load_frames()
In [49]:
# Shared column layout of the gender frames.
gender_cols = df_dict['aubonheurdesdames_gender_nosolo'].columns
gender_cols
Out[49]:
In [50]:
# Non-feature columns to exclude before model fitting.
to_drop = ['Character', 'Prediction', 'Score', 'Label']
In [51]:
# Stack the solo-gender frames for every book into one training table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
name = 'gender_solo'
df_gender = pd.concat([pd.DataFrame(columns=gender_cols),
                       *(df_dict[book + '_' + name] for book in books)])
# reassign label: binary target, 1 = 'f' (female), 0 otherwise
y = df_gender.Label.apply(lambda x: 1 if x == 'f' else 0)
# drop non-feature columns (no inplace, keeps the cell idempotent on re-run)
df_gender = df_gender.drop(to_drop, axis=1)
print(df_gender.columns)
# scale features to zero mean / unit variance; fit_transform returns an array
scaler = StandardScaler()
df_gender = scaler.fit_transform(df_gender)
In [52]:
# Fit logistic regression on the standardised solo-gender features and show
# the learned coefficients (one weight per feature column).
lr = LogisticRegression(max_iter=3000)
model = lr.fit(df_gender, y)
model.coef_
Out[52]:
In [53]:
# Fit an SGD logistic-loss classifier for comparison with LogisticRegression.
# API fixes: `n_iter` was replaced by `max_iter` (removed in scikit-learn 0.21)
# and loss='log' was renamed to 'log_loss' ('log' removed in scikit-learn 1.3).
sgd = SGDClassifier(loss='log_loss', max_iter=1000)
model = sgd.fit(df_gender, y)
model.coef_
Out[53]:
In [54]:
# Join each book's nosolo and solo gender predictions side by side, then stack
# the joined frames across books.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_joined_gend_preds = {}
for book in books:
    df_joined_gend_preds[book] = df_dict[book + '_' + gender_names[0]].join(
        df_dict[book + '_' + gender_names[1]], lsuffix='_nosolo', rsuffix='_solo')
df_append_gender = pd.concat([pd.DataFrame(columns=df_joined_gend_preds['candide'].columns),
                              *df_joined_gend_preds.values()])
y = df_append_gender.Label_nosolo
# Drop both suffixed copies of every non-feature column.
app_to_drop = [c + '_nosolo' for c in to_drop] + [c + '_solo' for c in to_drop]
df_append_gender = df_append_gender.drop(app_to_drop, axis=1)
# Binary target: 1 = 'f' (female), 0 otherwise.
y = y.apply(lambda x: 1 if x == 'f' else 0)
In [55]:
# Feature names remaining after the drop above.
df_append_gender.columns
Out[55]:
In [56]:
# Standardise the joined features (fit_transform refits the shared scaler).
df_append_gender = scaler.fit_transform(df_append_gender)
In [57]:
# Logistic-regression coefficients on the joined nosolo+solo features.
model = lr.fit(df_append_gender, y)
model.coef_
Out[57]:
In [58]:
# SGD (logistic-loss) coefficients for comparison.
model = sgd.fit(df_append_gender, y)
model.coef_
Out[58]:
In [62]:
# Reload the cached frames before computing classification metrics.
load_frames()
In [87]:
def class_metrics(name, print_=True):
    """Print accuracy/precision/recall/F1 for one gender-prediction variant.

    Stacks the `<book>_<name>` frames from df_dict across all books and scores
    the 'Prediction' column against 'Label', treating 'f' as the positive
    class.  Always prints the full and predicted (non-NaN) row counts; the
    metric line is printed only when `print_` is true.  Returns None.
    """
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df_gender = pd.concat([pd.DataFrame(columns=gender_cols),
                           *(df_dict[book + '_' + name] for book in books)])
    y = df_gender[['Label', 'Prediction']]
    print('full size', y.shape[0])
    print('predicted size', df_gender.dropna().shape[0])
    females = y[y.Label == 'f']
    tp = females[females.Prediction == 'f'].shape[0]
    fn = females[females.Prediction == 'm'].shape[0]
    males = y[y.Label == 'm']
    tn = males[males.Prediction == 'm'].shape[0]
    fp = males[males.Prediction == 'f'].shape[0]
    # Guard every ratio against a zero denominator (e.g. no positives at all),
    # which previously raised ZeroDivisionError.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    total = tp + tn + fp + fn
    acc = (tp + tn) / total if total else 0.0
    fscore = (2 * (precision * recall) / (precision + recall)
              if (precision + recall) else 0.0)
    if print_:
        print('accuracy:{a} \nprecision:{p} \nrecall:{r} \nfscore:{f}'.format(
            a=acc, p=precision, r=recall, f=fscore))
In [88]:
# Score each gender-prediction variant (nosolo/solo, unweighted/weighted).
class_metrics('gender_nosolo')
In [69]:
class_metrics('gender_nosolo_w')
In [70]:
class_metrics('gender_solo')
In [71]:
class_metrics('gender_solo_w')
In [72]:
# Rebuild label/prediction pairs for the weighted nosolo gender predictor;
# rows without a prediction are dropped.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_gender = pd.concat([pd.DataFrame(columns=gender_cols),
                       *(df_dict[book + '_gender_nosolo_w'] for book in books)])
y = df_gender[['Label', 'Prediction']]
y = y.dropna()
In [73]:
# Tag each row as a correct or wrong prediction for the countplot below.
y.loc[y.Label == y.Prediction, 'Class'] = 'correct'
y.loc[y.Label != y.Prediction, 'Class'] = 'wrong'
In [74]:
# Reshape to long format: a single 'Gender' column plus a 'Label' column that
# distinguishes true labels from predicted ones.
true = y.Label
true = true.to_frame()
true.columns = ['Gender']
true['Label'] = 'True'
pred = y.Prediction
pred = pred.to_frame()
pred.columns = ['Gender']
pred['Label'] = 'Predicted'
In [75]:
# Stack true and predicted gender labels for side-by-side counting.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
plotter = pd.concat([true, pred])
plotter.columns
Out[75]:
In [80]:
# Correct vs wrong prediction counts, split by predicted gender.
fig, ax = plt.subplots(figsize=(12,10))
sns.countplot(x='Prediction', hue='Class', data=y, palette=[palette[0], palette[2]])
ax.legend(
loc='best',
fontsize=20)
ax.tick_params(labelsize=18)
plt.xlabel('Gender', fontsize=20)
plt.ylabel('Count', fontsize=20);
In [81]:
# True vs predicted gender distribution from the long-format frame.
fig, ax = plt.subplots(figsize=(12,10))
sns.countplot(x='Gender', hue='Label', data=plotter)
ax.legend(
loc='best',
fontsize=20)
ax.tick_params(labelsize=18)
plt.xlabel('Gender', fontsize=20)
plt.ylabel('Count', fontsize=16);
In [89]:
# Shared column layout of the sentiment frames.
sentiment_cols = df_dict['aubonheurdesdames_sentiment_nosolo'].columns
sentiment_cols
Out[89]:
In [90]:
sentiment_names
Out[90]:
In [91]:
# Stack the nosolo sentiment frames across all books.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
name = sentiment_names[0]
stacked_sent = pd.concat([pd.DataFrame(columns=sentiment_cols),
                          *(df_dict[book + '_' + name] for book in books)])
In [4]:
def div_total(df):
    """Normalise sentiment counts to fractions of the per-row total (in place).

    Adds a 'Total_count' column (Pos + Neg + Neut) and replaces each count
    column with its share of that total.  Rows whose total is zero yield
    inf/NaN under pandas division semantics.
    """
    cols = ['Pos_count', 'Neg_count', 'Neut_count']
    df['Total_count'] = df[cols[0]] + df[cols[1]] + df[cols[2]]
    fractions = df[cols].div(df.Total_count, axis=0)
    for col in cols:
        df[col] = fractions[col]
In [93]:
# Convert the stacked counts to per-row fractions (adds Total_count, in place).
div_total(stacked_sent)
In [94]:
# Long format: one 'Count' column with a 'Sentiment' tag per polarity.
pos = stacked_sent.Pos_count.to_frame()
neg = stacked_sent.Neg_count.to_frame()
neut = stacked_sent.Neut_count.to_frame()
pos.columns = ['Count']
neg.columns = ['Count']
neut.columns = ['Count']
pos['Sentiment'] = 'Pos'
neg['Sentiment'] = 'Neg'
neut['Sentiment'] = 'Neut'
In [95]:
pos = pos.append(neg).append(neut)
In [117]:
# Mean sentiment fraction per polarity.
# BUG FIX: plt.subplots returns (fig, ax); the original unpacked the pair in
# the wrong order, binding `ax` to the Figure and `fig` to the Axes.
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(x='Sentiment', y='Count', data=pos, palette=[palette[1], palette[2], palette[0]])
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.xlabel('Sentiment', fontsize=20)
plt.ylabel('mean(Count)', fontsize=20);
In [97]:
# Top-character sentiment breakdown for Au Bonheur des Dames, normalised.
abdd_top = pd.read_csv('metadata/aubonheurdesdames_sentiment_nosolo_top.csv')
div_total(abdd_top)
In [98]:
abdd_top[['Character', 'Label', 'Pos_count', 'Neg_count', 'Neut_count', 'Total_count']]
Out[98]:
In [99]:
# Same columns for the full cross-book stack (first ten rows).
stacked_sent[['Character', 'Label', 'Pos_count', 'Neg_count', 'Neut_count', 'Total_count']].head(10)
Out[99]:
In [100]:
# Reload the cached frames, then inspect Le Tour du Monde sentiment tables.
load_frames()
In [101]:
tdm_top = pd.read_csv('metadata/letourdumondeen80jours_sentiment_nosolo_top.csv')
div_total(tdm_top)
In [102]:
tdm_top[['Character', 'Label', 'Pos_count', 'Neg_count', 'Neut_count', 'Total_count']]
Out[102]:
In [103]:
# NOTE(review): div_total mutates the frame inside df_dict in place; the next
# load_frames() call is what restores it.
tdm = df_dict['letourdumondeen80jours_sentiment_nosolo']
div_total(tdm)
tdm.head()
Out[103]:
In [19]:
# Top-character sentiment breakdown for L'Assommoir, normalised.
lass_top = pd.read_csv('metadata/lassommoir_sentiment_nosolo_top.csv')
div_total(lass_top)
In [21]:
lass_top[['Character', 'Label', 'Pos_count', 'Neg_count', 'Neut_count', 'Total_count']]
Out[21]:
In [21]:
# NOTE(review): `abdd` is only defined in a later cell (In [8] below) — this
# cell fails under Restart & Run All; execution counts here are out of order.
abdd.columns
Out[21]:
In [8]:
abdd = pd.DataFrame.from_csv('LIWC/aubonheurdesdames.txt', sep='\t')
abdd[['affect', 'Èmopos', 'ÈmonÈg']]
Out[8]:
In [13]:
abdd = pd.DataFrame.from_csv('LIWC-raw/aubonheurdesdames.txt', sep='\t')
abdd[['affect', 'Èmopos', 'ÈmonÈg']]
Out[13]:
In [18]:
mbov = pd.DataFrame.from_csv('LIWC/madamebovary.txt', sep='\t')
mbov[['affect', 'Èmopos', 'ÈmonÈg']]
Out[18]:
In [14]:
mbov = pd.DataFrame.from_csv('LIWC-raw/madamebovary.txt', sep='\t')
mbov[['affect', 'Èmopos', 'ÈmonÈg']]
Out[14]:
In [10]:
lass = pd.DataFrame.from_csv('LIWC/lassommoir.txt', sep='\t')
lass[['affect', 'Èmopos', 'ÈmonÈg']]
Out[10]:
In [15]:
lass = pd.DataFrame.from_csv('LIWC-raw/lassommoir.txt', sep='\t')
lass[['affect', 'Èmopos', 'ÈmonÈg']]
Out[15]:
In [11]:
ltdm = pd.DataFrame.from_csv('LIWC/letourdumondeen80jours.txt', sep='\t')
ltdm[['affect', 'Èmopos', 'ÈmonÈg']]
Out[11]:
In [16]:
ltdm = pd.DataFrame.from_csv('LIWC-raw/letourdumondeen80jours.txt', sep='\t')
ltdm[['affect', 'Èmopos', 'ÈmonÈg']]
Out[16]:
In [12]:
cand = pd.DataFrame.from_csv('LIWC/candide.txt', sep='\t')
cand[['affect', 'Èmopos', 'ÈmonÈg']]
Out[12]:
In [17]:
cand = pd.DataFrame.from_csv('LIWC-raw/candide.txt', sep='\t')
cand[['affect', 'Èmopos', 'ÈmonÈg']]
Out[17]:
In [ ]: