In [1]:
# coding: utf-8
import pandas as pd
from math import log
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
%matplotlib inline

Loading the dataframes

Fill the dict `df_dict` with one dataframe per (book, frame name) pair, keyed as `<book>_<frame_name>`, e.g. `aubonheurdesdames_gender_nosolo`.


In [2]:
# One identifier per novel; it is the prefix of every metadata file name.
books = [
    'aubonheurdesdames',
    'candide',
    'lassommoir',
    'letourdumondeen80jours',
    'madamebovary',
]

# Location and extension of the metadata files.
path = 'metadata/'
suffix = '.csv'

# Frames whose name starts with one of these get a 'Predictor' column on load.
pred_col = ['count', 'proximity', 'job']

# Job frames are named <predictor>_<full|expo>_<const|decr>, predictor-major.
job_names = [
    '{}_{}_{}'.format(predictor, span, weight)
    for predictor in pred_col
    for span in ('full', 'expo')
    for weight in ('const', 'decr')
]
gender_names = ['gender_' + variant for variant in ('nosolo', 'solo', 'nosolo_w', 'solo_w')]
sentiment_names = ['sentiment_' + variant for variant in ('nosolo', 'solo')]

# Every frame name that is loaded for each book.
frame_names = job_names + gender_names + sentiment_names


# Maps '<book>_<frame_name>' to the DataFrame loaded from the matching CSV.
df_dict = {}

def load_frames():
    """(Re)load every metadata CSV into the module-level ``df_dict``.

    For each book in ``books`` and each name in ``frame_names`` the file
    ``path + '<book>_<name>' + suffix`` is read and stored under the key
    ``'<book>_<name>'``, replacing any frame already stored there.

    * The ``'Unnamed: 0'`` column (the index written by ``to_csv``) is
      dropped when present.
    * When the first ``_``-separated token of the frame name is listed in
      ``pred_col``, a constant ``'Predictor'`` column holding that token is
      added.

    Returns:
        None. ``df_dict`` is updated in place.
    """
    for b in books:
        for name in frame_names:
            key = b + '_' + name
            frame = pd.read_csv(path + key + suffix)
            # errors='ignore': a CSV written without its index has no
            # 'Unnamed: 0' column, which must not abort the whole load.
            frame = frame.drop('Unnamed: 0', axis=1, errors='ignore')

            predictor = name.split('_')[0]
            if predictor in pred_col:
                frame['Predictor'] = predictor

            df_dict[key] = frame

load_frames()

Run the cell below to save each book's per-character gender name scores to a CSV file.


In [11]:
# Write one '<book>_char_name_scores.csv' per book, holding the character
# names and their Name_score taken from the 'gender_nosolo' frame.
for book in books:
    name_scores = df_dict[book + '_gender_nosolo'][['Character', 'Name_score']]
    name_scores.to_csv('metadata/' + book + '_char_name_scores.csv')

Job

Plot data for job predictor


In [39]:
job_cols = df_dict['aubonheurdesdames_count_full_const'].columns
# sentiment_cols = df_dict['letourdumondeen80jours_sentiment_nosolo'].columns

In [40]:
palette = sns.color_palette()
sns.set_style('whitegrid')

In [41]:
def char_similarity(df, name='', character=''):
    """Mean similarity per rank, optionally restricted to one character.

    Args:
        df: DataFrame with 'Rank' and 'Similarity' columns (and 'Character'
            when ``character`` is given). Ignored when ``name`` is non-empty.
        name: Optional key into ``df_dict``. When given, that frame is used
            instead of ``df`` and its missing numeric values are replaced by
            the column mean.
        character: Optional character name; when given only that
            character's rows are kept.

    Returns:
        A one-column DataFrame ('Similarity') indexed by 'Rank' holding the
        mean similarity for each rank.
    """
    if name:
        df = df_dict[name]
        # numeric_only=True: the frames also hold string columns (e.g.
        # 'Character'), on which DataFrame.mean() raises in pandas >= 2.0.
        df = df.fillna(df.mean(numeric_only=True))
    if character:
        df = df[df.Character == character]

    return df.groupby(['Rank'])['Similarity'].mean().to_frame()

WINDOW = 5


In [42]:
# Compare the three predictors ('count', 'proximity', 'job') in their
# 'full_decr' variant: mean Similarity per Rank over all books.
plot_names = ['_count_full_decr', '_proximity_full_decr', '_job_full_decr']

# One frame per predictor, stacking every book. pd.concat replaces
# DataFrame.append (removed in pandas 2.0); ignore_index gives unique row
# labels instead of repeating each book's 0..n index.
stacked = []
for plot_name in plot_names:
    per_predictor = pd.concat([df_dict[book + plot_name] for book in books], ignore_index=True)
    # Fill missing numeric values with the column mean. numeric_only=True
    # because the frames also carry string columns.
    stacked.append(per_predictor.fillna(per_predictor.mean(numeric_only=True)))

df1, df2, df3 = stacked
df = pd.concat(stacked, ignore_index=True)

fig, ax = plt.subplots(figsize=(12, 8))

sns.pointplot(x='Rank', y="Similarity", hue="Predictor", linestyles='--', data=df, dodge=True, ax=ax)
ax.legend(fontsize=20)
plt.xlabel('Rank', fontsize=20)
plt.ylabel('Similarity', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18);



In [43]:
len(df.Character.unique())


Out[43]:
108

In [44]:
# Compare the four variants of the 'job' predictor: mean Similarity per Rank
# over all books, one hue level per variant.
plot_names = ['_job_full_const', '_job_full_decr', '_job_expo_const', '_job_expo_decr']
variant_labels = ['full-const', 'full-decreasing', 'exposition-const', 'exposition-decreasing']

# pd.concat replaces DataFrame.append (removed in pandas 2.0); ignore_index
# gives unique row labels instead of repeating each book's 0..n index.
stacked = []
for plot_name, variant_label in zip(plot_names, variant_labels):
    per_variant = pd.concat([df_dict[book + plot_name] for book in books], ignore_index=True)
    # Fill missing numeric values with the column mean. numeric_only=True
    # because the frames also carry string columns.
    per_variant = per_variant.fillna(per_variant.mean(numeric_only=True))
    # Every frame here has Predictor == 'job'; relabel it with the variant
    # so the hue separates the four curves.
    per_variant['Predictor'] = variant_label
    stacked.append(per_variant)

df1, df2, df3, df4 = stacked
df = pd.concat(stacked, ignore_index=True)

fig, ax = plt.subplots(figsize=(12, 8))

sns.pointplot(x='Rank', y="Similarity", hue="Predictor", data=df, linestyles='--', dodge=True, ax=ax)

ax.legend(fontsize=20)
plt.xlabel('Rank', fontsize=20)
plt.ylabel('Similarity', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18);



In [45]:
# Keep only the rows of df1 (the stacked '_job_full_const' frames built in
# the previous cell) whose Rank is below 5. This rebinds df1.
# NOTE(review): the literal 5 equals the otherwise unused WINDOW constant —
# confirm whether WINDOW was meant to be used here.
df1 = df1[df1.Rank < 5]
# Number of distinct characters with at least one remaining row whose
# Similarity is exactly 1.0.
len(df1[df1.Similarity == 1.0].Character.unique())


Out[45]:
77

In [46]:
# Per-character similarity of the 'count' and 'proximity' predictors
# ('full_const' variant) for one book, restricted to rows with Rank < 3.
book = 'aubonheurdesdames'
name1 = book + '_count_full_const'
name2 = book + '_proximity_full_const'

# pd.concat replaces DataFrame.append (removed in pandas 2.0); ignore_index
# avoids duplicate row labels in the frame handed to seaborn.
plotter = pd.concat([df_dict[name1], df_dict[name2]], ignore_index=True)
plotter = plotter[plotter.Rank < 3]
fig, ax = plt.subplots(figsize=(14,10))

sns.swarmplot(
    x='Character',
    y='Similarity',
    hue='Predictor',
    palette={'count': palette[0], 'proximity': palette[2]},
    data=plotter,
    ax=ax, 
    size=8)
plt.xticks(rotation=90)
plt.suptitle(book, fontsize=18)
ax.legend(
   loc='best',
   fontsize=14)
ax.tick_params(labelsize=14)
plt.xlabel('Character', fontsize=16)
plt.ylabel('Similarity score', fontsize=16)
plt.show()



In [47]:
# Per-character similarity of the 'count_full_const' predictor for one book,
# coloured by Rank.
book = 'aubonheurdesdames'
name = book + '_count_full_const'

fig, ax = plt.subplots(figsize=(15,10))
sns.swarmplot(data=df_dict[name], x='Character', y='Similarity', hue='Rank', ax=ax)

# Labels, ticks and legend all go through the axes created above.
ax.set_xlabel('Character', fontsize=16)
ax.set_ylabel('Similarity score', fontsize=16)
ax.tick_params(labelsize=14)
plt.xticks(rotation=90)
ax.legend(loc='best', fontsize=14)
plt.show()


Gender


In [48]:
load_frames()

In [49]:
gender_cols = df_dict['aubonheurdesdames_gender_nosolo'].columns
gender_cols


Out[49]:
Index(['Character', 'Label', 'Prediction', 'Score', 'Title_score',
       'Title_in_name', 'Adj_score', 'Pron_score', 'Art_score', 'Name_score'],
      dtype='object')

In [50]:
to_drop = ['Character', 'Prediction', 'Score', 'Label']

In [51]:
# Stack the 'gender_solo' frame of every book. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
df_gender = pd.concat([df_dict[book + '_gender_solo'] for book in books], ignore_index=True)

# Binary target: 1 where Label == 'f', 0 otherwise (missing labels included).
y = (df_gender.Label == 'f').astype(int)

# Keep only the score columns as features. The non-in-place drop makes
# re-running this cell safe.
df_gender = df_gender.drop(to_drop, axis=1)
print(df_gender.columns)

# Standardise the features. From here on df_gender is a NumPy array, not a
# DataFrame.
scaler = StandardScaler()
df_gender = scaler.fit_transform(df_gender)


Index(['Title_score', 'Title_in_name', 'Adj_score', 'Pron_score', 'Art_score',
       'Name_score'],
      dtype='object')

In [52]:
lr = LogisticRegression(max_iter=3000)

model = lr.fit(df_gender, y)
model.coef_


Out[52]:
array([[ 1.5588331 ,  1.76710963,  0.75286715,  0.36801215,  0.6566884 ,
         1.63657683]])

In [53]:
# Logistic regression fitted by SGD on the scaled features.
# * `n_iter` was removed from scikit-learn; `max_iter=1000` with `tol=None`
#   keeps the fixed 1000 epochs it used to request.
# * The logistic loss is spelled 'log_loss' since scikit-learn 1.1 (use
#   'log' on older releases).
# * random_state pins the shuffling so the coefficients are reproducible.
sgd = SGDClassifier(loss='log_loss', max_iter=1000, tol=None, random_state=0)
model = sgd.fit(df_gender, y)
model.coef_


Out[53]:
array([[ 2.30452409,  3.5210679 ,  1.17069687,  0.38949494,  1.52449719,
         2.13227496]])

In [54]:
# For every book, put the 'gender_nosolo' and 'gender_solo' columns side by
# side; the join is on the row index and the suffixes tell the two apart.
# NOTE(review): this assumes both frames list the characters in the same row
# order — confirm.
df_joined_gend_preds = {
    book: df_dict[book + '_' + gender_names[0]].join(
        df_dict[book + '_' + gender_names[1]], lsuffix='_nosolo', rsuffix='_solo')
    for book in books
}

# Stack the books in the order of `books`. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
df_append_gender = pd.concat(
    [df_joined_gend_preds[book] for book in books], ignore_index=True)

# Binary target: 1 where the label is 'f', 0 otherwise.
y = (df_append_gender.Label_nosolo == 'f').astype(int)

# Drop the non-feature columns of both halves. The non-in-place drop makes
# re-running this cell safe.
app_to_drop = [col + side for side in ('_nosolo', '_solo') for col in to_drop]
df_append_gender = df_append_gender.drop(app_to_drop, axis=1)

In [55]:
df_append_gender.columns


Out[55]:
Index(['Title_score_nosolo', 'Title_in_name_nosolo', 'Adj_score_nosolo',
       'Pron_score_nosolo', 'Art_score_nosolo', 'Name_score_nosolo',
       'Title_score_solo', 'Title_in_name_solo', 'Adj_score_solo',
       'Pron_score_solo', 'Art_score_solo', 'Name_score_solo'],
      dtype='object')

In [56]:
df_append_gender = scaler.fit_transform(df_append_gender)

In [57]:
model = lr.fit(df_append_gender, y)
model.coef_


Out[57]:
array([[ 0.57685525,  1.03790065, -0.31592233,  0.70378482,  0.47732604,
         0.88937771,  1.25842376,  1.03790065,  0.92622479, -0.08157582,
         0.34194243,  0.88937771]])

In [58]:
model = sgd.fit(df_append_gender, y)
model.coef_


Out[58]:
array([[-1.39786918,  2.07074811, -0.88366027,  1.35936419,  1.14580895,
         1.26845868,  3.87666763,  2.07074811,  1.86531963, -0.37126775,
         1.05789118,  1.26845868]])

With weights


In [62]:
load_frames()

In [87]:
def class_metrics(name, print_=True):
    """Print classification metrics for one gender predictor over all books.

    The frames ``df_dict['<book>_<name>']`` of every book in ``books`` are
    stacked and their 'Prediction' column is compared with 'Label', taking
    'f' as the positive class. Rows whose prediction is neither 'f' nor 'm'
    (e.g. missing) fall into no confusion-matrix cell and so are excluded
    from every metric.

    Args:
        name: Frame name without the book prefix, e.g. ``'gender_nosolo'``.
        print_: When true, print accuracy, precision, recall and F-score.
            The two size lines are printed regardless.

    Returns:
        None. All results are printed.
    """
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df_gender = pd.concat([df_dict[book + '_' + name] for book in books], ignore_index=True)

    y = df_gender[['Label', 'Prediction']]
    print('full size', y.shape[0])
    # Rows without a missing value in any column.
    print('predicted size', df_gender.dropna().shape[0])

    def _count(label, prediction):
        # Number of rows with this (true label, predicted label) pair.
        return int(((y.Label == label) & (y.Prediction == prediction)).sum())

    def _ratio(numerator, denominator):
        # A metric with an empty denominator is reported as 0.0 rather than
        # raising ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    tp = _count('f', 'f')
    fn = _count('f', 'm')
    tn = _count('m', 'm')
    fp = _count('m', 'f')

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    acc = _ratio(tp + tn, tp + tn + fp + fn)
    fscore = _ratio(2 * (precision * recall), precision + recall)

    if print_:
        print('accuracy:{a} \nprecision:{p} \nrecall:{r} \nfscore:{f}'.format(
                a=acc, p=precision, r=recall, f=fscore))

In [88]:
class_metrics('gender_nosolo')


full size 152
predicted size 126
accuracy:0.873015873015873 
precision:0.7580645161290323 
recall:0.9791666666666666 
fscore:0.8545454545454544

In [69]:
class_metrics('gender_nosolo_w')


full size 152
predicted size 125
accuracy:0.88 
precision:0.7666666666666667 
recall:0.9787234042553191 
fscore:0.8598130841121495

In [70]:
class_metrics('gender_solo')


full size 152
predicted size 123
accuracy:0.8780487804878049 
precision:0.7931034482758621 
recall:0.9387755102040817 
fscore:0.8598130841121495

In [71]:
class_metrics('gender_solo_w')


full size 152
predicted size 123
accuracy:0.8699186991869918 
precision:0.7796610169491526 
recall:0.9387755102040817 
fscore:0.8518518518518519

Plot


In [72]:
# Stack the 'gender_nosolo_w' frame of every book. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
df_gender = pd.concat([df_dict[book + '_gender_nosolo_w'] for book in books], ignore_index=True)

# Keep the rows that have both a label and a prediction. The explicit copy
# lets later cells add columns to `y` without a SettingWithCopyWarning.
y = df_gender[['Label', 'Prediction']].dropna().copy()

In [73]:
# Tag every row as 'correct' when the prediction equals the label, and
# 'wrong' otherwise.
y['Class'] = (y.Label == y.Prediction).map({True: 'correct', False: 'wrong'})

In [74]:
# Two long-format frames with the same columns ('Gender', 'Label'): one
# holds the true genders, the other the predicted ones. Here 'Label' records
# which of the two a row comes from.
true = y[['Label']].rename(columns={'Label': 'Gender'})
true['Label'] = 'True'

pred = y[['Prediction']].rename(columns={'Prediction': 'Gender'})
pred['Label'] = 'Predicted'

In [75]:
# Stack true and predicted genders for plotting. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
plotter = pd.concat([true, pred], ignore_index=True)
plotter.columns


Out[75]:
Index(['Gender', 'Label'], dtype='object')

In [80]:
# Number of correct and wrong predictions for each predicted gender.
fig, ax = plt.subplots(figsize=(12,10))
sns.countplot(data=y, x='Prediction', hue='Class', palette=[palette[0], palette[2]], ax=ax)
ax.set_xlabel('Gender', fontsize=20)
ax.set_ylabel('Count', fontsize=20)
ax.tick_params(labelsize=18)
ax.legend(loc='best', fontsize=20);



In [81]:
# Number of characters per gender, true labels next to predicted ones.
fig, ax = plt.subplots(figsize=(12,10))
sns.countplot(data=plotter, x='Gender', hue='Label', ax=ax)
ax.set_xlabel('Gender', fontsize=20)
# NOTE(review): 16 here versus 20 for the x label — confirm this is intended.
ax.set_ylabel('Count', fontsize=16)
ax.tick_params(labelsize=18)
ax.legend(loc='best', fontsize=20);


Sentiment


In [89]:
sentiment_cols = df_dict['aubonheurdesdames_sentiment_nosolo'].columns
sentiment_cols


Out[89]:
Index(['Character', 'Label', 'Pos_count', 'Pos_prob', 'Neg_count', 'Neg_prob',
       'Neut_count', 'Neut_prob'],
      dtype='object')

In [90]:
sentiment_names


Out[90]:
['sentiment_nosolo', 'sentiment_solo']

In [91]:
# Stack the first sentiment frame ('sentiment_nosolo') of every book.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. The
# result is a new frame, so normalising it later leaves df_dict untouched.
name = sentiment_names[0]
stacked_sent = pd.concat([df_dict[book + '_' + name] for book in books], ignore_index=True)

In [4]:
def div_total(df):
    """Turn the three sentiment counts of ``df`` into shares, in place.

    Adds a 'Total_count' column (Pos_count + Neg_count + Neut_count) and
    overwrites 'Pos_count', 'Neg_count' and 'Neut_count' with their value
    divided by that total.

    The frame is modified in place and nothing is returned. The function is
    not idempotent: a second call recomputes 'Total_count' from the shares,
    so the raw total is lost.
    """
    count_cols = ['Pos_count', 'Neg_count', 'Neut_count']
    df['Total_count'] = df['Pos_count'] + df['Neg_count'] + df['Neut_count']
    shares = df[count_cols].div(df['Total_count'], axis=0)
    for col in count_cols:
        df[col] = shares[col]

In [93]:
div_total(stacked_sent)

In [94]:
# One long-format frame per sentiment: the (normalised) count goes into a
# common 'Count' column and 'Sentiment' names the class it came from.
pos = stacked_sent[['Pos_count']].rename(columns={'Pos_count': 'Count'}).assign(Sentiment='Pos')
neg = stacked_sent[['Neg_count']].rename(columns={'Neg_count': 'Count'}).assign(Sentiment='Neg')
neut = stacked_sent[['Neut_count']].rename(columns={'Neut_count': 'Count'}).assign(Sentiment='Neut')

In [95]:
# Stack the three sentiment frames. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
# NOTE: this rebinds `pos`, so re-running this cell alone stacks neg/neut a
# second time — re-run the previous cell first.
pos = pd.concat([pos, neg, neut], ignore_index=True)

In [117]:
# Mean share of each sentiment class over all characters.
# plt.subplots returns (figure, axes) in that order; the original bound the
# two names the wrong way round.
fig, ax = plt.subplots(figsize=(12,10))
sns.barplot(x='Sentiment', y='Count', data=pos, palette=[palette[1], palette[2], palette[0]], ax=ax)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.xlabel('Sentiment', fontsize=20)
plt.ylabel('mean(Count)', fontsize=20);



In [97]:
# Load the '..._sentiment_nosolo_top.csv' file for Au Bonheur des Dames.
abdd_top = pd.read_csv('metadata/aubonheurdesdames_sentiment_nosolo_top.csv')
# Convert its sentiment counts to shares and add 'Total_count' (in place).
div_total(abdd_top)

In [98]:
abdd_top[['Character', 'Label', 'Pos_count', 'Neg_count', 'Neut_count', 'Total_count']]


Out[98]:
Character Label Pos_count Neg_count Neut_count Total_count
0 Denise pos 0.074364 0.048924 0.876712 511.0
1 Mouret pos 0.097561 0.043360 0.859079 369.0
2 Hutin neutral 0.032258 0.032258 0.935484 124.0
3 Bourdoncle pos 0.085470 0.042735 0.871795 117.0

In [99]:
stacked_sent[['Character', 'Label', 'Pos_count', 'Neg_count', 'Neut_count', 'Total_count']].head(10)


Out[99]:
Character Label Pos_count Neg_count Neut_count Total_count
0 Denise neutral 0.058824 0.058824 0.882353 51.0
1 Mouret pos 0.222222 0.055556 0.722222 36.0
2 Hutin neutral 0.000000 0.000000 1.000000 12.0
3 Bourdoncle neutral 0.000000 0.000000 1.000000 11.0
4 MmeDesforges pos 0.200000 0.000000 0.800000 10.0
5 Baudu neutral 0.000000 0.000000 1.000000 10.0
6 MmeAurélie neutral 0.100000 0.100000 0.800000 10.0
7 Pauline pos 0.300000 0.000000 0.700000 10.0
8 Favier neutral 0.000000 0.000000 1.000000 10.0
9 Robineau pos 0.200000 0.000000 0.800000 10.0

Tourdumonde


In [100]:
load_frames()

In [101]:
# Load the '..._sentiment_nosolo_top.csv' file for Le Tour du monde en 80 jours.
tdm_top = pd.read_csv('metadata/letourdumondeen80jours_sentiment_nosolo_top.csv')
# Convert its sentiment counts to shares and add 'Total_count' (in place).
div_total(tdm_top)

In [102]:
tdm_top[['Character', 'Label', 'Pos_count', 'Neg_count', 'Neut_count', 'Total_count']]


Out[102]:
Character Label Pos_count Neg_count Neut_count Total_count
0 Passepartout pos 0.106965 0.037313 0.855721 402.0
1 PhileasFogg pos 0.092857 0.017857 0.889286 280.0
2 MrFogg pos 0.049242 0.011364 0.939394 264.0
3 Fix pos 0.071713 0.035857 0.892430 251.0
4 MrsAouda pos 0.062016 0.038760 0.899225 129.0

In [103]:
# Work on a copy: div_total modifies its argument in place, and normalising
# the frame stored in df_dict would silently change it for every later cell
# (and corrupt it if this cell is run twice).
tdm = df_dict['letourdumondeen80jours_sentiment_nosolo'].copy()
div_total(tdm)
tdm.head()


Out[103]:
Character Label Pos_count Pos_prob Neg_count Neg_prob Neut_count Neut_prob Total_count
0 Passepartout pos 0.125000 19.637560 0.000000 20.362440 0.875000 34.087500 40.0
1 PhileasFogg pos 0.107143 13.642083 0.035714 14.357917 0.857143 22.708333 28.0
2 MrFogg pos 0.038462 12.511667 0.000000 13.488333 0.961538 22.768333 26.0
3 Fix pos 0.080000 11.602000 0.040000 13.398000 0.880000 19.806667 25.0
4 MrsAouda neg 0.083333 5.898333 0.166667 6.101667 0.750000 8.950000 12.0

L'assommoir


In [19]:
# Load the '..._sentiment_nosolo_top.csv' file for L'Assommoir.
lass_top = pd.read_csv('metadata/lassommoir_sentiment_nosolo_top.csv')
# Convert its sentiment counts to shares and add 'Total_count' (in place).
div_total(lass_top)

In [21]:
lass_top[['Character', 'Label', 'Pos_count', 'Neg_count', 'Neut_count', 'Total_count']]


Out[21]:
Character Label Pos_count Neg_count Neut_count Total_count
0 Henriette pos 0.166667 0.0 0.833333 6.0
1 Auguste pos 0.166667 0.0 0.833333 6.0
2 Hardi pos 0.200000 0.0 0.800000 5.0
3 Baquet pos 0.400000 0.0 0.600000 5.0

LIWC scores


In [21]:
# List the LIWC columns.
# NOTE(review): `abdd` is only assigned in the cells that follow, so on a
# fresh kernel with Run All this cell appears to raise NameError — confirm
# and move it below the LIWC load.
abdd.columns


Out[21]:
Index(['Seg', 'WC', 'WPS', 'Sixltr', 'Dic', 'Numerals', 'fonction', 'pronom',
       'pronomp', 'je', 'nous', 'vous', 'il', 'ils', 'pronomimp', 'article',
       'verbe', 'verbeauxi', 'verbepassÈ', 'verbeprÈsent', 'verbefutur',
       'adverbe', 'prÈposition', 'conjonction', 'nÈgation', 'quantifieur',
       'nombre', 'juron', 'social', 'famille', 'ami', 'humain', 'affect',
       'Èmopos', 'ÈmonÈg', 'anxiÈtÈ', 'colËre', 'tristesse', 'cognition',
       'perspicacitÈ', 'cause', 'divergence', 'tentative', 'certitude',
       'inhibition', 'inclusion', 'exclusion', 'perception', 'voir',
       'entendre', 'sentir', 'biologique', 'corps', 'santÈ', 'sexualitÈ',
       'alimentation', 'relativitÈ', 'mouvement', 'espace', 'temps', 'travail',
       'accomplissement', 'loisir', 'maison', 'argent', 'religion', 'mort',
       'consentement', 'hÈsitation', 'remplisseur', 'Period', 'Comma', 'Colon',
       'SemiC', 'QMark', 'Exclam', 'Dash', 'Quote', 'Apostro', 'Parenth',
       'OtherP', 'AllPct'],
      dtype='object')

In [8]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 is
# its replacement: the first column ('Filename') becomes the index.
# NOTE(review): the accented column names look mis-decoded ('Èmopos');
# check the file encoding.
abdd = pd.read_csv('LIWC/aubonheurdesdames.txt', sep='\t', index_col=0)
abdd[['affect', 'Èmopos', 'ÈmonÈg']]


Out[8]:
affect Èmopos ÈmonÈg
Filename
Bourdoncle.txt 9.62 5.09 4.04
Denise.txt 10.27 5.99 3.88
Hutin.txt 8.35 4.46 3.38
MmeDesforges.txt 9.66 6.55 3.19
Mouret.txt 11.18 5.81 4.87
Untitled.txt 2.01 0.60 1.20

In [13]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 is
# its replacement: the first column ('Filename') becomes the index.
abdd = pd.read_csv('LIWC-raw/aubonheurdesdames.txt', sep='\t', index_col=0)
abdd[['affect', 'Èmopos', 'ÈmonÈg']]


Out[13]:
affect Èmopos ÈmonÈg
Filename
Bourdoncle.txt 5.31 2.84 2.20
Denise.txt 5.47 3.19 2.06
Hutin.txt 4.70 2.55 1.87
MmeDesforges.txt 5.23 3.53 1.74
Mouret.txt 6.02 3.13 2.61
Untitled.txt 2.01 0.60 1.20

In [18]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 is
# its replacement: the first column ('Filename') becomes the index.
mbov = pd.read_csv('LIWC/madamebovary.txt', sep='\t', index_col=0)
mbov[['affect', 'Èmopos', 'ÈmonÈg']]


Out[18]:
affect Èmopos ÈmonÈg
Filename
Bovary.txt 8.76 5.33 3.29
Charles.txt 8.88 4.65 3.84
Emma.txt 10.31 5.89 4.25
Léon.txt 9.21 5.85 3.43
Rodolphe.txt 9.32 5.33 3.61
Untitled.txt 2.01 0.60 1.20

In [14]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 is
# its replacement: the first column ('Filename') becomes the index.
mbov = pd.read_csv('LIWC-raw/madamebovary.txt', sep='\t', index_col=0)
mbov[['affect', 'Èmopos', 'ÈmonÈg']]


Out[14]:
affect Èmopos ÈmonÈg
Filename
Bovary.txt 4.74 2.84 1.76
Charles.txt 4.73 2.46 2.03
Emma.txt 5.55 3.16 2.28
Léon.txt 4.88 3.08 1.81
Rodolphe.txt 5.13 2.87 1.94
Untitled.txt 2.01 0.60 1.20

In [10]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 is
# its replacement: the first column ('Filename') becomes the index.
lass = pd.read_csv('LIWC/lassommoir.txt', sep='\t', index_col=0)
lass[['affect', 'Èmopos', 'ÈmonÈg']]


Out[10]:
affect Èmopos ÈmonÈg
Filename
Boche.txt 7.00 4.60 2.50
Coupeau.txt 8.34 4.43 3.82
Gervaise.txt 8.96 4.96 3.90
Lantier.txt 9.82 5.84 3.70
Lorilleux.txt 7.93 4.33 3.53
Untitled.txt 2.01 0.60 1.20

In [15]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 is
# its replacement: the first column ('Filename') becomes the index.
lass = pd.read_csv('LIWC-raw/lassommoir.txt', sep='\t', index_col=0)
lass[['affect', 'Èmopos', 'ÈmonÈg']]


Out[15]:
affect Èmopos ÈmonÈg
Filename
Boche.txt 3.66 2.39 1.30
Coupeau.txt 4.57 2.42 2.10
Gervaise.txt 4.88 2.68 2.13
Lantier.txt 5.18 3.10 1.94
Lorilleux.txt 4.32 2.36 1.92
Untitled.txt 2.01 0.60 1.20

In [11]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 is
# its replacement: the first column ('Filename') becomes the index.
ltdm = pd.read_csv('LIWC/letourdumondeen80jours.txt', sep='\t', index_col=0)
ltdm[['affect', 'Èmopos', 'ÈmonÈg']]


Out[11]:
affect Èmopos ÈmonÈg
Filename
Fix.txt 8.65 5.92 2.57
MrFogg.txt 8.62 5.83 2.63
MrsAouda.txt 9.00 6.47 2.60
Passepartout.txt 7.97 4.81 2.87
PhileasFogg.txt 7.38 4.89 2.46
Untitled.txt 2.01 0.60 1.20

In [16]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 is
# its replacement: the first column ('Filename') becomes the index.
ltdm = pd.read_csv('LIWC-raw/letourdumondeen80jours.txt', sep='\t', index_col=0)
ltdm[['affect', 'Èmopos', 'ÈmonÈg']]


Out[16]:
affect Èmopos ÈmonÈg
Filename
Fix.txt 4.68 3.22 1.38
MrFogg.txt 4.53 3.08 1.37
MrsAouda.txt 4.52 3.26 1.30
Passepartout.txt 4.53 2.74 1.61
PhileasFogg.txt 3.85 2.55 1.28
Untitled.txt 2.01 0.60 1.20

In [12]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 is
# its replacement: the first column ('Filename') becomes the index.
cand = pd.read_csv('LIWC/candide.txt', sep='\t', index_col=0)
cand[['affect', 'Èmopos', 'ÈmonÈg']]


Out[12]:
affect Èmopos ÈmonÈg
Filename
Cacambo.txt 8.58 6.07 2.34
Candide.txt 9.87 6.94 2.78
Cunégonde.txt 11.21 7.86 3.22
Martin.txt 9.34 6.45 2.81
Pangloss.txt 10.57 7.09 3.35
Untitled.txt 2.01 0.60 1.20

In [17]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 is
# its replacement: the first column ('Filename') becomes the index.
cand = pd.read_csv('LIWC-raw/candide.txt', sep='\t', index_col=0)
cand[['affect', 'Èmopos', 'ÈmonÈg']]


Out[17]:
affect Èmopos ÈmonÈg
Filename
Cacambo.txt 4.39 3.10 1.20
Candide.txt 5.30 3.73 1.49
Cunégonde.txt 5.87 4.12 1.69
Martin.txt 4.93 3.40 1.48
Pangloss.txt 5.81 3.90 1.84
Untitled.txt 2.01 0.60 1.20

In [ ]: