In [1]:
# coding: utf-8
import pandas as pd
from math import log
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
%matplotlib inline
In [2]:
# Corpus: five French novels; these slugs key every metadata CSV file name.
books = ['aubonheurdesdames', 'candide', 'lassommoir', 'letourdumondeen80jours', 'madamebovary']
# Gender-prediction result frames -- NOTE(review): '_w' presumably marks weighted variants; confirm.
gender_names = ['gender_nosolo', 'gender_solo', 'gender_nosolo_w', 'gender_solo_w']
# Sentiment result frames (with/without solo mentions).
sentiment_names = ['sentiment_nosolo', 'sentiment_solo']
# Job-predictor frames: predictor (count/proximity/job) x window (full/expo) x weighting (const/decr).
job_names = ['count_full_const', 'count_full_decr', 'count_expo_const', 'count_expo_decr',
'proximity_full_const', 'proximity_full_decr', 'proximity_expo_const', 'proximity_expo_decr',
'job_full_const', 'job_full_decr', 'job_expo_const', 'job_expo_decr']
frame_names = job_names + gender_names + sentiment_names
# File naming: path + <book> + '_' + <frame_name> + suffix
suffix = '.csv'
path = 'metadata/'
# Frame-name prefixes that get a 'Predictor' column attached on load.
pred_col = ['count', 'proximity', 'job']
# Global cache of every loaded DataFrame, keyed '<book>_<frame_name>'.
df_dict = {}
def load_frames():
    """Load every <book>_<frame_name> CSV from `path` into the global df_dict.

    For each frame the serialized 'Unnamed: 0' index column is dropped, and a
    'Predictor' column (the frame-name prefix) is added to job-related frames
    so they can be distinguished after stacking.  Re-running this resets any
    frames that later cells mutated in place.
    """
    for book in books:
        for name in frame_names:
            key = book + '_' + name
            frame = pd.read_csv(path + key + suffix)
            # Drop the serialized index column; no inplace, keeps the load idempotent.
            frame = frame.drop('Unnamed: 0', axis=1)
            predictor = name.split('_')[0]
            if predictor in pred_col:
                frame['Predictor'] = predictor
            df_dict[key] = frame
load_frames()
To save the gender name scores to CSV files, execute the cell below.
In [11]:
# Persist each book's character name-gender scores for reuse elsewhere.
for book in books:
df_dict[book + '_gender_nosolo'][['Character', 'Name_score']].to_csv('metadata/' + book+ '_char_name_scores.csv')
Plot data for job predictor
In [39]:
# Shared column layout of the job-predictor frames (includes the 'Predictor'
# column added by load_frames).
job_cols = df_dict['aubonheurdesdames_count_full_const'].columns
# sentiment_cols = df_dict['letourdumondeen80jours_sentiment_nosolo'].columns
In [40]:
# Plot defaults: seaborn palette + whitegrid style for all figures below.
palette = sns.color_palette()
sns.set_style('whitegrid')
In [41]:
def char_similarity(df, name='', character=''):
    """Mean similarity per rank for a results frame.

    If `name` is given, the frame is taken from the global df_dict instead of
    `df`, with NaNs imputed by column means.  If `character` is given, rows are
    first restricted to that character.  Returns a one-column DataFrame indexed
    by Rank.
    """
    if name:
        # Use mean of col to fill gaps in the cached frame.
        df = df_dict[name].pipe(lambda f: f.fillna(f.mean()))
    if character:
        df = df.loc[df.Character == character]
    return df.groupby(['Rank'])['Similarity'].mean().to_frame()
In [42]:
# Stack each book's results for the three "full_decr" predictors and compare
# their similarity by rank in one pointplot.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
plot_names = ['_count_full_decr', '_proximity_full_decr', '_job_full_decr']
df1 = pd.concat([pd.DataFrame(columns=job_cols), *(df_dict[book + plot_names[0]] for book in books)])
df2 = pd.concat([pd.DataFrame(columns=job_cols), *(df_dict[book + plot_names[1]] for book in books)])
df3 = pd.concat([pd.DataFrame(columns=job_cols), *(df_dict[book + plot_names[2]] for book in books)])
# Mean-impute missing numeric values per predictor before stacking them all.
df1 = df1.fillna(df1.mean())
df2 = df2.fillna(df2.mean())
df3 = df3.fillna(df3.mean())
df = pd.concat([df1, df2, df3])
fig, ax = plt.subplots(figsize=(12, 8))
sns.pointplot(x='Rank', y="Similarity", hue="Predictor", linestyles='--', data=df, dodge=True)
ax.legend(fontsize=20)
plt.xlabel('Rank', fontsize=20)
plt.ylabel('Similarity', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18);
In [43]:
len(df.Character.unique())
Out[43]:
In [44]:
# Compare the four job-predictor weighting schemes (full/exposition window x
# constant/decreasing weights) across all books.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
plot_names = ['_job_full_const', '_job_full_decr', '_job_expo_const', '_job_expo_decr']
df1 = pd.concat([pd.DataFrame(columns=job_cols), *(df_dict[book + plot_names[0]] for book in books)])
df2 = pd.concat([pd.DataFrame(columns=job_cols), *(df_dict[book + plot_names[1]] for book in books)])
df3 = pd.concat([pd.DataFrame(columns=job_cols), *(df_dict[book + plot_names[2]] for book in books)])
df4 = pd.concat([pd.DataFrame(columns=job_cols), *(df_dict[book + plot_names[3]] for book in books)])
# Mean-impute missing values, then relabel each variant for the legend.
df1 = df1.fillna(df1.mean())
df2 = df2.fillna(df2.mean())
df3 = df3.fillna(df3.mean())
df4 = df4.fillna(df4.mean())
df1['Predictor'] = 'full-const'
df2['Predictor'] = 'full-decreasing'
df3['Predictor'] = 'exposition-const'
df4['Predictor'] = 'exposition-decreasing'
df = pd.concat([df1, df2, df3, df4])
fig, ax = plt.subplots(figsize=(12, 8))
sns.pointplot(x='Rank', y="Similarity", hue="Predictor", data=df, linestyles='--', dodge=True)
ax.legend(fontsize=20)
plt.xlabel('Rank', fontsize=20)
plt.ylabel('Similarity', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18);
In [45]:
# Restrict the full-const frame to top ranks, then count characters that ever
# reach a perfect similarity score.
df1 = df1[df1.Rank < 5]
len(df1[df1.Similarity == 1.0].Character.unique())
Out[45]:
In [46]:
# Per-character swarm of similarity scores (top two ranks), comparing the
# count and proximity predictors within one book.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
book = 'aubonheurdesdames'
name1 = book + '_count_full_const'
name2 = book + '_proximity_full_const'
plotter = pd.concat([df_dict[name1], df_dict[name2]])
plotter = plotter[plotter.Rank < 3]
fig, ax = plt.subplots(figsize=(14, 10))
sns.swarmplot(
    x='Character',
    y='Similarity',
    hue='Predictor',
    palette={'count': palette[0], 'proximity': palette[2]},
    data=plotter,
    ax=ax,
    size=8)
plt.xticks(rotation=90)
plt.suptitle(book, fontsize=18)
ax.legend(
    loc='best',
    fontsize=14)
ax.tick_params(labelsize=14)
plt.xlabel('Character', fontsize=16)
plt.ylabel('Similarity score', fontsize=16)
plt.show()
In [47]:
# Per-character similarity for a single book, coloured by rank.
fig, ax = plt.subplots(figsize=(15,10))
book = 'aubonheurdesdames'
name = book + '_count_full_const'
sns.swarmplot(
x='Character',
y='Similarity',
data=df_dict[name],
hue='Rank',
ax=ax)
ax.legend(
loc='best',
fontsize=14)
ax.tick_params(labelsize=14)
plt.xlabel('Character', fontsize=16)
plt.ylabel('Similarity score', fontsize=16)
plt.xticks(rotation=90)
plt.show()
In [48]:
# Reload the cached frames -- NOTE(review): presumably to reset df_dict before
# the gender analysis; confirm what the preceding cells mutated.
load_frames()
In [49]:
# Shared column layout of the gender frames.
gender_cols = df_dict['aubonheurdesdames_gender_nosolo'].columns
gender_cols
Out[49]:
In [50]:
# Non-feature columns to exclude before model fitting.
to_drop = ['Character', 'Prediction', 'Score', 'Label']
In [51]:
# Stack the solo-gender frames for every book into one training table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
name = 'gender_solo'
df_gender = pd.concat([pd.DataFrame(columns=gender_cols),
                       *(df_dict[book + '_' + name] for book in books)])
# reassign label: binary target, 1 = 'f' (female), 0 otherwise
y = df_gender.Label.apply(lambda x: 1 if x == 'f' else 0)
# drop non-feature columns (no inplace, keeps the cell idempotent on re-run)
df_gender = df_gender.drop(to_drop, axis=1)
print(df_gender.columns)
# scale features to zero mean / unit variance; fit_transform returns an array
scaler = StandardScaler()
df_gender = scaler.fit_transform(df_gender)
In [52]:
# Fit logistic regression on the standardised solo-gender features and show
# the learned coefficients (one weight per feature column).
lr = LogisticRegression(max_iter=3000)
model = lr.fit(df_gender, y)
model.coef_
Out[52]:
In [53]:
# Fit an SGD logistic-loss classifier for comparison with LogisticRegression.
# API fixes: `n_iter` was replaced by `max_iter` (removed in scikit-learn 0.21)
# and loss='log' was renamed to 'log_loss' ('log' removed in scikit-learn 1.3).
sgd = SGDClassifier(loss='log_loss', max_iter=1000)
model = sgd.fit(df_gender, y)
model.coef_
Out[53]:
In [54]:
# Join each book's nosolo and solo gender predictions side by side, then stack
# the joined frames across books.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_joined_gend_preds = {}
for book in books:
    df_joined_gend_preds[book] = df_dict[book + '_' + gender_names[0]].join(
        df_dict[book + '_' + gender_names[1]], lsuffix='_nosolo', rsuffix='_solo')
df_append_gender = pd.concat([pd.DataFrame(columns=df_joined_gend_preds['candide'].columns),
                              *df_joined_gend_preds.values()])
y = df_append_gender.Label_nosolo
# Drop both suffixed copies of every non-feature column.
app_to_drop = [c + '_nosolo' for c in to_drop] + [c + '_solo' for c in to_drop]
df_append_gender = df_append_gender.drop(app_to_drop, axis=1)
# Binary target: 1 = 'f' (female), 0 otherwise.
y = y.apply(lambda x: 1 if x == 'f' else 0)
In [55]:
# Feature names remaining after the drop above.
df_append_gender.columns
Out[55]:
In [56]:
# Standardise the joined features (fit_transform refits the shared scaler).
df_append_gender = scaler.fit_transform(df_append_gender)
In [57]:
# Logistic-regression coefficients on the joined nosolo+solo features.
model = lr.fit(df_append_gender, y)
model.coef_
Out[57]:
In [58]:
# SGD (logistic-loss) coefficients for comparison.
model = sgd.fit(df_append_gender, y)
model.coef_
Out[58]:
In [62]:
# Reload the cached frames before computing classification metrics.
load_frames()
In [87]:
def class_metrics(name, print_=True):
    """Print accuracy/precision/recall/F1 for one gender-prediction variant.

    Stacks the `<book>_<name>` frames from df_dict across all books and scores
    the 'Prediction' column against 'Label', treating 'f' as the positive
    class.  Always prints the full and predicted (non-NaN) row counts; the
    metric line is printed only when `print_` is true.  Returns None.
    """
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df_gender = pd.concat([pd.DataFrame(columns=gender_cols),
                           *(df_dict[book + '_' + name] for book in books)])
    y = df_gender[['Label', 'Prediction']]
    print('full size', y.shape[0])
    print('predicted size', df_gender.dropna().shape[0])
    females = y[y.Label == 'f']
    tp = females[females.Prediction == 'f'].shape[0]
    fn = females[females.Prediction == 'm'].shape[0]
    males = y[y.Label == 'm']
    tn = males[males.Prediction == 'm'].shape[0]
    fp = males[males.Prediction == 'f'].shape[0]
    # Guard every ratio against a zero denominator (e.g. no positives at all),
    # which previously raised ZeroDivisionError.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    total = tp + tn + fp + fn
    acc = (tp + tn) / total if total else 0.0
    fscore = (2 * (precision * recall) / (precision + recall)
              if (precision + recall) else 0.0)
    if print_:
        print('accuracy:{a} \nprecision:{p} \nrecall:{r} \nfscore:{f}'.format(
            a=acc, p=precision, r=recall, f=fscore))
In [88]:
# Score each gender-prediction variant (nosolo/solo, unweighted/weighted).
class_metrics('gender_nosolo')
In [69]:
class_metrics('gender_nosolo_w')
In [70]:
class_metrics('gender_solo')
In [71]:
class_metrics('gender_solo_w')
In [72]:
# Rebuild label/prediction pairs for the weighted nosolo gender predictor;
# rows without a prediction are dropped.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_gender = pd.concat([pd.DataFrame(columns=gender_cols),
                       *(df_dict[book + '_gender_nosolo_w'] for book in books)])
y = df_gender[['Label', 'Prediction']]
y = y.dropna()
In [73]:
# Tag each row as a correct or wrong prediction for the countplot below.
y.loc[y.Label == y.Prediction, 'Class'] = 'correct'
y.loc[y.Label != y.Prediction, 'Class'] = 'wrong'
In [74]:
# Reshape to long format: a single 'Gender' column plus a 'Label' column that
# distinguishes true labels from predicted ones.
true = y.Label
true = true.to_frame()
true.columns = ['Gender']
true['Label'] = 'True'
pred = y.Prediction
pred = pred.to_frame()
pred.columns = ['Gender']
pred['Label'] = 'Predicted'
In [75]:
# Stack true and predicted gender labels for side-by-side counting.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
plotter = pd.concat([true, pred])
plotter.columns
Out[75]:
In [80]:
# Correct vs wrong prediction counts, split by predicted gender.
fig, ax = plt.subplots(figsize=(12,10))
sns.countplot(x='Prediction', hue='Class', data=y, palette=[palette[0], palette[2]])
ax.legend(
loc='best',
fontsize=20)
ax.tick_params(labelsize=18)
plt.xlabel('Gender', fontsize=20)
plt.ylabel('Count', fontsize=20);
In [81]:
# True vs predicted gender distribution from the long-format frame.
fig, ax = plt.subplots(figsize=(12,10))
sns.countplot(x='Gender', hue='Label', data=plotter)
ax.legend(
loc='best',
fontsize=20)
ax.tick_params(labelsize=18)
plt.xlabel('Gender', fontsize=20)
plt.ylabel('Count', fontsize=16);
In [89]:
# Shared column layout of the sentiment frames.
sentiment_cols = df_dict['aubonheurdesdames_sentiment_nosolo'].columns
sentiment_cols
Out[89]:
In [90]:
sentiment_names
Out[90]:
In [91]:
# Stack the nosolo sentiment frames across all books.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
name = sentiment_names[0]
stacked_sent = pd.concat([pd.DataFrame(columns=sentiment_cols),
                          *(df_dict[book + '_' + name] for book in books)])
In [4]:
def div_total(df):
    """Normalise sentiment counts to fractions of the per-row total (in place).

    Adds a 'Total_count' column (Pos + Neg + Neut) and replaces each count
    column with its share of that total.  Rows whose total is zero yield
    inf/NaN under pandas division semantics.
    """
    cols = ['Pos_count', 'Neg_count', 'Neut_count']
    df['Total_count'] = df[cols[0]] + df[cols[1]] + df[cols[2]]
    fractions = df[cols].div(df.Total_count, axis=0)
    for col in cols:
        df[col] = fractions[col]
In [93]:
# Convert the stacked counts to per-row fractions (adds Total_count, in place).
div_total(stacked_sent)
In [94]:
# Long format: one 'Count' column with a 'Sentiment' tag per polarity.
pos = stacked_sent.Pos_count.to_frame()
neg = stacked_sent.Neg_count.to_frame()
neut = stacked_sent.Neut_count.to_frame()
pos.columns = ['Count']
neg.columns = ['Count']
neut.columns = ['Count']
pos['Sentiment'] = 'Pos'
neg['Sentiment'] = 'Neg'
neut['Sentiment'] = 'Neut'
In [95]:
pos = pos.append(neg).append(neut)
In [117]:
# Mean sentiment fraction per polarity.
# BUG FIX: plt.subplots returns (fig, ax); the original unpacked the pair in
# the wrong order, binding `ax` to the Figure and `fig` to the Axes.
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(x='Sentiment', y='Count', data=pos, palette=[palette[1], palette[2], palette[0]])
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.xlabel('Sentiment', fontsize=20)
plt.ylabel('mean(Count)', fontsize=20);
In [97]:
# Top-character sentiment breakdown for Au Bonheur des Dames, normalised.
abdd_top = pd.read_csv('metadata/aubonheurdesdames_sentiment_nosolo_top.csv')
div_total(abdd_top)
In [98]:
abdd_top[['Character', 'Label', 'Pos_count', 'Neg_count', 'Neut_count', 'Total_count']]
Out[98]:
In [99]:
# Same columns for the full cross-book stack (first ten rows).
stacked_sent[['Character', 'Label', 'Pos_count', 'Neg_count', 'Neut_count', 'Total_count']].head(10)
Out[99]:
In [100]:
# Reload the cached frames, then inspect Le Tour du Monde sentiment tables.
load_frames()
In [101]:
tdm_top = pd.read_csv('metadata/letourdumondeen80jours_sentiment_nosolo_top.csv')
div_total(tdm_top)
In [102]:
tdm_top[['Character', 'Label', 'Pos_count', 'Neg_count', 'Neut_count', 'Total_count']]
Out[102]:
In [103]:
# NOTE(review): div_total mutates the frame inside df_dict in place; the next
# load_frames() call is what restores it.
tdm = df_dict['letourdumondeen80jours_sentiment_nosolo']
div_total(tdm)
tdm.head()
Out[103]:
In [19]:
# Top-character sentiment breakdown for L'Assommoir, normalised.
lass_top = pd.read_csv('metadata/lassommoir_sentiment_nosolo_top.csv')
div_total(lass_top)
In [21]:
lass_top[['Character', 'Label', 'Pos_count', 'Neg_count', 'Neut_count', 'Total_count']]
Out[21]:
In [21]:
# NOTE(review): `abdd` is only defined in a later cell (In [8] below) — this
# cell fails under Restart & Run All; execution counts here are out of order.
abdd.columns
Out[21]:
In [8]:
abdd = pd.DataFrame.from_csv('LIWC/aubonheurdesdames.txt', sep='\t')
abdd[['affect', 'Èmopos', 'ÈmonÈg']]
Out[8]:
In [13]:
abdd = pd.DataFrame.from_csv('LIWC-raw/aubonheurdesdames.txt', sep='\t')
abdd[['affect', 'Èmopos', 'ÈmonÈg']]
Out[13]:
In [18]:
mbov = pd.DataFrame.from_csv('LIWC/madamebovary.txt', sep='\t')
mbov[['affect', 'Èmopos', 'ÈmonÈg']]
Out[18]:
In [14]:
mbov = pd.DataFrame.from_csv('LIWC-raw/madamebovary.txt', sep='\t')
mbov[['affect', 'Èmopos', 'ÈmonÈg']]
Out[14]:
In [10]:
lass = pd.DataFrame.from_csv('LIWC/lassommoir.txt', sep='\t')
lass[['affect', 'Èmopos', 'ÈmonÈg']]
Out[10]:
In [15]:
lass = pd.DataFrame.from_csv('LIWC-raw/lassommoir.txt', sep='\t')
lass[['affect', 'Èmopos', 'ÈmonÈg']]
Out[15]:
In [11]:
ltdm = pd.DataFrame.from_csv('LIWC/letourdumondeen80jours.txt', sep='\t')
ltdm[['affect', 'Èmopos', 'ÈmonÈg']]
Out[11]:
In [16]:
ltdm = pd.DataFrame.from_csv('LIWC-raw/letourdumondeen80jours.txt', sep='\t')
ltdm[['affect', 'Èmopos', 'ÈmonÈg']]
Out[16]:
In [12]:
cand = pd.DataFrame.from_csv('LIWC/candide.txt', sep='\t')
cand[['affect', 'Èmopos', 'ÈmonÈg']]
Out[12]:
In [17]:
cand = pd.DataFrame.from_csv('LIWC-raw/candide.txt', sep='\t')
cand[['affect', 'Èmopos', 'ÈmonÈg']]
Out[17]:
In [ ]: