In [1]:
import pensieve as pv
import pandas as pd
pd.options.display.max_rows = 6
import numpy as np
import re
from tqdm import tqdm_notebook as tqdm
%matplotlib notebook
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
In [29]:
# Choose the book to analyse and parse its text file into a pensieve Doc.
book = 'book7'
book_path = '../../clusterpot/%s.txt' % book
doc = pv.Doc(book_path)
# The text of the first paragraph is reported as the title.
first_paragraph = doc.paragraphs[0]
print("title: %s" % first_paragraph.text)
The NRC emotion lexicon is a curated list of words, each annotated with the emotions it is associated with.
Details of this work may be found on Saif Mohammad's homepage.
The HDF5 file is a cleaned-up version of the original NRC text file.
The resulting pandas DataFrame has the following columns (the code below uses `word`, `emo`, and `binary`):
In [30]:
# Read the cleaned NRC lexicon from its HDF5 store and display it.
lexicon_path = './NRC.h5'
df = pd.read_hdf(lexicon_path)
df
Out[30]:
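For each paragraph (`p`) in the text, split it into words with `text.split()` and start a fresh emotion counter `emo_d`.
For each word (`w`) in paragraph (`p`), look up the emotions the lexicon associates with it.
For each such emotion (`e`): if (`e`) is a key of `emo_d`, then add 1 to the specified emotion.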
In [31]:
# Count emotion-bearing words per paragraph using the NRC lexicon held in `df`.
#
# Results (both consumed by later cells):
#   emo_list : one dict per paragraph, mapping each tracked emotion to the
#              number of word/emotion matches found in that paragraph.
#   moo_d    : paragraph index -> sorted, de-duplicated list of the cleaned
#              words that contributed to at least one tracked emotion.
EMOTIONS = ('joy', 'fear', 'surprise', 'sadness', 'disgust', 'anger')

# Same character class as before, compiled once instead of once per word.
punct_re = re.compile("[!@#$%^&*()_+:;,.\\?']")

# Build the word -> emotions lookup a single time. The original ran two
# DataFrame.query() calls for every word of the book and relied on
# Series.as_matrix(), which was removed in pandas 1.0.
# Only lexicon rows flagged binary==1 count as an association.
flagged = df.query("binary==1")
word_emotions = dict()
for lex_word, lex_emo in zip(flagged['word'].values, flagged['emo'].values):
    word_emotions.setdefault(lex_word, []).append(lex_emo)

emo_list = []
moo_d = dict()
for p in tqdm(range(len(doc.paragraphs)), desc='par'):
    emo_d = dict.fromkeys(EMOTIONS, 0)
    hits = []
    for w in doc.paragraphs[p].text.split():
        w = punct_re.sub("", w)
        # Words absent from the lexicon simply contribute nothing.
        for e in word_emotions.get(w, ()):
            if e in emo_d:
                emo_d[e] += 1
                hits.append(w)
    # np.unique both sorts and de-duplicates the contributing words.
    moo_d[p] = np.unique(hits).tolist()
    emo_list.append(emo_d)
In [32]:
# Persist the per-paragraph results, stored under one HDF5 key per book.
# `key` is passed by keyword: recent pandas releases no longer accept it
# positionally, and this matches the read_hdf(..., key=...) calls in this notebook.
pd.DataFrame(emo_list).to_hdf('./book_emo.h5', key=book)
pd.DataFrame.from_dict(moo_d, orient='index').to_hdf('./book_moo.h5', key=book)
In [12]:
# Re-open a book together with the results previously stored for it.
rbook = 'book2'
rbook_path = '../../clusterpot/%s.txt' % rbook
doc = pv.Doc(rbook_path)
# Both stores are keyed by book name.
book_emotions = pd.read_hdf('book_emo.h5', key=rbook)
book_mood = pd.read_hdf('book_moo.h5', key=rbook)
# Show the emotion counts next to the words that produced them.
book_emotions.join(book_mood)
Out[12]:
In [44]:
# Rank paragraph positions by their count of the chosen emotion, highest first.
emo_kw = 'anger'
# `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
emo_rank = np.argsort(book_emotions[emo_kw].values)[::-1]
emo_rank
Out[44]:
In [48]:
# Show the paragraph at rank `pidx` (0 = top of emo_rank) with its stored results.
pidx = 1
ranked_par = emo_rank[pidx]
par_text = doc.paragraphs[ranked_par].text
par_words = book_mood.iloc[ranked_par].dropna()
par_vector = book_emotions.iloc[ranked_par]
print("paragraph from text\n%s\n" % par_text)
print("moo_d emo Harry feels\n%s\n" % par_words)
print("emo vector\n%s" % par_vector)
In [29]:
# 3-D scatter of three emotion counts per paragraph. With rescale=True each
# count is divided by the paragraph's length in whitespace-separated tokens.
# NOTE(review): assumes `doc` and `book_emotions` describe the same book, so
# that both have one entry per paragraph — confirm before trusting the plot.
rescale = True
if rescale:
    slength = [float(len(doc.paragraphs[i].text.split())) for i in range(len(doc.paragraphs))]
else:
    slength = 1
# `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
x = book_emotions['anger'].values/slength
y = book_emotions['disgust'].values/slength
z = book_emotions['sadness'].values/slength
fig = plt.figure('emotional correlation',figsize=(7,7))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xs=x,ys=y,zs=z)
ax.set_xlabel('anger')
ax.set_ylabel('disgust')
ax.set_zlabel('sadness')
plt.draw()
plt.show()
In [123]:
# Scatter of paragraph length (whitespace-separated tokens) against the sum
# of all emotion counts recorded for that paragraph.
slength = [float(len(doc.paragraphs[i].text.split())) for i in range(len(doc.paragraphs))]
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
y = np.sum(book_emotions.values,axis=1)
plt.figure('paragraph length vs. emotion sum',figsize=(7,7))
ax = plt.axes([0.15,0.15,0.8,0.8])
ax.scatter(x=slength,y=y)
# Label the axes so the figure can be read on its own.
ax.set_xlabel('paragraph length (words)')
ax.set_ylabel('emotion sum')
#ax.set_xlim([0,1])
#ax.set_ylim([0,1])
#ax.set_xscale('log')
plt.draw()
plt.show()
In [126]:
# Paragraph length (whitespace-separated tokens) versus the sum of all emotion
# counts for that paragraph. This cell has the same content as the plotting
# cell before it and is a candidate for removal or a shared helper.
slength = [float(len(doc.paragraphs[i].text.split())) for i in range(len(doc.paragraphs))]
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
y = np.sum(book_emotions.values,axis=1)
plt.figure('paragraph length vs. emotion sum',figsize=(7,7))
ax = plt.axes([0.15,0.15,0.8,0.8])
ax.scatter(x=slength,y=y)
# Label the axes so the figure can be read on its own.
ax.set_xlabel('paragraph length (words)')
ax.set_ylabel('emotion sum')
#ax.set_xlim([0,1])
#ax.set_ylim([0,1])
#ax.set_xscale('log')
plt.draw()
plt.show()
In [34]:
# Sample paragraph used as benchmark input.
s = "Upon the signature of the International Statute of Secrecy in 1689, wizards went into hiding for good. It was natural, perhaps, that they formed their own small communities within a community. Many small villages and hamlets attracted several magical families, who banded together for mutual support and protection. The villages of Tinworsh in Cornwall, Upper Flagley in Yorkshire, and Ottery St. Catchpole on the south coast of England were notable homes to knots of Wizarding families who lived alongside tolerant and sometimes Confunded Muggles. Most celebrated of these half-magical dwelling places is, perhaps, Godrics Hollow, the West Country village where the great wizard Godric Gryffindor was born, and where Bowman Wright, Wizarding smith, forged the first Golden Snitch. The graveyard is full of the names of ancient magical families, and this accounts, no doubt, for the stories of hauntings that have dogged the little church beside it for many centuries."
# Only the first whitespace-separated token of the sample is timed.
w = s.split()[0]
# Time the regex-based punctuation stripping, using the same pattern as the
# emotion-counting loop.
%timeit re.sub("[!@#$%^&*()_+:;,.\\?']", "", w)
In [33]:
%%timeit
punctuation = ["!","@","#","$","%","^","&","*","(",")","_","+",":",";",",",".","?","'","\\"]
for punc in punctuation:
w.replace("%s" %punc,'')
In [31]:
# One row per paragraph index of moo_d; each row holds that paragraph's word list.
moodf = pd.DataFrame.from_dict(data=moo_d, orient='index')
In [38]:
# Look at the words stored at row position 25, leaving out missing entries.
sample_row = moodf.iloc[25]
sample_row.dropna()
Out[38]:
In [26]:
# Store the word table under the current book's key. `key` is passed by
# keyword because recent pandas releases no longer accept it positionally.
moodf.to_hdf('./book_moo_v2.h5', key=book)
In [39]:
# Replace book_mood with the table stored for `rbook` in the v2 store.
mood_store_v2 = 'book_moo_v2.h5'
book_mood = pd.read_hdf(mood_store_v2, key=rbook)
In [40]:
Out[40]:
In [ ]: