In [103]:
import pandas as pd
import json, os, re
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
In [2]:
with open('../data/text_category_stats/episodeLevel.json') as sentiment_file:
    sentiment_data = json.load(sentiment_file)
In [6]:
episode_names = sorted([e['name'] for e in sentiment_data])
print(episode_names)
In [113]:
# Map each episode name to a global episode number, assuming 10 episodes
# per season: (season - 1) * 10 + episode, parsed from names like 'S1E03'.
episode_numbers = {name: (int(re.findall('(?<=S)[0-9]+', name)[0]) - 1) * 10 +
                         int(re.findall('(?<=E)[0-9]+', name)[0])
                   for name in episode_names}
print(episode_numbers)
In [8]:
category_names = sorted([k for k in sentiment_data[0].keys() if k != 'name'])
print(category_names)
In [114]:
sentiment_data = sorted(sentiment_data, key=lambda x: episode_numbers[x['name']])
In [115]:
# Per-category series: total raw count for each episode, in episode order.
sentiment_series = {c: [sum(e[c].values()) for e in sentiment_data] for c in category_names}
In [16]:
# TODO: normalization! but then we need full vocab for each episode...
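One way to normalize without the full per-episode vocabulary is to divide each raw category count by the episode's total token count. A minimal sketch, assuming a hypothetical `episode_token_counts` list aligned with `sentiment_data` (the placeholder values below are not real data):
In [ ]:
# Sketch only: episode_token_counts is a hypothetical list of total token
# counts per episode, aligned with sentiment_data; replace the placeholder
# values with real totals once they are available.
episode_token_counts = [1] * len(sentiment_data)  # placeholder

normalized_series = {c: [count / total
                         for count, total in zip(sentiment_series[c], episode_token_counts)]
                     for c in category_names}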
In [117]:
dummy_dates = range(1, len(episode_names) + 1)
cols = 3
rows = -(-len(category_names) // cols)  # ceiling division: just enough rows
plt.style.use('ggplot')
fig = plt.figure(figsize=(5 * cols, 3 * rows))
print('%d x %d' % (rows, cols))
for i, c in enumerate(category_names):
    ax = fig.add_subplot(rows, cols, i + 1)
    ax.plot(dummy_dates, sentiment_series[c])
    ax.set_title('%s over all episodes' % c)
    ax.set_ylabel('Raw frequency')
    ax.set_xlabel('Episode number')
plt.tight_layout()
plt.show()
General trends:
In [ ]:
interesting_categories = []  # TODO: pick categories after inspecting the plots above
TODO: visualize normalized series to account for especially verbose episodes.
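A minimal sketch of that visualization, reusing the grid layout from the raw-frequency plots and assuming the `normalized_series` dict from the normalization sketch above:
In [ ]:
# Sketch only: plot the normalized series on the same grid as the raw plots.
fig = plt.figure(figsize=(5 * cols, 3 * rows))
for i, c in enumerate(category_names):
    ax = fig.add_subplot(rows, cols, i + 1)
    ax.plot(dummy_dates, normalized_series[c])
    ax.set_title('%s over all episodes (normalized)' % c)
    ax.set_ylabel('Normalized frequency')
    ax.set_xlabel('Episode number')
plt.tight_layout()
plt.show()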