In [1]:
%matplotlib notebook
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})
import pickle
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
from emojibot.utils.text_utils import clean_sentence, encode_sentence
In [2]:
# Raw emotion-labelled text; the path is relative to the notebook's working directory.
data_file = 'text_emotion.csv'
data = pd.read_csv(filepath_or_buffer=data_file)
In [3]:
# Summary statistics for every column; include='all' covers non-numeric columns as well.
data.describe(include='all')
Out[3]:
In [4]:
# Count missing values per column.
data.isna().sum()
Out[4]:
In [5]:
# Class balance: number of rows per sentiment value, drawn as a bar chart.
data['sentiment'].value_counts().plot.bar(rot=60)
Out[5]:
In [6]:
# Peek at the first five rows of the raw frame.
data.head(n=5)
Out[6]:
In [7]:
# Normalise every raw text with clean_sentence and keep the result in a new column.
# Per the original note, clean_sentence removes handles, links, punctuation and
# stop words and applies a stemmer (implemented in emojibot.utils.text_utils).
data['clean_content'] = data['content'].map(clean_sentence)
data.head()
Out[7]:
In [9]:
# look at distribution of most common words
# Iterate the column directly: iterrows() would build a Series for every row
# only to read a single field from it.
counter = Counter()
for sentence in data['clean_content']:
    counter.update(sentence.split())
# One row per word, ordered from most to least frequent.
counts = pd.DataFrame(counter.most_common(), columns=['word', 'count'])
# Running share of all word occurrences covered by the words up to each row.
counts['cumulative'] = counts['count'].cumsum() / counts['count'].sum()
counts.plot(y='cumulative', logx=True, grid=True)
plt.show()
In [10]:
# make vocabulary lookup from top 95% of most common words
cutoff = .95
vocab = dict()
# counts is ordered by descending frequency with a default 0..n-1 index, so the
# enumerate position is the same id the row index would give.
# NOTE(review): ids start at 0 -- confirm encode_sentence does not reserve 0
# (e.g. for padding or unknown words).
for idx, (word, cumulative) in enumerate(zip(counts['word'], counts['cumulative'])):
    if cumulative > cutoff:
        # every later word is rarer still and lies beyond the cutoff
        break
    vocab[word] = idx
# check vocab length
len(vocab)
Out[10]:
In [11]:
# Translate each cleaned sentence into ids by calling encode_sentence with the vocab lookup.
data['encoded_content'] = data['clean_content'].apply(
    lambda sentence: encode_sentence(sentence, vocab)
)
data.head()
Out[11]:
In [12]:
# find max sequence length
# The fixed length below is the value stored in the data params. The observed
# maximum is kept as the cell's last expression so that it is actually
# displayed; as a mid-cell expression its result was silently discarded.
max_sequence_length = 25
data['encoded_content'].apply(len).max()
In [13]:
# Assign each distinct sentiment an integer id, numbered in order of first appearance.
labels = dict(
    (value, idx) for idx, value in enumerate(data['sentiment'].unique())
)
labels
Out[13]:
In [14]:
# Attach the integer class id for each row's sentiment.
data['label'] = [labels[sentiment] for sentiment in data['sentiment']]
data.head()
Out[14]:
In [15]:
# Persist only the two model inputs: the class id and the encoded sequence
# (stored under the column name 'sequence').
processed_datafile = 'processed_emotions.pkl'
processed_data = data[['label', 'encoded_content']].rename(
    columns={'encoded_content': 'sequence'}
)
processed_data.to_pickle(processed_datafile)
processed_data.head()
Out[15]:
In [105]:
# Write the preprocessing parameters to disk: the sequence length, the
# word -> id vocabulary, and the id -> sentiment lookup.
data_params_file = 'data_params.pkl'
# Invert labels (sentiment -> id) so that ids map back to sentiment names.
class_lookup = dict((idx, name) for name, idx in labels.items())
params = {
    'max_sequence_length': max_sequence_length,
    'vocab': vocab,
    'labels': class_lookup,
}
with open(data_params_file, 'wb') as f:
    pickle.dump(params, f)
In [ ]: