In [1]:
%matplotlib notebook
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})

import pickle

import matplotlib.pyplot as plt
import pandas as pd

from collections import Counter

from emojibot.utils.text_utils import clean_sentence, encode_sentence

In [2]:
# load the raw tweet/emotion dataset
# NOTE(review): presumably the CrowdFlower "text_emotion" corpus -- confirm provenance
data_file = 'text_emotion.csv'
data = pd.read_csv(filepath_or_buffer=data_file)

In [3]:
# summary statistics for every column, numeric and categorical alike
overview = data.describe(include='all')
overview


Out[3]:
tweet_id sentiment author content
count 4.000000e+04 40000 40000 40000
unique NaN 13 33871 39827
top NaN neutral MissxMarisa I just received a mothers day card from my lov...
freq NaN 8638 23 14
mean 1.845184e+09 NaN NaN NaN
std 1.188579e+08 NaN NaN NaN
min 1.693956e+09 NaN NaN NaN
25% 1.751431e+09 NaN NaN NaN
50% 1.855443e+09 NaN NaN NaN
75% 1.962781e+09 NaN NaN NaN
max 1.966441e+09 NaN NaN NaN

In [4]:
# count missing values per column (isna is the modern alias of isnull)
data.isna().sum()


Out[4]:
tweet_id     0
sentiment    0
author       0
content      0
dtype: int64

In [5]:
# class balance: number of tweets per sentiment label
# (labeled axes/title so the figure stands alone; plt.show() suppresses the
#  bare Axes repr that would otherwise be left in the cell output)
ax = data['sentiment'].value_counts().plot(kind='bar', rot=60)
ax.set(title='Tweets per sentiment', xlabel='sentiment', ylabel='tweet count')
plt.show()


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x110537d90>

In [6]:
# peek at the first few rows
data.head(n=5)


Out[6]:
tweet_id sentiment author content
0 1956967341 empty xoshayzers @tiffanylue i know i was listenin to bad habi...
1 1956967666 sadness wannamama Layin n bed with a headache ughhhh...waitin o...
2 1956967696 sadness coolfunky Funeral ceremony...gloomy friday...
3 1956967789 enthusiasm czareaquino wants to hang out with friends SOON!
4 1956968416 neutral xkilljoyx @dannycastillo We want to trade with someone w...

In [7]:
# clean content (remove handles, links, punctuation, stop words, and apply stemmer)
# Series.map applies clean_sentence element-wise, equivalent to .apply for a plain function
data['clean_content'] = data['content'].map(clean_sentence)
data.head()


Out[7]:
tweet_id sentiment author content clean_content
0 1956967341 empty xoshayzers @tiffanylue i know i was listenin to bad habi... know listenin bad habit ear start freakin part
1 1956967666 sadness wannamama Layin n bed with a headache ughhhh...waitin o... layin bed headach ughhh waitin cal
2 1956967696 sadness coolfunky Funeral ceremony...gloomy friday... fun ceremony gloom friday
3 1956967789 enthusiasm czareaquino wants to hang out with friends SOON! want hang friend soon
4 1956968416 neutral xkilljoyx @dannycastillo We want to trade with someone w... want trad someon houston ticket on

In [9]:
# look at distribution of most common words
# iterate over pre-split token lists instead of a slow python-level
# iterrows() pass (iterrows materializes a Series per row)
counter = Counter()
for tokens in data['clean_content'].str.split():
    counter.update(tokens)
counts = pd.DataFrame(counter.most_common(), columns=['word', 'count'])
# fraction of all word occurrences covered by the top-k most common words
counts['cumulative'] = counts['count'].cumsum() / counts['count'].sum()

counts.plot(y='cumulative', logx=True, grid=True)
plt.show()



In [10]:
# make vocabulary lookup from top 95% of most common words
cutoff = .95

# `cumulative` is a cumsum of non-negative counts, hence monotonically
# non-decreasing: a boolean mask keeps exactly the frequency-ranked prefix
# of words whose cumulative share is still within the cutoff
kept = counts.loc[counts['cumulative'] <= cutoff, 'word']
# word -> frequency-rank id (row index of `counts`)
vocab = dict(zip(kept, kept.index))

# check vocab length
len(vocab)


Out[10]:
8055

In [11]:
# encode words to ids using the vocabulary built above; the lambda closes
# over `vocab` to pass it as the second argument of encode_sentence
data['encoded_content'] = data['clean_content'].apply(
    lambda sentence: encode_sentence(sentence, vocab)
)
data.head()


Out[11]:
tweet_id sentiment author content clean_content encoded_content
0 1956967341 empty xoshayzers @tiffanylue i know i was listenin to bad habi... know listenin bad habit ear start freakin part [19, 2486, 57, 2825, 113, 87, 977, 369]
1 1956967666 sadness wannamama Layin n bed with a headache ughhhh...waitin o... layin bed headach ughhh waitin cal [6326, 83, 300, 1324, 1843, 93]
2 1956967696 sadness coolfunky Funeral ceremony...gloomy friday... fun ceremony gloom friday [50, 3321, 1306, 147]
3 1956967789 enthusiasm czareaquino wants to hang out with friends SOON! want hang friend soon [21, 354, 63, 102]
4 1956968416 neutral xkilljoyx @dannycastillo We want to trade with someone w... want trad someon houston ticket on [21, 1471, 172, 1976, 386, 16]

In [12]:
# find max sequence length observed in the data and display it
# (a bare max(...) expression followed by an assignment would be discarded)
observed_max_length = data['encoded_content'].map(len).max()

# fixed pad/truncate length used downstream
# NOTE(review): if observed_max_length exceeds 25, longer tweets get
# truncated -- confirm 25 covers the length distribution acceptably
max_sequence_length = 25

observed_max_length

In [13]:
# encode label values: give each sentiment string a stable integer id
# (ids follow first-appearance order in the dataset)
labels = {}
for class_id, sentiment in enumerate(data['sentiment'].unique()):
    labels[sentiment] = class_id
labels


Out[13]:
{'anger': 12,
 'boredom': 10,
 'empty': 0,
 'enthusiasm': 2,
 'fun': 7,
 'happiness': 9,
 'hate': 8,
 'love': 6,
 'neutral': 3,
 'relief': 11,
 'sadness': 1,
 'surprise': 5,
 'worry': 4}

In [14]:
# map each sentiment string to its integer class id
# Series.map with a dict is the idiomatic form of apply(lambda x: d[x]);
# identical here because `labels` was built from unique() and so covers
# every value in the column
data['label'] = data['sentiment'].map(labels)
data.head()


Out[14]:
tweet_id sentiment author content clean_content encoded_content label
0 1956967341 empty xoshayzers @tiffanylue i know i was listenin to bad habi... know listenin bad habit ear start freakin part [19, 2486, 57, 2825, 113, 87, 977, 369] 0
1 1956967666 sadness wannamama Layin n bed with a headache ughhhh...waitin o... layin bed headach ughhh waitin cal [6326, 83, 300, 1324, 1843, 93] 1
2 1956967696 sadness coolfunky Funeral ceremony...gloomy friday... fun ceremony gloom friday [50, 3321, 1306, 147] 1
3 1956967789 enthusiasm czareaquino wants to hang out with friends SOON! want hang friend soon [21, 354, 63, 102] 2
4 1956968416 neutral xkilljoyx @dannycastillo We want to trade with someone w... want trad someon houston ticket on [21, 1471, 172, 1976, 386, 16] 3

In [15]:
# save processed file
processed_datafile = 'processed_emotions.pkl'

# rename() returns a new frame, so we never mutate a view of `data`
# (assigning .columns on a column-slice risks SettingWithCopyWarning and
#  stale views of the parent frame)
processed_data = data[['label', 'encoded_content']].rename(
    columns={'encoded_content': 'sequence'}
)
processed_data.to_pickle(processed_datafile)
processed_data.head()


Out[15]:
label sequence
0 0 [19, 2486, 57, 2825, 113, 87, 977, 369]
1 1 [6326, 83, 300, 1324, 1843, 93]
2 1 [50, 3321, 1306, 147]
3 2 [21, 354, 63, 102]
4 3 [21, 1471, 172, 1976, 386, 16]

In [105]:
# save data params needed to preprocess new text at inference time
data_params_file = 'data_params.pkl'

# invert the sentiment->id mapping so the model's output id can be
# translated back to a class name
class_lookup = {class_id: sentiment for sentiment, class_id in labels.items()}
params = {
    'max_sequence_length': max_sequence_length,
    'vocab': vocab,
    'labels': class_lookup,
}

with open(data_params_file, 'wb') as f:
    pickle.dump(params, f)

In [ ]: