In [1]:

    
%matplotlib notebook
from matplotlib import rcParams
# Turn on matplotlib's 'figure.autolayout' setting for every figure created in this notebook.
rcParams.update({'figure.autolayout': True})

import pickle

import matplotlib.pyplot as plt
import pandas as pd

from collections import Counter

from emojibot.utils.text_utils import clean_sentence, encode_sentence



In [2]:

    
# Raw dataset; the path is relative, so it is resolved against the kernel's working directory.
data_file = 'text_emotion.csv'
data = pd.read_csv(filepath_or_buffer=data_file)



In [3]:

    
# Summary statistics for every column; include='all' also covers the non-numeric columns.
data.describe(include='all')









    Out[3]:






  
    
      
      tweet_id
      sentiment
      author
      content
    
  
  
    
      count
      4.000000e+04
      40000
      40000
      40000
    
    
      unique
      NaN
      13
      33871
      39827
    
    
      top
      NaN
      neutral
      MissxMarisa
      I just received a mothers day card from my lov...
    
    
      freq
      NaN
      8638
      23
      14
    
    
      mean
      1.845184e+09
      NaN
      NaN
      NaN
    
    
      std
      1.188579e+08
      NaN
      NaN
      NaN
    
    
      min
      1.693956e+09
      NaN
      NaN
      NaN
    
    
      25%
      1.751431e+09
      NaN
      NaN
      NaN
    
    
      50%
      1.855443e+09
      NaN
      NaN
      NaN
    
    
      75%
      1.962781e+09
      NaN
      NaN
      NaN
    
    
      max
      1.966441e+09
      NaN
      NaN
      NaN



In [4]:

    
# Count missing values per column (isna is the pandas alias of isnull).
data.isna().sum()









    Out[4]:





tweet_id     0
sentiment    0
author       0
content      0
dtype: int64



In [5]:

    
# Bar chart of how many tweets carry each sentiment value, most frequent first.
ax = data['sentiment'].value_counts().plot(kind='bar', rot=60)
# Label the figure so it stands alone; the trailing ';' keeps the repr out of the cell output.
ax.set(title='Tweets per sentiment', xlabel='sentiment', ylabel='number of tweets');









    














    











    Out[5]:





<matplotlib.axes._subplots.AxesSubplot at 0x110537d90>



In [6]:

    
# Peek at the first rows of the raw frame.
data.head()









    Out[6]:






  
    
      
      tweet_id
      sentiment
      author
      content
    
  
  
    
      0
      1956967341
      empty
      xoshayzers
      @tiffanylue i know  i was listenin to bad habi...
    
    
      1
      1956967666
      sadness
      wannamama
      Layin n bed with a headache  ughhhh...waitin o...
    
    
      2
      1956967696
      sadness
      coolfunky
      Funeral ceremony...gloomy friday...
    
    
      3
      1956967789
      enthusiasm
      czareaquino
      wants to hang out with friends SOON!
    
    
      4
      1956968416
      neutral
      xkilljoyx
      @dannycastillo We want to trade with someone w...



In [7]:

    
# Normalise every tweet with the project helper `clean_sentence`
# (remove handles, links, punctuation, stop words, and apply stemmer)
cleaned = data['content'].map(clean_sentence)
data['clean_content'] = cleaned
data.head()









    Out[7]:






  
    
      
      tweet_id
      sentiment
      author
      content
      clean_content
    
  
  
    
      0
      1956967341
      empty
      xoshayzers
      @tiffanylue i know  i was listenin to bad habi...
      know listenin bad habit ear start freakin part
    
    
      1
      1956967666
      sadness
      wannamama
      Layin n bed with a headache  ughhhh...waitin o...
      layin bed headach ughhh  waitin cal
    
    
      2
      1956967696
      sadness
      coolfunky
      Funeral ceremony...gloomy friday...
      fun ceremony  gloom friday
    
    
      3
      1956967789
      enthusiasm
      czareaquino
      wants to hang out with friends SOON!
      want hang friend soon
    
    
      4
      1956968416
      neutral
      xkilljoyx
      @dannycastillo We want to trade with someone w...
      want trad someon houston ticket on



In [9]:

    
# look at distribution of most common words
# Iterate the column directly: iterrows() would build a full Series for every row
# just to read this one field.
counter = Counter()
for sentence in data['clean_content']:
    counter.update(sentence.split())

# One row per distinct word, ordered from most to least frequent.
counts = pd.DataFrame(counter.most_common(), columns=['word', 'count'])
# Fraction of all word occurrences covered by this word and every more frequent one.
counts['cumulative'] = counts['count'].cumsum() / counts['count'].sum()

counts.plot(y='cumulative', logx=True, grid=True)
plt.show()



In [10]:

    
# make vocabulary lookup from top 95% of most common words
cutoff = .95
vocab = dict()
# Walk only the columns that are needed instead of materialising every row with iterrows().
# `counts` is ordered by descending frequency, so the loop stops at the first word whose
# cumulative share exceeds the cutoff; each kept word maps to its row index in `counts`.
# NOTE(review): the most frequent word gets id 0 - confirm nothing downstream reserves 0
# (e.g. for padding).
for idx, word, cumulative in zip(counts.index, counts['word'], counts['cumulative']):
    if cumulative > cutoff:
        break
    vocab[word] = idx

# check vocab length
len(vocab)









    Out[10]:





8055



In [11]:

    
# Turn each cleaned sentence into ids by calling the project helper
# `encode_sentence(sentence, vocab)` once per row.
data['encoded_content'] = data['clean_content'].apply(
    lambda sentence: encode_sentence(sentence, vocab)
)
data.head()









    Out[11]:






  
    
      
      tweet_id
      sentiment
      author
      content
      clean_content
      encoded_content
    
  
  
    
      0
      1956967341
      empty
      xoshayzers
      @tiffanylue i know  i was listenin to bad habi...
      know listenin bad habit ear start freakin part
      [19, 2486, 57, 2825, 113, 87, 977, 369]
    
    
      1
      1956967666
      sadness
      wannamama
      Layin n bed with a headache  ughhhh...waitin o...
      layin bed headach ughhh  waitin cal
      [6326, 83, 300, 1324, 1843, 93]
    
    
      2
      1956967696
      sadness
      coolfunky
      Funeral ceremony...gloomy friday...
      fun ceremony  gloom friday
      [50, 3321, 1306, 147]
    
    
      3
      1956967789
      enthusiasm
      czareaquino
      wants to hang out with friends SOON!
      want hang friend soon
      [21, 354, 63, 102]
    
    
      4
      1956968416
      neutral
      xkilljoyx
      @dannycastillo We want to trade with someone w...
      want trad someon houston ticket on
      [21, 1471, 172, 1976, 386, 16]



In [12]:

    
# find max sequence length
# The bare `max(...)` used here before was not the cell's last expression, so its value was
# neither displayed nor stored; keep it in a name so it can be inspected.
longest_sequence = int(data['encoded_content'].apply(len).max())

# Fixed sequence length that is written to the params file further down; kept hard-coded.
# NOTE(review): confirm 25 still matches `longest_sequence` whenever cleaning or vocab changes.
max_sequence_length = 25

# Display both values so any mismatch is visible in the cell output.
longest_sequence, max_sequence_length



In [13]:

    
# Map each sentiment name to an integer id, numbered in the order the names
# are returned by `unique()`.
sentiment_names = data['sentiment'].unique()
labels = dict(zip(sentiment_names, range(len(sentiment_names))))
labels









    Out[13]:





{'anger': 12,
 'boredom': 10,
 'empty': 0,
 'enthusiasm': 2,
 'fun': 7,
 'happiness': 9,
 'hate': 8,
 'love': 6,
 'neutral': 3,
 'relief': 11,
 'sadness': 1,
 'surprise': 5,
 'worry': 4}



In [14]:

    
# Attach the integer id of each row's sentiment, looked up in `labels`.
data['label'] = [labels[sentiment] for sentiment in data['sentiment']]
data.head()









    Out[14]:






  
    
      
      tweet_id
      sentiment
      author
      content
      clean_content
      encoded_content
      label
    
  
  
    
      0
      1956967341
      empty
      xoshayzers
      @tiffanylue i know  i was listenin to bad habi...
      know listenin bad habit ear start freakin part
      [19, 2486, 57, 2825, 113, 87, 977, 369]
      0
    
    
      1
      1956967666
      sadness
      wannamama
      Layin n bed with a headache  ughhhh...waitin o...
      layin bed headach ughhh  waitin cal
      [6326, 83, 300, 1324, 1843, 93]
      1
    
    
      2
      1956967696
      sadness
      coolfunky
      Funeral ceremony...gloomy friday...
      fun ceremony  gloom friday
      [50, 3321, 1306, 147]
      1
    
    
      3
      1956967789
      enthusiasm
      czareaquino
      wants to hang out with friends SOON!
      want hang friend soon
      [21, 354, 63, 102]
      2
    
    
      4
      1956968416
      neutral
      xkilljoyx
      @dannycastillo We want to trade with someone w...
      want trad someon houston ticket on
      [21, 1471, 172, 1976, 386, 16]
      3



In [15]:

    
# Persist only the label ids and the encoded sequences; the encoded column is
# stored under the name 'sequence'.
processed_datafile = 'processed_emotions.pkl'

processed_data = data[['label', 'encoded_content']].rename(
    columns={'encoded_content': 'sequence'}
)
processed_data.to_pickle(processed_datafile)
processed_data.head()









    Out[15]:






  
    
      
      label
      sequence
    
  
  
    
      0
      0
      [19, 2486, 57, 2825, 113, 87, 977, 369]
    
    
      1
      1
      [6326, 83, 300, 1324, 1843, 93]
    
    
      2
      1
      [50, 3321, 1306, 147]
    
    
      3
      2
      [21, 354, 63, 102]
    
    
      4
      3
      [21, 1471, 172, 1976, 386, 16]



In [105]:

    
# Write the preprocessing parameters to disk: the sequence length, the word -> id
# vocabulary, and an id -> sentiment lookup (the inverse of `labels`).
data_params_file = 'data_params.pkl'

class_lookup = dict((idx, name) for name, idx in labels.items())
params = {
    'max_sequence_length': max_sequence_length,
    'vocab': vocab,
    'labels': class_lookup,
}

with open(data_params_file, 'wb') as handle:
    pickle.dump(params, handle)



In [ ]:

	tweet_id	sentiment	author	content
count	4.000000e+04	40000	40000	40000
unique	NaN	13	33871	39827
top	NaN	neutral	MissxMarisa	I just received a mothers day card from my lov...
freq	NaN	8638	23	14
mean	1.845184e+09	NaN	NaN	NaN
std	1.188579e+08	NaN	NaN	NaN
min	1.693956e+09	NaN	NaN	NaN
25%	1.751431e+09	NaN	NaN	NaN
50%	1.855443e+09	NaN	NaN	NaN
75%	1.962781e+09	NaN	NaN	NaN
max	1.966441e+09	NaN	NaN	NaN

	tweet_id	sentiment	author	content
0	1956967341	empty	xoshayzers	@tiffanylue i know i was listenin to bad habi...
1	1956967666	sadness	wannamama	Layin n bed with a headache ughhhh...waitin o...
2	1956967696	sadness	coolfunky	Funeral ceremony...gloomy friday...
3	1956967789	enthusiasm	czareaquino	wants to hang out with friends SOON!
4	1956968416	neutral	xkilljoyx	@dannycastillo We want to trade with someone w...

	label	sequence
0	0	[19, 2486, 57, 2825, 113, 87, 977, 369]
1	1	[6326, 83, 300, 1324, 1843, 93]
2	1	[50, 3321, 1306, 147]
3	2	[21, 354, 63, 102]
4	3	[21, 1471, 172, 1976, 386, 16]