In [1]:
# Load libraries
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from textblob import Word
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
In [2]:
FILEPATH = 'text_emotion.csv'
def load_unprocessed_data(filepath):
    # Read the raw CSV into a DataFrame
    return pd.read_csv(filepath)
def cleanup_author_column_raw_data(df):
    # The author column carries no signal for emotion classification
    return df.drop('author', axis=1)
def cleanup_other_emotions(df):
    # Drop every sentiment class except the two we model: happiness and sadness
    other_emotions = ['anger', 'boredom', 'enthusiasm', 'empty', 'fun', 'relief',
                      'surprise', 'love', 'hate', 'neutral', 'worry']
    return df[~df.sentiment.isin(other_emotions)]
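As a quick sanity check (my addition, not part of the original notebook), one can confirm that only the two target classes survive the cleanup; a minimal sketch using the functions defined above:
In [ ]:
# Build a freshly cleaned frame and inspect the remaining classes
# (df_check is a hypothetical name used only for this check)
df_check = cleanup_other_emotions(cleanup_author_column_raw_data(load_unprocessed_data(FILEPATH)))
print(df_check['sentiment'].value_counts())
# expected index: only 'happiness' and 'sadness'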
In [3]:
def process_text_content(df):
    # Lowercase all text
    df['content'] = df['content'].apply(lambda x: " ".join(word.lower() for word in x.split()))
    # Remove punctuation and symbols (regex=True is required in recent pandas)
    df['content'] = df['content'].str.replace(r'[^\w\s]', ' ', regex=True)
    # Remove stop words using NLTK
    stop = set(stopwords.words('english'))
    df['content'] = df['content'].apply(lambda x: " ".join(word for word in x.split() if word not in stop))
    # Lemmatisation with TextBlob
    df['content'] = df['content'].apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))
    # Collapse letters repeated three or more times down to two (e.g. "sooooo" -> "soo")
    def de_repeat(text):
        pattern = re.compile(r"(.)\1{2,}")
        return pattern.sub(r"\1\1", text)
    df['content'] = df['content'].apply(lambda x: " ".join(de_repeat(word) for word in x.split()))
    return df
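To see exactly what the de_repeat step does, here is a small standalone demonstration of the same regex (the sample strings are illustrative, not from the dataset):
In [ ]:
import re

def de_repeat(text):
    # Collapse any character repeated 3+ times down to exactly two
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

print(de_repeat('sooooo'))    # -> 'soo'
print(de_repeat('happyyyy'))  # -> 'happyy'
print(de_repeat('happy'))     # -> 'happy' (two repeats are left alone)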
In [4]:
def cleanup_rare_words(df):
    # Find the 10,000 least frequent words in the corpus
    # (adjust the cutoff to suit your dataset)
    freq = pd.Series(' '.join(df['content']).split()).value_counts()[-10000:]
    # Use a set for O(1) membership tests while filtering them out
    rare_words = set(freq.index)
    df['content'] = df['content'].apply(lambda x: " ".join(word for word in x.split() if word not in rare_words))
    return df
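One way to gauge how aggressive this filter is (my addition, runnable once the pipeline cell at the bottom has populated df) is to inspect the tail of the word-frequency distribution:
In [ ]:
# Inspect the rarest words before filtering (illustrative only)
freq = pd.Series(' '.join(df['content']).split()).value_counts()
print(freq.tail(10))                              # mostly typos and one-off tokens
print((freq == 1).sum(), 'words appear exactly once')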
In [5]:
def encoding_emotions(df):
    # LabelEncoder orders classes alphabetically,
    # so 'happiness' becomes 0 and 'sadness' becomes 1
    lbl_enc = preprocessing.LabelEncoder()
    y = lbl_enc.fit_transform(df.sentiment.values)
    return y
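The class-to-integer mapping can be verified directly from a fitted encoder; a minimal sketch (re-fitting a throwaway encoder, since the one above is local to the function, and assuming df has been populated by the pipeline cell below):
In [ ]:
enc = preprocessing.LabelEncoder()
enc.fit(df.sentiment.values)
print(dict(zip(enc.classes_, enc.transform(enc.classes_))))
# expected: {'happiness': 0, 'sadness': 1}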
In [6]:
def split_train_test_data(df, y):
    # Stratified 90:10 train/validation split
    X_train, X_val, y_train, y_val = train_test_split(
        df.content.values, y, stratify=y, random_state=42, test_size=0.1, shuffle=True)
    return X_train, X_val, y_train, y_val
In [7]:
def extract_tfidf_parameters(X_train, X_val):
    # Extract TF-IDF features: fit on the training split only,
    # then apply the same vocabulary to the validation split
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf = TfidfVectorizer(max_features=1000, analyzer='word', ngram_range=(1, 3))
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_val_tfidf = tfidf.transform(X_val)
    return tfidf, X_train_tfidf, X_val_tfidf
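A self-contained toy example (my addition, on two made-up documents) of what TfidfVectorizer produces; `get_feature_names_out` assumes scikit-learn ≥ 1.0, older versions use `get_feature_names`:
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
demo = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
demo_matrix = demo.fit_transform(['happy day today', 'sad day today'])
print(demo.get_feature_names_out())   # unigrams, bigrams, and trigrams
print(demo_matrix.toarray().round(2)) # shared n-grams get lower idf weight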
In [8]:
def extract_countvectors_parameters(df, X_train, X_val):
    # Extract count-vector features
    from sklearn.feature_extraction.text import CountVectorizer
    count_vect = CountVectorizer(analyzer='word')
    count_vect.fit(df['content'])
    X_train_count = count_vect.transform(X_train)
    X_val_count = count_vect.transform(X_val)
    return count_vect, X_train_count, X_val_count
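Note that the vectorizer above is fitted on the full df, so validation-set vocabulary leaks into the features. A leakage-free variant (my sketch, not the notebook's approach) fits on the training split only:
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

def extract_countvectors_train_only(X_train, X_val):
    # Learn the vocabulary from the training split alone;
    # unseen validation words are simply ignored at transform time
    count_vect = CountVectorizer(analyzer='word')
    X_train_count = count_vect.fit_transform(X_train)
    X_val_count = count_vect.transform(X_val)
    return count_vect, X_train_count, X_val_count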
In [9]:
def train_vanilla_models(X_train, y_train, X_val, y_val, variable_type):
    from sklearn.metrics import accuracy_score
    # Model 1: Multinomial Naive Bayes classifier
    from sklearn.naive_bayes import MultinomialNB
    nb = MultinomialNB()
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_val)
    print(f'naive bayes {variable_type} accuracy {accuracy_score(y_val, y_pred)}')
    # Model 2: Linear SVM (hinge loss via SGD)
    from sklearn.linear_model import SGDClassifier
    lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=0.01)
    lsvm.fit(X_train, y_train)
    y_pred = lsvm.predict(X_val)
    print(f'svm using {variable_type} accuracy {accuracy_score(y_val, y_pred)}')
    # Model 3: Logistic regression
    from sklearn.linear_model import LogisticRegression
    logreg = LogisticRegression(C=1, solver='lbfgs')
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_val)
    print(f'log reg {variable_type} accuracy {accuracy_score(y_val, y_pred)}')
    # Model 4: Random forest classifier
    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier(n_estimators=500)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    print(f'random forest {variable_type} accuracy {accuracy_score(y_val, y_pred)}')
    return nb, lsvm, logreg, rf
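Accuracy alone can hide class-specific errors. A hedged sketch (my addition) of a fuller evaluation for any of the fitted models, using standard scikit-learn metrics and the happiness=0 / sadness=1 mapping confirmed earlier:
In [ ]:
from sklearn.metrics import classification_report, confusion_matrix

def report_model(model, X_val, y_val):
    # Per-class precision/recall/F1 plus the raw confusion matrix
    y_pred = model.predict(X_val)
    print(classification_report(y_val, y_pred, target_names=['happiness', 'sadness']))
    print(confusion_matrix(y_val, y_pred))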
In [10]:
def tweet_sentiment_prediction(model, count_vect):
    # Eight example statements: the first four express happiness,
    # the last four express sadness
    tweets = pd.DataFrame([
        'I am very happy today! The atmosphere looks cheerful',
        'Things are looking great. It was such a good day',
        'Success is right around the corner. Lets celebrate this victory',
        'Everything is more beautiful when you experience them with a smile!',
        'Now this is my worst, okay? But I am gonna get better.',
        'I am tired, boss. Tired of being on the road, lonely as a sparrow in the rain. I am tired of all the pain I feel',
        'This is quite depressing. I am filled with sorrow',
        'His death broke my heart. It was a sad day'
    ])
    # Apply the same preprocessing used on the training data
    tweets[0] = tweets[0].apply(lambda x: " ".join(word.lower() for word in x.split()))
    tweets[0] = tweets[0].str.replace(r'[^\w\s]', ' ', regex=True)
    stop = set(stopwords.words('english'))
    tweets[0] = tweets[0].apply(lambda x: " ".join(word for word in x.split() if word not in stop))
    tweets[0] = tweets[0].apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))
    # Extract count-vector features from the tweets
    tweet_count = count_vect.transform(tweets[0])
    # Predict the emotion of each tweet with the trained model
    tweet_pred = model.predict(tweet_count)
    print(tweet_pred)
In [11]:
df = load_unprocessed_data(FILEPATH)
df = cleanup_author_column_raw_data(df)
df = cleanup_other_emotions(df)
df = process_text_content(df)
df = cleanup_rare_words(df)
y = encoding_emotions(df)
X_train, X_val, y_train, y_val = split_train_test_data(df, y)
tfidf, X_train_tfidf, X_val_tfidf = extract_tfidf_parameters(X_train, X_val)
count_vect, X_train_count, X_val_count = extract_countvectors_parameters(df, X_train, X_val)
train_vanilla_models(X_train_tfidf, y_train, X_val_tfidf, y_val, variable_type='TF-IDF')
nb_model, lsvm_model, logreg_model, rf_model = train_vanilla_models(X_train_count,
y_train,
X_val_count,
y_val,
variable_type='COUNT-VECTORS')
tweet_sentiment_prediction(lsvm_model, count_vect)
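To read the predictions as emotion names rather than 0/1 (an addition, relying on the alphabetical LabelEncoder mapping noted earlier):
In [ ]:
# Map integer predictions back to emotion names
# (0 = happiness, 1 = sadness). tweet_pred would need to be returned from
# tweet_sentiment_prediction for this; shown with an illustrative vector.
label_names = ['happiness', 'sadness']
example_pred = [0, 0, 0, 0, 1, 1, 1, 1]
print([label_names[p] for p in example_pred])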