In [1]:
# Load libraries
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from textblob import Word
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
FILEPATH = 'text_emotion.csv'

def load_unprocessed_data(filepath):
    df = pd.read_csv(filepath)
    return df

def cleanup_author_column_raw_data(df):
    df = df.drop('author', axis=1)
    return df

def cleanup_other_emotions(df):
    df = df.drop(df[df.sentiment == 'anger'].index)
    df = df.drop(df[df.sentiment == 'boredom'].index)
    df = df.drop(df[df.sentiment == 'enthusiasm'].index)
    df = df.drop(df[df.sentiment == 'empty'].index)
    df = df.drop(df[df.sentiment == 'fun'].index)
    df = df.drop(df[df.sentiment == 'relief'].index)
    df = df.drop(df[df.sentiment == 'surprise'].index)
    df = df.drop(df[df.sentiment == 'love'].index)
    df = df.drop(df[df.sentiment == 'hate'].index)
    df = df.drop(df[df.sentiment == 'neutral'].index)
    df = df.drop(df[df.sentiment == 'worry'].index)
    return df

In [3]:
def process_text_content(df):
    #Making all letters lowercase
    df['content'] = df['content'].apply(lambda x: " ".join(x.lower() for x in x.split()))

    #Removing Punctuation, Symbols
    df['content'] = df['content'].str.replace('[^\w\s]',' ')

    #Removing Stop Words using NLTK
    from nltk.corpus import stopwords
    stop = stopwords.words('english')
    df['content'] = df['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

    #Lemmatisation
    from textblob import Word
    df['content'] = df['content'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

    #Correcting Letter Repetitions
    import re
    def de_repeat(text):
        pattern = re.compile(r"(.)\1{2,}")
        return pattern.sub(r"\1\1", text)

    #%%
    df['content'] = df['content'].apply(lambda x: " ".join(de_repeat(x) for x in x.split())) 
    return df

In [4]:
def cleanup_rare_words(df):
    # Code to find the top 10,000 rarest words (modify according to your dataset) 
    # appearing in the data
    freq = pd.Series(' '.join(df['content']).split()).value_counts()[-10000:]

    # Removing all those rarely appearing words from the data
    freq = list(freq.index)
    df['content'] = df['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
    return df

In [5]:
def encoding_emotions(df):
    #Encoding output labels 'sadness' as '1' & 'happiness' as '0'
    from sklearn import preprocessing
    lbl_enc = preprocessing.LabelEncoder()
    y = lbl_enc.fit_transform(df.sentiment.values)
    return y

In [6]:
def split_train_test_data(df, y):
    # Splitting into training and testing data in 90:10 ratio
    from sklearn.model_selection import train_test_split
    X_train, X_val, y_train, y_val = train_test_split(df.content.values, y, stratify=y, random_state=42, test_size=0.1, shuffle=True)
    return X_train, X_val, y_train, y_val

In [7]:
def extract_tfidf_parameters(X_train, X_val):
    # Extracting TF-IDF parameters
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf = TfidfVectorizer(max_features=1000, analyzer='word',ngram_range=(1,3))
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_val_tfidf = tfidf.fit_transform(X_val)
    return tfidf, X_train_tfidf, X_val_tfidf

In [8]:
def extract_countvectors_parameters(df, X_train, X_val):
    # Extracting Count Vectors Parameters
    from sklearn.feature_extraction.text import CountVectorizer
    count_vect = CountVectorizer(analyzer='word')
    count_vect.fit(df['content'])
    X_train_count =  count_vect.transform(X_train)
    X_val_count =  count_vect.transform(X_val)
    return count_vect, X_train_count, X_val_count

In [9]:
def train_vanilla_models(X_train, y_train, X_val, y_val, variable_type):
    from sklearn.metrics import accuracy_score

    # Model 1: Multinomial Naive Bayes Classifier
    from sklearn.naive_bayes import MultinomialNB
    nb = MultinomialNB()
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_val)
    print(f'naive bayes {variable_type} accuracy %s' % accuracy_score(y_pred, y_val))

    # Model 2: Linear SVM
    from sklearn.linear_model import SGDClassifier
    lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=0.01)
    lsvm.fit(X_train, y_train)
    y_pred = lsvm.predict(X_val)
    print(f'svm using {variable_type} accuracy %s' % accuracy_score(y_pred, y_val))

    # Model 3: logistic regression
    from sklearn.linear_model import LogisticRegression
    logreg = LogisticRegression(C=1, solver='lbfgs')
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_val)
    print(f'log reg {variable_type} accuracy %s' % accuracy_score(y_pred, y_val))

    # Model 4: Random Forest Classifier
    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier(n_estimators=500)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    print(f'random forest {variable_type} accuracy %s' % accuracy_score(y_pred, y_val))
    return nb, lsvm, logreg, rf

In [10]:
def twitt_sentiment_prediction(model):
    #Below are 8 random statements.
    #The first 4 depict happiness
    #The last 4 depict sadness

    tweets = pd.DataFrame([
        'I am very happy today! The atmosphere looks cheerful',
        'Things are looking great. It was such a good day',
        'Success is right around the corner. Lets celebrate this victory',
        'Everything is more beautiful when you experience them with a smile!',
        'Now this is my worst, okay? But I am gonna get better.',
        'I am tired, boss. Tired of being on the road, lonely as a sparrow in the rain. I am tired of all the pain I feel',
        'This is quite depressing. I am filled with sorrow',
        'His death broke my heart. It was a sad day'
    ])

    # Doing some preprocessing on these tweets as done before
    tweets[0] = tweets[0].str.replace('[^\w\s]',' ')

    from nltk.corpus import stopwords
    stop = stopwords.words('english')
    tweets[0] = tweets[0].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

    from textblob import Word
    tweets[0] = tweets[0].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

    # Extracting Count Vectors feature from our tweets
    tweet_count = count_vect.transform(tweets[0])

    #Predicting the emotion of the tweet using our already trained linear SVM
    tweet_pred = model.predict(tweet_count)

    print(tweet_pred)

In [11]:
df = load_unprocessed_data(FILEPATH)
df = cleanup_author_column_raw_data(df)
df = cleanup_other_emotions(df)
df = process_text_content(df)
df = cleanup_rare_words(df)
y = encoding_emotions(df)

X_train, X_val, y_train, y_val = split_train_test_data(df, y)

tfidf, X_train_tfidf, X_val_tfidf = extract_tfidf_parameters(X_train, X_val)

count_vect, X_train_count, X_val_count = extract_countvectors_parameters(df, X_train, X_val)

train_vanilla_models(X_train_tfidf, y_train, X_val_tfidf, y_val, variable_type='TF-IDF')

nb_model, lsvm_model, logreg_model, rf_model = train_vanilla_models(X_train_count,
                                                                    y_train,
                                                                    X_val_count,
                                                                    y_val,
                                                                    variable_type='COUNT-VECTORS')
twitt_sentiment_prediction(lsvm_model)


naive bayes TF-IDF accuracy 0.4720616570327553
svm using TF-IDF accuracy 0.49229287090558765
log reg TF-IDF accuracy 0.5009633911368016
random forest TF-IDF accuracy 0.4884393063583815
naive bayes COUNT-VECTORS accuracy 0.7764932562620424
svm using COUNT-VECTORS accuracy 0.7928709055876686
log reg COUNT-VECTORS accuracy 0.7842003853564548
random forest COUNT-VECTORS accuracy 0.7543352601156069
[0 0 0 0 1 1 1 1]