In [ ]:
import cPickle
from IPython.core.magic import (register_line_magic, register_cell_magic,
                                register_line_cell_magic)
from IPython.display import display
from IPython.display import HTML
import pandas as pd
import pandas.io.sql as psql
import psycopg2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline
import params
In [ ]:
# connect to database
conn = psycopg2.connect(database=params.database,
                        host=params.host,
                        port=params.port,
                        user=params.username,
                        password=params.password)
conn.autocommit = True
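In [ ]:
# The params module imported above is assumed to be a small params.py next to
# this notebook that holds the connection settings used here (placeholder
# values shown, not real credentials):
#
#     database = 'postgres'
#     host     = 'localhost'
#     port     = 5432
#     username = 'gpadmin'
#     password = '********'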
In [ ]:
# magic functions to aid interaction with PostgreSQL/GPDB/HAWQ
_df = None

@register_cell_magic
def showsql(line, cell):
    """
    Extract the code in the cell (should be valid SQL) and execute it using the
    connection object to the backend database. The resulting pandas dataframe
    is rendered inline below the cell using IPython.display.
    You'd use this for SELECT.
    """
    # Use the global connection object defined above.
    global conn
    global _df
    _df = psql.read_sql(cell, conn)
    conn.commit()
    display(_df)
    return

@register_cell_magic
def execsql(line, cell):
    """
    Extract the code in the cell (should be valid SQL) and execute it using the
    connection object to the backend database.
    You'd use this for CREATE/UPDATE/DELETE.
    """
    # Use the global connection object defined above.
    global conn
    global _df
    _df = psql.execute(cell, conn)
    conn.commit()
    return

# We delete these to avoid name conflicts for automagic to work
del execsql, showsql
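In [ ]:
%%showsql
--# sketch: quick smoke test that the connection and the showsql magic work
SELECT version();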
In [ ]:
%%execsql
DROP FUNCTION IF EXISTS mdl.train_sentiment_model(tweets text[], polarities bigint[]);
CREATE FUNCTION mdl.train_sentiment_model(tweets text[], polarities bigint[])
RETURNS bytea AS $$
import cPickle
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

def regex_preprocess(raw_tweets):
    pp_text = pd.Series(raw_tweets)

    user_pat = '(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)'
    http_pat = '(https?:\/\/(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,})'
    repeat_pat, repeat_repl = "(.)\\1\\1+", '\\1\\1'

    # normalize @mentions, URLs and repeated characters before vectorizing
    pp_text = pp_text.str.replace(pat=user_pat, repl='USERNAME')
    pp_text = pp_text.str.replace(pat=http_pat, repl='URL')
    pp_text = pp_text.str.replace(pat=repeat_pat, repl=repeat_repl)
    return pp_text

sentiment_lr = Pipeline([('count_vect', CountVectorizer(min_df=100,
                                                        ngram_range=(1, 1),
                                                        stop_words='english')),
                         ('lr', LogisticRegression())])
sentiment_lr.fit(regex_preprocess(tweets), polarities)
return cPickle.dumps(sentiment_lr)
$$ LANGUAGE plpythonu;

DROP TABLE IF EXISTS mdl.sentiment_model;
CREATE TABLE mdl.sentiment_model AS
SELECT mdl.train_sentiment_model(array_agg(text), array_agg(polarity)) model
FROM mdl.tweets_train;
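In [ ]:
%%showsql
--# sketch: confirm the pickled model was stored by checking its size in bytes
SELECT octet_length(model) AS model_size_bytes
FROM mdl.sentiment_model;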
In [ ]:
%%execsql
SELECT *
FROM mdl.sentiment_model;
DROP FUNCTION IF EXISTS mdl.apply_sentiment_model(model bytea, tweets text[]);
CREATE FUNCTION mdl.apply_sentiment_model(model bytea, tweets text[])
RETURNS float8[] AS $$
import cPickle
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

def regex_preprocess(raw_tweets):
    pp_text = pd.Series(raw_tweets)

    user_pat = '(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)'
    http_pat = '(https?:\/\/(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,})'
    repeat_pat, repeat_repl = "(.)\\1\\1+", '\\1\\1'

    # normalize @mentions, URLs and repeated characters before vectorizing
    pp_text = pp_text.str.replace(pat=user_pat, repl='USERNAME')
    pp_text = pp_text.str.replace(pat=http_pat, repl='URL')
    pp_text = pp_text.str.replace(pat=repeat_pat, repl=repeat_repl)
    return pp_text

# unpickle the trained pipeline and return the positive-class probabilities
cl = cPickle.loads(model)
X = regex_preprocess(tweets)
return cl.predict_proba(X)[:, 1]
$$ LANGUAGE plpythonu;
In [ ]:
%%showsql
SELECT unnest(tweets) tweet, unnest(mdl.apply_sentiment_model(model, tweets)) polarity
FROM
    mdl.sentiment_model,
    (SELECT array['i am so ridiculously happy!!',
                  'i am very very mad and angry',
                  'steph curry is a basketball player'] tweets) f
In [ ]:
%%showsql
--# build a table of test-set predictions
DROP TABLE IF EXISTS mdl.tweets_test_results;
CREATE TABLE mdl.tweets_test_results AS
SELECT unnest(tweets) tweet,
       round(unnest(mdl.apply_sentiment_model(model, tweets))) prediction,
       unnest(polarities) polarity
FROM
    mdl.sentiment_model,
    (SELECT array_agg(text) tweets, array_agg(greatest(polarity-3, 0)) polarities
     FROM mdl.tweets_test
     WHERE polarity != 2 --# exclude neutral tweets
    ) f1;
--# check accuracy of the model
SELECT 1 - AVG(ABS(prediction - polarity)) accuracy
FROM mdl.tweets_test_results;
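In [ ]:
# Sketch: pull the stored test-set predictions back into pandas and use the
# sklearn metrics imported at the top for a fuller picture than the single
# accuracy figure (assumes mdl.tweets_test_results was created above).
results = psql.read_sql('SELECT prediction, polarity FROM mdl.tweets_test_results', conn)
print(classification_report(results['polarity'], results['prediction']))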
In [ ]:
%%showsql
--# scratch cell: quick check of how greatest() behaves
SELECT greatest(4-1, 5)
In [ ]:
# data downloaded from http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
columns = ['polarity', 'tweetid', 'date', 'query_name', 'user', 'text']
dftrain = pd.read_csv('stanford-sentiment-twitter-data/training.1600000.processed.noemoticon.csv',
                      header=None,
                      encoding='ISO-8859-1')
dftest = pd.read_csv('stanford-sentiment-twitter-data/testdata.manual.2009.06.14.csv',
                     header=None,
                     encoding='ISO-8859-1')
dftrain.columns = columns
dftest.columns = columns
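In [ ]:
# The to_sql calls below need a SQLAlchemy engine, which this notebook never
# creates; a minimal sketch built from the same params module (assumes
# SQLAlchemy and the psycopg2 driver are installed).
from sqlalchemy import create_engine
engine = create_engine('postgresql+psycopg2://{user}:{pw}@{host}:{port}/{db}'.format(
    user=params.username, pw=params.password,
    host=params.host, port=params.port, db=params.database))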
In [ ]:
def df_add_id_train(df, is_train):
    # prepend an id column and an is_train flag (1 = training data, 0 = test data)
    df.insert(0, 'id', df.index.tolist())
    df.insert(1, 'is_train', [is_train] * df.shape[0])
    return df

# train data
dftrain_export = dftrain.copy()
dftrain_export = dftrain_export[['polarity', 'text']]
dftrain_export = df_add_id_train(dftrain_export, 1)
dftrain_export.to_sql('tweets_train', engine, schema='mdl', index=False, if_exists='replace', chunksize=10000)

# test data
dftest_export = dftest.copy()
dftest_export = dftest_export[['polarity', 'text']]
dftest_export = df_add_id_train(dftest_export, 0)
dftest_export.to_sql('tweets_test', engine, schema='mdl', index=False, if_exists='replace', chunksize=10000)
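In [ ]:
%%showsql
--# sketch: row-count check that both tables loaded (assumes the exports above completed)
SELECT 'train' AS dataset, count(*) AS n FROM mdl.tweets_train
UNION ALL
SELECT 'test', count(*) FROM mdl.tweets_test;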