This notebook implements an English-language tweet sentiment classifier. We train on the Sentiment140 corpus and aim for good accuracy on the test data, which contains positive and negative sentiment tweets. Training and test data were downloaded from here: http://help.sentiment140.com/for-students/
In [46]:
#from matplotlib import pyplot as plt
#import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
In [47]:
# Name the columns — the Sentiment140 CSVs ship without a header row.
columns = ['polarity', 'tweetid', 'date', 'query_name', 'user', 'text']
# Forward slashes work on every OS (including Windows); pass names= directly
# instead of patching .columns after the fact.
dftrain = pd.read_csv('datasets/trainingandtestdata/training.1600000.processed.noemoticon.csv',
                      header=None,
                      names=columns,
                      encoding='ISO-8859-1')
dftest = pd.read_csv('datasets/trainingandtestdata/testdata.manual.2009.06.14.csv',
                     header=None,
                     names=columns,
                     encoding='ISO-8859-1')
# Preview a few rows instead of dumping all 1.6M training tweets into the output.
dftrain.head()
Out[47]:
In [48]:
class RegexPreprocess(object):
    """Preprocess a tweet (or a collection of tweets) with regex substitutions.

    1) replace @usernames, e.g. @crawles -> USERNAME
    2) replace http(s)/www links -> URL
    3) collapse letters repeated 3+ times down to two, e.g. 'soooo' -> 'soo'

    Implements fit/transform so it can serve as the first step of an
    sklearn Pipeline.
    """
    # Raw strings: avoids invalid-escape-sequence warnings in modern Python.
    user_pat = r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)'
    http_pat = r'(https?:\/\/(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,})'
    repeat_pat, repeat_repl = r"(.)\1\1+", r"\1\1"

    def __init__(self):
        pass

    def transform(self, X):
        """Return a pd.Series of cleaned text; accepts a Series, list, or str."""
        if isinstance(X, pd.Series):
            pp_text = X
        else:
            pp_text = pd.Series(X)
        # regex=True is required: pandas >= 2.0 defaults str.replace to
        # literal (non-regex) matching.
        pp_text = pp_text.str.replace(self.user_pat, 'USERNAME', regex=True)
        pp_text = pp_text.str.replace(self.http_pat, 'URL', regex=True)
        # Bug fix: the original discarded this result, so repeated letters
        # were never actually collapsed.
        pp_text = pp_text.str.replace(self.repeat_pat, self.repeat_repl, regex=True)
        return pp_text

    def fit(self, X, y=None):
        """No-op fit, present only for sklearn Pipeline compatibility."""
        return self
In [51]:
# Full model: regex cleanup -> bag-of-words counts -> logistic regression.
preprocess = RegexPreprocess()
vectorizer = CountVectorizer(min_df=100, ngram_range=(1, 1), stop_words='english')
classifier = LogisticRegression()
sentiment_lr = Pipeline([
    ('regex_preprocess', preprocess),
    ('count_vect', vectorizer),
    ('lr', classifier),
])
sentiment_lr.fit(dftrain.text, dftrain.polarity)
Out[51]:
In [52]:
# Quick sanity check of how CountVectorizer tokenizes and counts words.
cv = CountVectorizer()
demo_docs = ["hi man how are you,you are cazy", "i hope you are good"]
xtraintestcv = cv.fit_transform(demo_docs)
xtraintestcv.toarray()
Out[52]:
In [50]:
# get_feature_names() was removed in scikit-learn 1.2; the replacement is
# get_feature_names_out().
cv.get_feature_names_out()
Out[50]:
In [53]:
# Evaluate on the manually-labelled test set, keeping only positive (4) and
# negative (0) tweets — polarity 2 means neutral, which the model never saw.
binary_mask = dftest.polarity != 2
Xtest = dftest.text[binary_mask]
ytest = dftest.polarity[binary_mask]
print(classification_report(ytest, sentiment_lr.predict(Xtest)))
In [54]:
import dill

# Persist the fitted pipeline. A context manager guarantees the file handle
# is closed even if dill.dump raises. (The original also created an unused
# RegexPreprocess instance here; removed.)
with open('twitter_sentiment_model.pkl', 'wb') as f:
    dill.dump(sentiment_lr, f)
In [56]:
# Smoke test: reload the pickled pipeline and score a couple of tweets.
# NOTE(security): dill.load, like pickle, executes arbitrary code from the
# file — only ever load pickles you created yourself.
with open('twitter_sentiment_model.pkl', 'rb') as f:
    cl = dill.load(f)
print(cl.predict_proba("Hello big beautiful world"))
print(cl.predict_proba("you are too bad man"))
In [ ]: