In [ ]:
import csv
import re

# Peek at a few rows: a sentiment label plus the raw tweet text
with open("SAsubset.csv", "r") as subsetData:
    for row in csv.DictReader(subsetData):
        print(row['Sentiment'], row['SentimentText'])
Typical noisy data: HTML entities, stray punctuation, and inconsistent casing.
In [ ]:
def getData(csvFname):
    sent = []
    tweet = []
    with open(csvFname, "r") as dataSource:
        for row in csv.DictReader(dataSource):
            sent.append(row['Sentiment'])
            tweet.append(row['SentimentText'])
    return sent, tweet
In [ ]:
sent, tweet = getData("SAsubset.csv")
# scipy.stats.itemfreq was removed in SciPy 1.3; Counter gives the same label frequencies
from collections import Counter
Counter(sent)
In [ ]:
tweet
Ballpark preprocessing: HTML-unescape, lowercase, and strip all punctuation.
In [ ]:
tweet[15]
In [ ]:
import html  # Python 3 replacement for HTMLParser().unescape
print(html.unescape(tweet[15]))
In [ ]:
re.sub(r"[^\w\s]", " ", html.unescape(tweet[15])).lower()
Modify getData a little and run it on the 200K-tweet dataset.
In [ ]:
def getData(csvFname):
    corpus = []
    # errors="replace" sidesteps the occasional malformed byte in this noisy dump
    with open(csvFname, "r", encoding="utf-8", errors="replace") as dataSource:
        for row in csv.DictReader(dataSource):
            try:
                # Unescape HTML entities, keep only letters and whitespace, lowercase
                corpus.append({"tweet": re.sub(r"[^a-zA-Z\s]", " ", html.unescape(row['SentimentText'])).lower(),
                               "sent": int(row['Sentiment'])})
            except (KeyError, TypeError, ValueError):
                # Skip malformed rows instead of crashing on noisy data
                continue
    return corpus

corpus = getData("SA200K.csv")
In [ ]:
print(len(corpus))
print(corpus[2])
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer
In [ ]:
X = vectorizer.fit_transform([item['tweet'] for item in corpus])
X
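A quick look at the document-term matrix: rows are tweets, columns are vocabulary terms (a small added sanity check; the exact numbers depend on the data):
In [ ]:
print(X.shape)
print("nonzero entries: %d" % X.nnz)  # sparse storage keeps only the nonzero counts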
In [ ]:
# X.toarray()  # densifying ~200K tweets x full vocabulary would exhaust memory; keep X sparse
In [ ]:
vectorizer.get_feature_names_out()  # formerly get_feature_names(), removed in scikit-learn 1.2
In [ ]:
y = [item['sent'] for item in corpus]
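Before splitting, it is worth checking the class balance of the full corpus (a quick extra check using collections.Counter):
In [ ]:
from collections import Counter
Counter(y)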
Randomly split X and y into training and test sets.
In [ ]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1697)
In [ ]:
X_train
In [ ]:
X_test
In [ ]:
y_train
In [ ]:
from sklearn.naive_bayes import MultinomialNB
hx_nb = MultinomialNB()
In [ ]:
hx_nb.fit(X_train, y_train)
In [ ]:
hx_nb.predict(X_train)
Evaluate the effectiveness of hx_nb using the F1 score.
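For reference, F1 is the harmonic mean of precision $P$ and recall $R$: $$F_1 = \frac{2PR}{P + R}$$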
In [ ]:
from sklearn.metrics import confusion_matrix, f1_score
In [ ]:
print(confusion_matrix(y_train, hx_nb.predict(X_train)))
print(f1_score(y_train, hx_nb.predict(X_train)))
Do the same on the test set.
In [ ]:
print(confusion_matrix(y_test, hx_nb.predict(X_test)))
print(f1_score(y_test, hx_nb.predict(X_test)))
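For a fuller per-class breakdown, scikit-learn's classification_report prints precision, recall, and F1 together (an optional extra, not in the original run):
In [ ]:
from sklearn.metrics import classification_report
print(classification_report(y_test, hx_nb.predict(X_test)))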
Classify a new tweet
In [ ]:
newTweetFeatureVector = vectorizer.transform(["I feel so bad now. Let's go to hell!"])
In [ ]:
newTweetFeatureVector
In [ ]:
hx_nb.predict(newTweetFeatureVector)
In [ ]:
newTweetFeatureVector = vectorizer.transform(["scikit learn is so cool!"])
hx_nb.predict(newTweetFeatureVector)
In [ ]:
newTweetFeatureVector = vectorizer.transform(["I am feeling not good with scikit learn"])
hx_nb.predict(newTweetFeatureVector)
In [ ]:
hx_nb.predict_proba(newTweetFeatureVector)
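predict_proba returns one probability column per class, in the order of hx_nb.classes_; here is a small sketch of reading them explicitly (assuming, as is typical for this dataset, 0 = negative and 1 = positive):
In [ ]:
# Pair each class label with its predicted probability
for label, p in zip(hx_nb.classes_, hx_nb.predict_proba(newTweetFeatureVector)[0]):
    print(label, round(p, 4))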
Logistic regression with regularization. In scikit-learn, C is the inverse of the regularization strength (smaller C means stronger regularization); training scales roughly linearly with the number of samples, $$\sim O(n)$$
In [ ]:
from sklearn.linear_model import LogisticRegression
In [ ]:
hx_log = LogisticRegression(C=0.6)
In [ ]:
hx_log.fit(X_train, y_train)
In [ ]:
confusion_matrix(y_train, hx_log.predict(X_train))
In [ ]:
print "Training set F1: %s" %f1_score(y_train, hx_log.predict(X_train))
print "Test set F1: %s" %f1_score(y_test, hx_log.predict(X_test))
In [ ]:
bigramvect = CountVectorizer(ngram_range=(1, 2))  # unigrams plus bigrams
In [ ]:
X_bi = bigramvect.fit_transform([item['tweet'] for item in corpus])
In [ ]:
X_bi
In [ ]:
X
In [ ]:
X_train_bi, X_test_bi, y_train_bi, y_test_bi = train_test_split(X_bi, y, test_size = 0.3, random_state=1697)
In [ ]:
bnb = MultinomialNB()
bi_nbhx = bnb.fit(X_train_bi, y_train_bi)
In [ ]:
confusion_matrix(y_train_bi, bi_nbhx.predict(X_train_bi))
In [ ]:
f1_score(y_train_bi, bi_nbhx.predict(X_train_bi))
In [ ]:
f1_score(y_test_bi, bi_nbhx.predict(X_test_bi))
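Putting the unigram and unigram+bigram test F1 side by side makes the comparison explicit (a small convenience cell reusing the models fitted above):
In [ ]:
print("unigram test F1:        %.4f" % f1_score(y_test, hx_nb.predict(X_test)))
print("unigram+bigram test F1: %.4f" % f1_score(y_test_bi, bi_nbhx.predict(X_test_bi)))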
In [ ]:
newTweetFeatureVector = bigramvect.transform(["I am feeling not good with scikit learn"])
bi_nbhx.predict(newTweetFeatureVector)[0]