In [1]:
import pandas as pd
import time
import warnings
warnings.filterwarnings('ignore')
start_time = time.time()
# skip the handful of malformed rows in the CSV instead of raising
dataset = pd.read_csv('data/Sentiment Analysis Dataset.csv', error_bad_lines=False)
# drop the columns we will not use
del dataset['ItemID']
del dataset['SentimentSource']
elapsed_time = time.time() - start_time
print elapsed_time
print dataset.shape
print len(dataset)
dataset.head()
Out[1]:
In [2]:
# work on a reference to the full dataset (note: this is a reference, not a copy)
test_train = dataset
test_train.head()
Out[2]:
In [3]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import HTMLParser # In Python 3.4+ import html
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
ps = PorterStemmer()
def Clean(unescaped_tweet):
    '''Take a raw tweet and return a list of lower-cased word tokens.'''
    tokenizer = RegexpTokenizer(r'\w+')
    # tokenize on word characters
    cleaned_tweet_tokens = tokenizer.tokenize(unescaped_tweet.lower())
    # optional: remove stop words
    #cleaned_tweet_tokens = [word for word in cleaned_tweet_tokens if word not in stopwords.words('english')]
    # optional: stem each token
    #cleaned_tweet_tokens = [ps.stem(w) for w in cleaned_tweet_tokens]
    return cleaned_tweet_tokens
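A quick sanity check of Clean on a made-up tweet (the string below is purely illustrative, not from the dataset):
In [ ]:
# illustrative only: Clean lowercases the tweet and keeps runs of word characters
print(Clean('I LOVE this <3 #happy http://t.co/xyz'))
# expected: ['i', 'love', 'this', '3', 'happy', 'http', 't', 'co', 'xyz']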
In [4]:
# start_time = time.time()
# test_train['token'] = test_train['SentimentText'].apply(lambda tweet: Clean(tweet))
# test_train.head()
# elapsed_time = time.time() - start_time
# print elapsed_time
In [5]:
test_train.head()
Out[5]:
In [ ]:
In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
import nltk
stemmer = SnowballStemmer('english')
analyzer = TfidfVectorizer().build_analyzer()
def stemmed_words(doc):
    '''Analyzer that lowercases/tokenizes a document and stems every token.'''
    return (stemmer.stem(w) for w in analyzer(doc))

vect = TfidfVectorizer(analyzer=stemmed_words,
                       # note: because a callable analyzer is supplied, scikit-learn ignores
                       # the tokenizer and stop_words arguments below
                       tokenizer=nltk.tokenize.casual.TweetTokenizer().tokenize,
                       stop_words='english',
                       #min_df = 0.001,  # drop words appearing in fewer than 0.1% of tweets
                       #max_df = 0.1     # drop words appearing in more than 10% of tweets
                       )
#test stemmer
#print(vect.fit_transform(sm_set.head()[:10]))
#print(vect.get_feature_names())
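A minimal check of the stemming analyzer on a throwaway sentence (illustrative only; the exact stems depend on the Snowball rules):
In [ ]:
# illustrative: the default analyzer lowercases and tokenizes, then each token is stemmed
print(list(stemmed_words('loving these stemmed features')))
# e.g. 'loving' -> 'love', 'stemmed' -> 'stem', 'features' -> 'featur'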
In [6]:
from sklearn.utils import shuffle
In [11]:
# keep only a random sample of 100,000 shuffled tweets
sm_set = pd.DataFrame(shuffle(test_train)[:100000]).reset_index(drop=True)
In [12]:
sm_set.SentimentText.head()
Out[12]:
In [14]:
# based on the text of each tweet, create a (sparse) matrix containing the occurrences of each word and store it in X.
# this is going to be our feature matrix, which we will feed to the classifier to "learn" the sentiment.

# use the full dataset here (this overrides the 100,000-tweet sample above)
sm_set = test_train  #[:100000]
start_time = time.time()
# fit_transform builds the vocabulary and returns the tf-idf feature matrix of the tweets
X = vect.fit_transform(sm_set.SentimentText)
y = sm_set.Sentiment
elapsed_time = time.time() - start_time
print elapsed_time, 'sec to fit transform', len(sm_set), 'samples'
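As a toy illustration of what fit_transform produces (the strings below are made up, not from the dataset):
In [ ]:
# toy example: fit_transform returns a sparse (n_samples x n_features) tf-idf matrix
toy = TfidfVectorizer().fit_transform(['good day', 'bad day'])
print(toy.shape)       # (2, 3): vocabulary is ['bad', 'day', 'good']
print(toy.toarray())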
In [ ]:
In [15]:
print len(vect.get_feature_names())
print vect.get_feature_names()
In [16]:
from sklearn.model_selection import train_test_split
In [17]:
#split the dataset in a training (X_train, y_train) and test dataset (X_test,y_test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)
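A quick check of the resulting split sizes (sketch):
In [ ]:
# sketch: number of tweets in the training and test splits
print('%i train / %i test' % (X_train.shape[0], X_test.shape[0]))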
In [ ]:
In [ ]:
In [18]:
from sklearn.naive_bayes import MultinomialNB
In [19]:
#train the classifier
start_time = time.time()
clf = MultinomialNB()
clf.fit(X_train, y_train)
elapsed_time = time.time() - start_time
print elapsed_time
In [20]:
# print roughly 40 evenly spaced feature names (integer division under Python 2)
print vect.get_feature_names()[::len(vect.get_feature_names())/40]
In [21]:
len(vect.get_feature_names())
Out[21]:
In [22]:
vect
Out[22]:
In [23]:
from sklearn.metrics import classification_report
In [24]:
print "Results for %i training samples and %i test samples (trained on %f sec)" %(len(y_train),len(y_test),elapsed_time)
print classification_report(y_test,clf.predict(X_test))
In [ ]:
By now, vect has a vocabulary that includes every word in the corpus; we will trim it with a statistical feature-selection method such as chi2.
In [ ]:
In [25]:
from sklearn.feature_selection import chi2,f_classif,SelectPercentile
In [26]:
import matplotlib.pyplot as plt
In [27]:
y_train_bool = map(lambda x: x == 1, y_train)  # boolean labels (under Python 2, map returns a list)
In [28]:
pd.Series(f_classif(X_train,y_train_bool)[0]).plot()  # ANOVA F-scores per feature
plt.show()
In [29]:
pd.Series(f_classif(X_train,y_train_bool)[1]).plot()  # corresponding p-values per feature
plt.show()
In [ ]:
In [ ]:
In [30]:
# keep only the top 1% of features ranked by chi2 score
selector = SelectPercentile(chi2, percentile=1)
In [31]:
selector.fit(X_train,y_train)
Out[31]:
In [32]:
clf.fit(selector.transform(X_train),y_train)
Out[32]:
In [33]:
#predict
results = clf.predict(selector.transform(X_test))
print classification_report(y_test,results)
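To see what the chi2 trimming actually kept, we can (as a sketch) map the selector's support mask back onto the vectorizer's feature names:
In [ ]:
# sketch: list a few of the terms kept by the fitted chi2 selector
import numpy as np
feature_names = np.array(vect.get_feature_names())
kept = feature_names[selector.get_support()]
print(len(kept))
print(kept[:20])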
In [ ]:
In [ ]:
Logistic Regression
In [ ]:
from sklearn.linear_model import LogisticRegression
In [ ]:
clf = LogisticRegression(penalty='l2')
In [ ]:
# fit a feature selector on our data (keep the top 10% of features by chi2 score)
selector = SelectPercentile(chi2, percentile=10)
selector.fit(X_train,y_train)
In [ ]:
#fit the model
clf.fit(selector.transform(X_train),y_train)
In [ ]:
#predict
results = clf.predict(selector.transform(X_test))
print classification_report(y_test,results)
In [ ]:
In [ ]:
sample_text = ['Aris is a bit dubtful about me being a smart ass',
               'aris doesnt love sklearn yet',
               'but he will definitely love it soon',
               'fuck', 'bad', 'amazing', 'this is a sentence',
               'this is a bad sentence']
In [ ]:
vect.transform(sample_text)
In [ ]:
# clf was last fitted on the chi2-selected features, so the selector must be applied here too
predictions = clf.predict(selector.transform(vect.transform(sample_text)))
for sent, pred in zip(sample_text, predictions):
    print sent, pred
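Both classifiers used here also expose predict_proba, so we can print a class-probability estimate next to each sample sentence (sketch, assuming clf and selector are fitted as above):
In [ ]:
# sketch: class probabilities for the sample sentences
probs = clf.predict_proba(selector.transform(vect.transform(sample_text)))
for sent, p in zip(sample_text, probs):
    print('%s %s' % (sent, p))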
In [ ]:
In [ ]:
In [405]:
time.localtime()[1:5]
Out[405]:
In [278]:
#save the clf classifier in a file to load it in a different notebook/at a different time
timestr = "%i-%i_%i,%i"%time.localtime()[1:5]
from sklearn.externals import joblib
joblib.dump(clf, 'trained models/'+'descr'+timestr+'.pkl')
joblib.dump(vect, 'trained models/vect'+'descr'+timestr+'.pkl')
#load it later with:
#clf = joblib.load('NaiveBayesCl_67k_tweets.pkl')
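A sketch of reloading the persisted objects (it reuses the timestamped names produced above; note that the chi2 selector would also need to be dumped and loaded to reproduce the full pipeline):
In [ ]:
# sketch: reload the saved vectorizer and classifier using the same timestamped names
clf_loaded = joblib.load('trained models/'+'descr'+timestr+'.pkl')
vect_loaded = joblib.load('trained models/vect'+'descr'+timestr+'.pkl')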
In [ ]:
import pickle
s = pickle.dumps(vect)
vec2 = pickle.loads(s)
In [ ]: