In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import hazm
import os
In [3]:
import nazarkav as nk
data_path = os.path.join(nk.__path__[0], 'data')
# data_path = '../nazarkav/data'
In [4]:
hotel_pol = pd.read_csv(os.path.join(data_path, 'hotel-polarity.tsv'), sep='\t')
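A quick sanity check on the loaded data (a minimal sketch, assuming the TSV has the 'comment' and 'c' columns used below):
In [ ]:
# Peek at the first rows and the label distribution
# (assumes 'comment' holds the review text and 'c' the polarity label, as used below)
print(hotel_pol.shape)
print(hotel_pol.head())
print(hotel_pol['c'].value_counts())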
In [5]:
hotel_comment = hotel_pol['comment'].tolist()
In [6]:
# Bag-of-words features: unigrams to trigrams over cleaned, tokenized comments
vectorizer = CountVectorizer(ngram_range=(1, 3),
                             tokenizer=nk.Preprocessor(stem=False).tokenize,
                             preprocessor=nk.Cleaner().clean,
                             max_features=30000)
train_data_features = vectorizer.fit_transform(hotel_comment)
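To confirm what the vectorizer produced, one can inspect the sparse document-term matrix (a small check using only the objects created above):
In [ ]:
# (n_comments, n_features) with at most 30,000 n-gram features
print(train_data_features.shape)
print(len(vectorizer.vocabulary_))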
In [13]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Keep the 10,000 n-grams with the highest chi-squared score w.r.t. the polarity labels;
# the selector is kept so the selected columns can be mapped back to the vocabulary below
selector = SelectKBest(chi2, k=10000)
X_new = selector.fit_transform(train_data_features, hotel_pol["c"].tolist())
In [14]:
X_new.shape
Out[14]:
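The fitted selector also exposes the chi-squared score of every original n-gram, so the terms most associated with the labels can be listed directly (a sketch using the `selector` object kept above):
In [ ]:
# Top 10 n-grams by chi-squared score against the polarity labels
scores = selector.scores_
vocab_all = np.array(vectorizer.get_feature_names())
sorted(zip(scores, vocab_all), reverse=True)[:10]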
In [15]:
# Show the 10 most frequent n-grams among the selected features
# Term frequency across the corpus
dist = X_new.sum(axis=0)
# Convert matrix to array
dist = np.squeeze(np.asarray(dist))
# Map the selected columns back to their n-grams in the vocabulary
vocab = np.array(vectorizer.get_feature_names())[selector.get_support()]
sorted(zip(dist, vocab), reverse=True)[:10]
Out[15]:
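TfidfVectorizer is imported above but never used; as a variation, the same features can be built with TF-IDF weighting instead of raw counts (a sketch mirroring the CountVectorizer settings, not something evaluated in this notebook):
In [ ]:
# TF-IDF weighted n-gram features with the same cleaning and tokenization
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                                   tokenizer=nk.Preprocessor(stem=False).tokenize,
                                   preprocessor=nk.Cleaner().clean,
                                   max_features=30000)
train_data_tfidf = tfidf_vectorizer.fit_transform(hotel_comment)
train_data_tfidf.shape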
In [50]:
# Feature reduction
# from sklearn.decomposition import TruncatedSVD
# from sklearn.preprocessing import Normalizer
# lsa = TruncatedSVD(2000, algorithm = 'randomized')
# dtm_lsa = lsa.fit_transform(X_new)
# dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)
In [55]:
from sklearn.ensemble import RandomForestClassifier
# Note: sklearn.cross_validation was removed in scikit-learn 0.20;
# use sklearn.model_selection in newer versions
from sklearn import cross_validation as cv
X_train, X_test, y_train, y_test = cv.train_test_split(
    X_new,
    hotel_pol["c"].tolist(),
    test_size=0.3,
    random_state=0)
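A small check that the split came out as expected (sketch; uses only the standard library and the split created above):
In [ ]:
from collections import Counter
# Split sizes and label balance in train/test
print(X_train.shape, X_test.shape)
print(Counter(y_train), Counter(y_test))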
In [59]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
# Initialize a Multinomial Naive Bayes classifier
clf = MultinomialNB()
# Estimate accuracy with 3-fold cross-validation, using the selected
# bag-of-words features and the sentiment labels as the response variable
cv.cross_val_score(clf, X_new, hotel_pol["c"].tolist(), cv=3).mean()
Out[59]:
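The `svm` module is imported above but never used; for comparison, the same cross-validation can be run with a linear SVM (a sketch, with scores not reported in the original notebook):
In [ ]:
# Same 3-fold cross-validation with a linear SVM for comparison
svm_clf = svm.LinearSVC(random_state=0)
cv.cross_val_score(svm_clf, X_new, hotel_pol["c"].tolist(), cv=3).mean()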
In [60]:
# Fit on the training split and report held-out accuracy
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Out[60]:
In [61]:
from sklearn.metrics import classification_report
y_pred = clf.predict(X_test)
# Note: target_names must follow the sorted order of the class labels (clf.classes_)
target_names = ['pos', 'neg']
print(classification_report(y_test, y_pred, target_names=target_names))
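To see where the errors fall, a confusion matrix on the same held-out predictions (rows and columns follow the sorted class labels):
In [ ]:
from sklearn.metrics import confusion_matrix
# Rows: true labels, columns: predicted labels, in sorted label order (clf.classes_)
print(clf.classes_)
print(confusion_matrix(y_test, y_pred))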
In [ ]: