In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import hazm
import os
In [3]:
import nazarkav as nk

# Resolve the "data" directory that sits inside the installed nazarkav package.
package_root = nk.__path__[0]
data_path = os.path.join(package_root, 'data')
In [4]:
# Read the tab-separated hotel polarity dataset from the package data directory.
hotel_polarity_file = os.path.join(data_path, 'hotel-polarity.tsv')
hotel_pol = pd.read_csv(hotel_polarity_file, sep='\t')
In [5]:
# Pull the raw review texts out of the frame as a plain Python list.
hotel_comment = hotel_pol.loc[:, 'comment'].tolist()
In [14]:
# TF-IDF features over word 1- to 3-grams, with the vocabulary capped at
# 50,000 terms. Tokenization (without stemming) and text cleaning are
# delegated to nazarkav.
tokenizer = nk.Preprocessor(stem=False)
cleaner = nk.Cleaner()
vectorizer = TfidfVectorizer(
    preprocessor=cleaner.clean,
    tokenizer=tokenizer.tokenize,
    ngram_range=(1, 3),
    max_features=50000,
)

# Learn the vocabulary from every comment and vectorize them in one pass.
train_data_features = vectorizer.fit_transform(hotel_comment)
train_data_features.shape
Out[14]:
In [15]:
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 15,000 TF-IDF columns that score highest on the chi-squared test
# against the labels stored in column "c".
polarity = hotel_pol["c"].tolist()
selector = SelectKBest(chi2, k=15000)
X_new = selector.fit_transform(train_data_features, polarity)
In [16]:
# Display the dimensions of the feature matrix produced by SelectKBest.
X_new.shape
Out[16]:
In [17]:
# Show the 10 highest-weighted terms in the dataset.
#
# The scores have to come from the matrix whose columns the vocabulary
# describes. `vectorizer` names the columns of `train_data_features`; the
# chi2-reduced `X_new` holds only a subset of those columns, so pairing its
# column sums with the full vocabulary would attach scores to the wrong terms.

# Total TF-IDF weight of each term across all comments (these are weights from
# TfidfVectorizer, not raw occurrence counts).
dist = train_data_features.sum(axis=0)
# The sparse column sum is a 1 x n_terms matrix; flatten it to a 1-D array.
dist = np.squeeze(np.asarray(dist))

# Vocabulary in column order. `get_feature_names` was removed in
# scikit-learn 1.2 in favour of `get_feature_names_out`; support both.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = [str(term) for term in vectorizer.get_feature_names_out()]
else:
    vocab = vectorizer.get_feature_names()

# Guard against the scores and the names drifting apart again.
assert len(dist) == len(vocab), "scores and vocabulary must describe the same columns"

sorted(zip(dist, vocab), reverse=True)[:10]
Out[17]:
In [18]:
from sklearn.ensemble import RandomForestClassifier
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20. `model_selection` provides the same `train_test_split` and
# `cross_val_score` entry points, so it is bound to the same `cv` alias that
# the later cells use. The fallback keeps very old installations working.
try:
    from sklearn import model_selection as cv
except ImportError:  # scikit-learn < 0.18
    from sklearn import cross_validation as cv

# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
# NOTE(review): X_new comes from a chi2 selection fitted on every row and its
# label, including rows that end up in the test split, so the held-out score
# may be optimistic.
X_train, X_test, y_train, y_test = cv.train_test_split(
    X_new,
    hotel_pol["c"].tolist(),
    test_size=0.3,
    random_state=0)
In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm

# Linear support-vector classifier with scikit-learn's default settings.
clf = svm.LinearSVC()

# Estimate performance with 3-fold cross-validation over the whole selected
# feature matrix and report the mean of the per-fold scores.
labels = hotel_pol["c"].tolist()
fold_scores = cv.cross_val_score(clf, X_new, labels, cv=3)
fold_scores.mean()
Out[19]:
In [20]:
# Train on the 70% split (fit returns the estimator itself, so `clf` stays
# bound to the same, now fitted, object) ...
clf.fit(X_train, y_train)
# ... and report its score on the held-out 30%.
clf.score(X_test, y_test)
Out[20]:
In [21]:
from sklearn.metrics import classification_report

# Predict the held-out rows with the fitted classifier.
y_pred = clf.predict(X_test)

# classification_report assigns `target_names` positionally to the labels in
# sorted order, so a hard-coded ['pos', 'neg'] can silently swap the rows
# (for string labels, 'neg' sorts before 'pos'). Take both the labels and
# their display names from the fitted classifier so they cannot disagree.
report_labels = clf.classes_
target_names = [str(label) for label in report_labels]
print(classification_report(y_test, y_pred,
                            labels=report_labels,
                            target_names=target_names))
In [ ]: