In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import hazm
import os

In [3]:
import nazarkav as nk
data_path = os.path.join(nk.__path__[0], 'data')

In [4]:
hotel_pol = pd.read_csv(os.path.join(data_path, 'hotel-polarity.tsv'), sep='\t')

In [5]:
hotel_comment = hotel_pol['comment'].tolist()

In [14]:
vectorizer = TfidfVectorizer(ngram_range=(1,3),
    tokenizer=nk.Preprocessor(stem=False).tokenize,
    preprocessor=nk.Cleaner().clean,
    max_features=50000)
train_data_features = vectorizer.fit_transform(hotel_comment)
train_data_features.shape


Out[14]:
(4000, 50000)

In [15]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X_new = SelectKBest(chi2, k=15000).fit_transform(train_data_features, hotel_pol["c"].tolist())

In [16]:
X_new.shape


Out[16]:
(4000, 15000)

In [17]:
# Show 10 most frequent words in dataset
# Word frequence
dist = X_new.sum(axis=0)
# Convert matrix to array
dist = np.squeeze(np.asarray(dist))
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()

sorted(zip(dist, vocab), reverse=True)[:10]


Out[17]:
[(289.74408041993263, 'بود که چون'),
 (190.59178532313305, 'استفاده نکنید و'),
 (158.52230791703607, 'تلویزیون و یخچال'),
 (117.71030086040444, 'اخرین روز'),
 (102.01167259333252, 'بود ملحفه\u200cها و'),
 (86.917733149742119, 'اومدیم کیش و'),
 (86.138422086576426, 'اگه میخوای'),
 (84.690145802809724, 'به انتخاب هتل'),
 (78.620543421771217, 'آدم'),
 (70.262086466336882, 'از مزایای آن')]

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation as cv
X_train, X_test, y_train, y_test = cv.train_test_split(
    X_new,
    hotel_pol["c"].tolist(), 
    test_size=0.3, 
    random_state=0)

In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm

# Initialize a Random Forest classifier with 100 trees
clf = svm.LinearSVC()

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run



cv.cross_val_score( clf, X_new, hotel_pol["c"].tolist(), cv=3).mean()


Out[19]:
0.92249971110540818

In [20]:
clf = clf.fit( X_train, y_train )
clf.score(X_test, y_test)


Out[20]:
0.93166666666666664

In [21]:
from sklearn.metrics import classification_report
y_pred = clf.predict(X_test)
target_names = ['pos', 'neg']
print(classification_report(y_test, y_pred, target_names=target_names))


             precision    recall  f1-score   support

        pos       0.92      0.94      0.93       599
        neg       0.94      0.92      0.93       601

avg / total       0.93      0.93      0.93      1200


In [ ]: