In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import hazm
import os

In [3]:
import nazarkav as nk
data_path = os.path.join(nk.__path__[0], 'data')
# data_path = '../nazarkav/data

In [4]:
hotel_pol = pd.read_csv(os.path.join(data_path, 'hotel-polarity.tsv'), sep='\t')

In [5]:
hotel_comment = hotel_pol['comment'].tolist()

In [6]:
vectorizer = CountVectorizer(ngram_range=(1,3),
    tokenizer=nk.Preprocessor(stem=False).tokenize,
    preprocessor=nk.Cleaner().clean,
    max_features=30000)
train_data_features = vectorizer.fit_transform(hotel_comment)

In [13]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X_new = SelectKBest(chi2, k=10000).fit_transform(train_data_features, hotel_pol["c"].tolist())

In [14]:
X_new.shape


Out[14]:
(4000, 10000)

In [15]:
# Show 10 most frequent words in dataset
# Word frequence
dist = X_new.sum(axis=0)
# Convert matrix to array
dist = np.squeeze(np.asarray(dist))
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()

sorted(zip(dist, vocab), reverse=True)[:10]


Out[15]:
[(23490, 'بودیم اتاق'),
 (13878, 'اسلامی'),
 (12734, 'افتضاح بود در'),
 (11651, 'جرات میتونم بگم'),
 (10775, 'اینه که تا'),
 (7690, 'از اسکله'),
 (5853, 'بود ولی بعضی'),
 (5667, 'از حوله'),
 (4917, 'از بابت'),
 (4188, 'این تجربه')]

In [50]:
# Feature reduction
# from sklearn.decomposition import TruncatedSVD
# from sklearn.preprocessing import Normalizer
# lsa = TruncatedSVD(2000, algorithm = 'randomized')
# dtm_lsa = lsa.fit_transform(X_new)
# dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)

In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation as cv
X_train, X_test, y_train, y_test = cv.train_test_split(
    X_new,
    hotel_pol["c"].tolist(), 
    test_size=0.3, 
    random_state=0)

In [59]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm

# Initialize a Random Forest classifier with 100 trees
clf = MultinomialNB()

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run



cv.cross_val_score( clf, X_new, hotel_pol["c"].tolist(), cv=3).mean()


Out[59]:
0.9207475841658751

In [60]:
clf = clf.fit( X_train, y_train )
clf.score(X_test, y_test)


Out[60]:
0.9341666666666667

In [61]:
from sklearn.metrics import classification_report
y_pred = clf.predict(X_test)
target_names = ['pos', 'neg']
print(classification_report(y_test, y_pred, target_names=target_names))


             precision    recall  f1-score   support

        pos       0.94      0.93      0.93       599
        neg       0.93      0.94      0.93       601

avg / total       0.93      0.93      0.93      1200


In [ ]: