notebook.community

Edit and run



In [2]:

    
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import hazm
import os



In [3]:

    
import nazarkav as nk
data_path = os.path.join(nk.__path__[0], 'data')



In [4]:

    
hotel_pol = pd.read_csv(os.path.join(data_path, 'hotel-polarity.tsv'), sep='\t')



In [5]:

    
hotel_comment = hotel_pol['comment'].tolist()



In [14]:

    
vectorizer = TfidfVectorizer(ngram_range=(1,3),
    tokenizer=nk.Preprocessor(stem=False).tokenize,
    preprocessor=nk.Cleaner().clean,
    max_features=50000)
train_data_features = vectorizer.fit_transform(hotel_comment)
train_data_features.shape









    Out[14]:





(4000, 50000)



In [15]:

    
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X_new = SelectKBest(chi2, k=15000).fit_transform(train_data_features, hotel_pol["c"].tolist())



In [16]:

    
X_new.shape









    Out[16]:





(4000, 15000)



In [17]:

    
# Show 10 most frequent words in dataset
# Word frequence
dist = X_new.sum(axis=0)
# Convert matrix to array
dist = np.squeeze(np.asarray(dist))
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()

sorted(zip(dist, vocab), reverse=True)[:10]









    Out[17]:





[(289.74408041993263, 'بود که چون'),
 (190.59178532313305, 'استفاده نکنید و'),
 (158.52230791703607, 'تلویزیون و یخچال'),
 (117.71030086040444, 'اخرین روز'),
 (102.01167259333252, 'بود ملحفه\u200cها و'),
 (86.917733149742119, 'اومدیم کیش و'),
 (86.138422086576426, 'اگه میخوای'),
 (84.690145802809724, 'به انتخاب هتل'),
 (78.620543421771217, 'آدم'),
 (70.262086466336882, 'از مزایای آن')]



In [18]:

    
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation as cv
X_train, X_test, y_train, y_test = cv.train_test_split(
    X_new,
    hotel_pol["c"].tolist(), 
    test_size=0.3, 
    random_state=0)



In [19]:

    
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm

# Initialize a Random Forest classifier with 100 trees
clf = svm.LinearSVC()

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run



cv.cross_val_score( clf, X_new, hotel_pol["c"].tolist(), cv=3).mean()









    Out[19]:





0.92249971110540818



In [20]:

    
clf = clf.fit( X_train, y_train )
clf.score(X_test, y_test)









    Out[20]:





0.93166666666666664



In [21]:

    
from sklearn.metrics import classification_report
y_pred = clf.predict(X_test)
target_names = ['pos', 'neg']
print(classification_report(y_test, y_pred, target_names=target_names))









    



             precision    recall  f1-score   support

        pos       0.92      0.94      0.93       599
        neg       0.94      0.92      0.93       601

avg / total       0.93      0.93      0.93      1200



In [ ]: