notebook.community

Edit and run



In [1]:

    
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import pandas as pd
import numpy as np
import hazm
import os



In [2]:

    
import nazarkav as nk
data_path = os.path.join(nk.__path__[0], 'data')



In [3]:

    
hotel_pol = pd.read_csv(os.path.join(data_path, 'hotel-polarity.tsv'), 
                        sep='\t')



In [4]:

    
hotel_comment = hotel_pol['comment'].tolist()



In [5]:

    
# Binary bag of words representation
vectorizer = CountVectorizer(
    binary=True,
    tokenizer=nk.Preprocessor().tokenize,
    preprocessor=nk.Preprocessor().clean,
    max_features=20)
train_data_features = vectorizer.fit_transform(hotel_comment)
train_data_features.shape









    Out[5]:





(4000, 20)



In [1]:

    
df = pd.DataFrame(train_data_features.toarray(),
             columns=vectorizer.get_feature_names()).head(10)
nk.dataframe2png(df, height=400)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-dc3d19e16e04> in <module>()
----> 1 df = pd.DataFrame(train_data_features.toarray(),
      2              columns=vectorizer.get_feature_names()).head(10)
      3 nk.dataframe2png(df, height=400)

NameError: name 'pd' is not defined



In [7]:

    
# Term Frequency bag of words representation
vectorizer = CountVectorizer(
    tokenizer=nk.Preprocessor().tokenize,
    preprocessor=nk.Preprocessor().clean,
    max_features=30)
train_data_features = vectorizer.fit_transform(hotel_comment)

df = pd.DataFrame(train_data_features.toarray(),
             columns=vectorizer.get_feature_names()).head(10)
nk.dataframe2png(df, height=400)









    Out[7]:



In [8]:

    
# Term Frequency bag of words representation with ngram
vectorizer = CountVectorizer(
    ngram_range=(1,3),
    tokenizer=nk.Preprocessor().tokenize,
    preprocessor=nk.Preprocessor().clean,
    max_features=30)
train_data_features = vectorizer.fit_transform(hotel_comment)

df = pd.DataFrame(train_data_features.toarray(),
             columns=vectorizer.get_feature_names()).head(10)
nk.dataframe2png(df, height=400)









    Out[8]:



In [9]:

    
# tf-idf representation with ngram
vectorizer = TfidfVectorizer(
    ngram_range=(1,3),
    tokenizer=nk.Preprocessor().tokenize,
    preprocessor=nk.Preprocessor().clean,
    max_features=15)
train_data_features = vectorizer.fit_transform(hotel_comment)

df = pd.DataFrame(train_data_features.toarray(),
             columns=vectorizer.get_feature_names()).head(10)
nk.dataframe2png(df, height=400)









    Out[9]:



In [10]:

    
# Create term-document matrix proper for feature selection
vectorizer = CountVectorizer(
    ngram_range=(1,3),
    tokenizer=nk.Preprocessor().tokenize,
    preprocessor=nk.Preprocessor().clean,
    max_features=5000)
train_data_features = vectorizer.fit_transform(hotel_comment)



In [11]:

    
# ChiSquare feature selection
feature_obj = SelectKBest(chi2, k=15).fit(
    train_data_features, 
    hotel_pol["c"].tolist())
mask = feature_obj.get_support()

# Select features by masking
new_features = train_data_features[:,mask].toarray()
new_cols = np.array(vectorizer.get_feature_names())[mask]

# Show
df = pd.DataFrame(new_features, columns=new_cols).head(10)
nk.dataframe2png(df, height=400)









    Out[11]: