In [1]:
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import pandas as pd
import numpy as np
import hazm
import os

In [2]:
import nazarkav as nk
data_path = os.path.join(nk.__path__[0], 'data')

In [3]:
hotel_pol = pd.read_csv(os.path.join(data_path, 'hotel-polarity.tsv'), 
                        sep='\t')

In [4]:
hotel_comment = hotel_pol['comment'].tolist()

In [5]:
# Binary bag of words representation
vectorizer = CountVectorizer(
    binary=True,
    tokenizer=nk.Preprocessor().tokenize,
    preprocessor=nk.Preprocessor().clean,
    max_features=20)
train_data_features = vectorizer.fit_transform(hotel_comment)
train_data_features.shape


Out[5]:
(4000, 20)

In [1]:
df = pd.DataFrame(train_data_features.toarray(),
             columns=vectorizer.get_feature_names()).head(10)
nk.dataframe2png(df, height=400)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-dc3d19e16e04> in <module>()
----> 1 df = pd.DataFrame(train_data_features.toarray(),
      2              columns=vectorizer.get_feature_names()).head(10)
      3 nk.dataframe2png(df, height=400)

NameError: name 'pd' is not defined

In [7]:
# Term Frequency bag of words representation
vectorizer = CountVectorizer(
    tokenizer=nk.Preprocessor().tokenize,
    preprocessor=nk.Preprocessor().clean,
    max_features=30)
train_data_features = vectorizer.fit_transform(hotel_comment)

df = pd.DataFrame(train_data_features.toarray(),
             columns=vectorizer.get_feature_names()).head(10)
nk.dataframe2png(df, height=400)


Out[7]:

In [8]:
# Term Frequency bag of words representation with ngram
vectorizer = CountVectorizer(
    ngram_range=(1,3),
    tokenizer=nk.Preprocessor().tokenize,
    preprocessor=nk.Preprocessor().clean,
    max_features=30)
train_data_features = vectorizer.fit_transform(hotel_comment)

df = pd.DataFrame(train_data_features.toarray(),
             columns=vectorizer.get_feature_names()).head(10)
nk.dataframe2png(df, height=400)


Out[8]:

In [9]:
# tf-idf representation with ngram
vectorizer = TfidfVectorizer(
    ngram_range=(1,3),
    tokenizer=nk.Preprocessor().tokenize,
    preprocessor=nk.Preprocessor().clean,
    max_features=15)
train_data_features = vectorizer.fit_transform(hotel_comment)

df = pd.DataFrame(train_data_features.toarray(),
             columns=vectorizer.get_feature_names()).head(10)
nk.dataframe2png(df, height=400)


Out[9]:

In [10]:
# Create term-document matrix proper for feature selection
vectorizer = CountVectorizer(
    ngram_range=(1,3),
    tokenizer=nk.Preprocessor().tokenize,
    preprocessor=nk.Preprocessor().clean,
    max_features=5000)
train_data_features = vectorizer.fit_transform(hotel_comment)

In [11]:
# ChiSquare feature selection
feature_obj = SelectKBest(chi2, k=15).fit(
    train_data_features, 
    hotel_pol["c"].tolist())
mask = feature_obj.get_support()

# Select features by masking
new_features = train_data_features[:,mask].toarray()
new_cols = np.array(vectorizer.get_feature_names())[mask]

# Show
df = pd.DataFrame(new_features, columns=new_cols).head(10)
nk.dataframe2png(df, height=400)


Out[11]: