In [1]:
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import pandas as pd
import numpy as np
import hazm
import os
In [2]:
import nazarkav as nk
# Locate the data directory that ships inside the installed nazarkav package.
data_path = os.path.join(
    nk.__path__[0],
    'data',
)
In [3]:
# Read the tab-separated hotel polarity file into a DataFrame.
hotel_pol = pd.read_csv(
    os.path.join(data_path, 'hotel-polarity.tsv'),
    sep='\t',
)
In [4]:
# Plain Python list of the 'comment' column, used as the vectorizers' input.
hotel_comment = hotel_pol.loc[:, 'comment'].tolist()
In [5]:
# Binary bag of words: an entry is 1 when the term occurs in the comment at
# all (non-zero counts are clamped to 1); vocabulary capped at 20 terms.
vectorizer = CountVectorizer(
    max_features=20,
    binary=True,
    tokenizer=nk.Preprocessor().tokenize,
    preprocessor=nk.Preprocessor().clean,
)
train_data_features = vectorizer.fit_transform(hotel_comment)
# Displayed as the cell result: shape of the document-term matrix.
train_data_features.shape
Out[5]:
In [6]:
# Render the first ten rows of the current document-term matrix as a table.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())
# Slice the sparse matrix first so only the ten displayed rows are densified.
df = pd.DataFrame(train_data_features[:10].toarray(), columns=feature_names)
nk.dataframe2png(df, height=400)
In [7]:
# Term Frequency bag of words representation: raw occurrence counts,
# vocabulary capped at 30 terms.
vectorizer = CountVectorizer(
    tokenizer=nk.Preprocessor().tokenize,
    preprocessor=nk.Preprocessor().clean,
    max_features=30)
train_data_features = vectorizer.fit_transform(hotel_comment)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())
# Slice the sparse matrix first so only the ten displayed rows are densified.
df = pd.DataFrame(train_data_features[:10].toarray(), columns=feature_names)
nk.dataframe2png(df, height=400)
Out[7]:
In [8]:
# Term Frequency bag of words representation with ngram: counts of
# unigrams, bigrams and trigrams, vocabulary capped at 30 entries.
vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    tokenizer=nk.Preprocessor().tokenize,
    preprocessor=nk.Preprocessor().clean,
    max_features=30)
train_data_features = vectorizer.fit_transform(hotel_comment)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())
# Slice the sparse matrix first so only the ten displayed rows are densified.
df = pd.DataFrame(train_data_features[:10].toarray(), columns=feature_names)
nk.dataframe2png(df, height=400)
Out[8]:
In [9]:
# tf-idf representation with ngram: tf-idf weights over unigrams, bigrams
# and trigrams, vocabulary capped at 15 entries.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    tokenizer=nk.Preprocessor().tokenize,
    preprocessor=nk.Preprocessor().clean,
    max_features=15)
train_data_features = vectorizer.fit_transform(hotel_comment)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())
# Slice the sparse matrix first so only the ten displayed rows are densified.
df = pd.DataFrame(train_data_features[:10].toarray(), columns=feature_names)
nk.dataframe2png(df, height=400)
Out[9]:
In [10]:
# Build the larger count matrix (1- to 3-grams, up to 5000 vocabulary
# entries) that the feature-selection step works on.
vectorizer = CountVectorizer(
    max_features=5000,
    ngram_range=(1, 3),
    tokenizer=nk.Preprocessor().tokenize,
    preprocessor=nk.Preprocessor().clean,
)
train_data_features = vectorizer.fit_transform(hotel_comment)
In [11]:
# ChiSquare feature selection: keep the 15 columns with the highest chi2
# score against the labels.
# NOTE(review): "c" is presumably the polarity/class column of the TSV --
# confirm against the file header.
feature_obj = SelectKBest(chi2, k=15).fit(
    train_data_features,
    hotel_pol["c"].tolist())
# Boolean mask over the vocabulary marking the selected columns.
mask = feature_obj.get_support()
# Select features by masking
new_features = train_data_features[:, mask].toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())
new_cols = np.asarray(feature_names)[mask]
# Show
df = pd.DataFrame(new_features, columns=new_cols).head(10)
nk.dataframe2png(df, height=400)
Out[11]: