In [1]:
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import model_selection as cv
from sklearn import svm
import pandas as pd
import numpy as np
import itertools
import pickle
import time
import hazm
import os
import gc
In [2]:
from sklearn import tree
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
In [3]:
import nazarkav as nk
data_path = os.path.join(nk.__path__[0], 'data')
In [4]:
# reset cache file
def reset_cache():
    pickle.dump({}, open("cache.p", "wb"))
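reset_cache() empties cache.p, forcing the next make_models() run to recompute every accuracy from scratch.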
In [5]:
hotel_pol = pd.read_csv(os.path.join(data_path, 'hotel-polarity.tsv'),
sep='\t')
hotel_comment = hotel_pol['comment'].tolist()
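A quick sanity check of the loaded frame is useful here: the TSV is assumed to carry a 'comment' text column (used above) and a 'c' polarity-label column (read later by make_models()).
In [ ]:
# illustrative check only: confirm the expected columns are present
hotel_pol.head()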
In [6]:
def binary_bow(maxf=None, ng=(1,1)):
    # binary bag-of-words: each term is marked present/absent per document
    vectorizer = CountVectorizer(
        ngram_range=ng,
        binary=True,
        tokenizer=nk.Preprocessor().tokenize,
        preprocessor=nk.Preprocessor().clean,
        max_features=maxf)
    term_doc = vectorizer.fit_transform(hotel_comment)
    c = len(vectorizer.vocabulary_)  # number of extracted features
    return term_doc, c
In [7]:
def tf_bow(maxf=None, ng=(1,1)):
    # term-frequency bag-of-words: raw counts per document
    vectorizer = CountVectorizer(
        ngram_range=ng,
        tokenizer=nk.Preprocessor().tokenize,
        preprocessor=nk.Preprocessor().clean,
        max_features=maxf)
    term_doc = vectorizer.fit_transform(hotel_comment)
    c = len(vectorizer.vocabulary_)  # number of extracted features
    return term_doc, c
In [8]:
def tfidf_bow(maxf=None, ng=(1,1)):
    # TF-IDF bag-of-words; note there is no binary=True here, since
    # binarizing the counts would defeat the TF part of the weighting
    vectorizer = TfidfVectorizer(
        ngram_range=ng,
        tokenizer=nk.Preprocessor().tokenize,
        preprocessor=nk.Preprocessor().clean,
        max_features=maxf)
    term_doc = vectorizer.fit_transform(hotel_comment)
    c = len(vectorizer.vocabulary_)  # number of extracted features
    return term_doc, c
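All three builders share the same interface, returning a sparse term-document matrix plus the feature count. A minimal usage sketch (the shape shown depends on the corpus):
In [ ]:
# illustrative only: capped binary unigram features
term_doc, c = binary_bow(maxf=1000)
print(term_doc.shape, c)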
In [9]:
ngram_range = [((1,1), 'unigram'),
               ((1,2), '1-2gram'),
               ((1,3), '1-3gram')]

def cal_num_features(ngram_range):
    # total number of distinct features for each n-gram setting,
    # with no max_features cap
    num_feat = {}
    for ng_par, ng_name in ngram_range:
        term_doc, c = binary_bow(maxf=None, ng=ng_par)
        num_feat[ng_name] = c
    return num_feat

# cal_num_features() is expensive, so call it only once per session
if 'num_feat' not in locals():
    num_feat = cal_num_features(ngram_range)
num_feat
Out[9]:
In [10]:
xstr = lambda s: '' if s is None else str(s)

def make_models(bow, ngram_range, models, max_features=[None],
                chi2_k=[None], is_cache=True):
    cache = pickle.load(open("cache.p", "rb"))
    df = pd.DataFrame(columns=['bow', 'ucf_k', 'ngram_range',
                               'models', 'chi2_k', 'accuracy',
                               'num_feature'])
    # iterate over all combinations of
    # [bow, max_features, ngram_range, models, chi2_k]
    iter_prod = itertools.product(bow, max_features, ngram_range,
                                  models, chi2_k)
    for i, el in enumerate(iter_prod):
        bow_func, bow_name = el[0]
        mf = el[1]
        ng_par, ng_name = el[2]
        model_obj, model_name = el[3]
        k = el[4]
        # selecting more features than the vectorizer keeps is pointless
        if k and mf and k > mf:
            continue
        cache_id = bow_name + str(mf) + ng_name + model_name + xstr(k)
        # None means "use all features of this n-gram setting"
        if mf is None: mf = num_feat[ng_name]
        if k is None: k = num_feat[ng_name]
        # compute the score only on a cache miss (or when caching is off)
        if cache.get(cache_id, None) is None or not is_cache:
            term_doc, c = bow_func(maxf=mf, ng=ng_par)
            labels = hotel_pol["c"].tolist()
            # if k is greater than the number of extracted features,
            # drop this combination
            if k > c:
                continue
            term_doc = SelectKBest(chi2, k=k).fit_transform(
                term_doc, labels)
            acc = cv.cross_val_score(model_obj, term_doc,
                                     labels, cv=5).mean()
            gc.collect()
            cache[cache_id] = acc
        df.loc[i] = [bow_name, mf, ng_name, model_name,
                     k, cache[cache_id], num_feat[ng_name]]
    pickle.dump(cache, open("cache.p", "wb"))
    return df
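The cache key is just the concatenation of the configuration labels, with xstr rendering None as an empty string, so a given combination can be looked up across runs. For example:
In [ ]:
# illustrative only: key for Binary BOW, max_features=5000,
# unigrams, SVM, chi2 k=1000
print('Binary' + str(5000) + 'unigram' + 'SVM' + xstr(1000))
# -> Binary5000unigramSVM1000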
In [11]:
bow = [(binary_bow,'Binary'),
(tf_bow,'TF'),
(tfidf_bow,'TF-IDF')]
max_features = [100, 500, 1000, 3000, 5000, 10000,
13000, 15000, 20000, 25000, 30000,
40000, 50000, 70000, None]
ngram_range = [((1,1), 'unigram'),
((1,2), '1-2gram'),
((1,3), '1-3gram')]
models = [(MultinomialNB(),'NaiveBayes'),
(svm.LinearSVC(),'SVM'),
(KNeighborsClassifier(n_neighbors=40),'40NN'),
(KNeighborsClassifier(n_neighbors=100),'100NN'),
(KNeighborsClassifier(n_neighbors=300),'300NN'),
          (LogisticRegression(), 'LogisticRegression'),
# (tree.DecisionTreeClassifier(),'DecisionTree'),
# (RandomForestClassifier(n_estimators = 100), 'RandomForest' )
]
chi2_k = [300, 1000, 7000, 10000, 15000, 17000,
20000, 25000, 30000, 40000, 50000, None]
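With 3 BOW variants, 15 max_features values, 3 n-gram ranges, 6 active models, and 12 chi2_k values, the raw grid holds 3 × 15 × 3 × 6 × 12 = 9,720 configurations before make_models() drops those with k > mf:
In [ ]:
# raw grid size, before the k > mf filter
len(list(itertools.product(bow, max_features, ngram_range, models, chi2_k)))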
In [12]:
start = time.time()
df = make_models(bow, ngram_range, models,
                 max_features=max_features, chi2_k=chi2_k)
elapsed = time.time() - start
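elapsed holds the wall-clock seconds for the whole grid; printing it makes the run cost visible:
In [ ]:
print('%.1f seconds' % elapsed)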
In [13]:
df.count()
Out[13]:
In [14]:
df = df.reset_index(drop=True)
pickle.dump(df, open('models_dataframe.p', 'wb'))
In [19]:
df.reindex(np.random.permutation(df.index))
Out[19]:
In [20]:
sdf = df.reindex(np.random.permutation(df.index))
with open('modeling_table.tex', 'w') as f:
f.write(sdf.head().to_latex())