In [1]:
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import model_selection as cv  # sklearn.cross_validation was removed in 0.20
from sklearn import svm
import pandas as pd
import numpy as np
import itertools
import pickle
import time
import hazm
import os
import gc

In [2]:
from sklearn import tree
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [3]:
import nazarkav as nk
data_path = os.path.join(nk.__path__[0], 'data')

In [4]:
# reset the cache file to an empty dict
def reset_cache():
    with open('cache.p', 'wb') as f:
        pickle.dump({}, f)
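
make_models() below reads cache.p unconditionally, so a fresh checkout without that file would raise FileNotFoundError. A minimal defensive loader, as a sketch (load_cache is a hypothetical helper, not part of the original notebook):

def load_cache(path='cache.p'):
    # fall back to an empty cache when the file does not exist yet
    if not os.path.exists(path):
        return {}
    with open(path, 'rb') as f:
        return pickle.load(f)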

In [5]:
hotel_pol = pd.read_csv(os.path.join(data_path, 'hotel-polarity.tsv'), 
                        sep='\t')
hotel_comment = hotel_pol['comment'].tolist()
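
As a quick sanity check on the data, assuming the TSV carries the polarity label in a column named 'c', as the modeling code below does:

hotel_pol['c'].value_counts()  # class balance of the polarity labels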

In [6]:
def binary_bow(maxf=None, ng=(1, 1)):
    pre = nk.Preprocessor()
    vectorizer = CountVectorizer(
        ngram_range=ng,
        binary=True,  # record term presence/absence, not counts
        tokenizer=pre.tokenize,
        preprocessor=pre.clean,
        max_features=maxf)
    term_doc = vectorizer.fit_transform(hotel_comment)
    c = len(vectorizer.get_feature_names())
    return term_doc, c
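
As a quick illustration of what binary=True changes, a minimal sketch on a toy English corpus (the toy strings are invented for the example):

from sklearn.feature_extraction.text import CountVectorizer

toy = ['good room good staff', 'bad room']
counts = CountVectorizer().fit_transform(toy).toarray()
binary = CountVectorizer(binary=True).fit_transform(toy).toarray()
# vocabulary sorts to ['bad', 'good', 'room', 'staff']
# counts[0] is [0, 2, 1, 1]: 'good' appears twice
# binary[0] is [0, 1, 1, 1]: presence only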

In [7]:
def tf_bow(maxf=None, ng=(1, 1)):
    pre = nk.Preprocessor()
    vectorizer = CountVectorizer(
        ngram_range=ng,
        tokenizer=pre.tokenize,
        preprocessor=pre.clean,
        max_features=maxf)
    term_doc = vectorizer.fit_transform(hotel_comment)
    c = len(vectorizer.get_feature_names())
    return term_doc, c

In [8]:
def tfidf_bow(maxf=None, ng=(1, 1)):
    pre = nk.Preprocessor()
    vectorizer = TfidfVectorizer(
        ngram_range=ng,
        binary=True,  # binary term presence, re-weighted by idf
        tokenizer=pre.tokenize,
        preprocessor=pre.clean,
        max_features=maxf)
    term_doc = vectorizer.fit_transform(hotel_comment)
    c = len(vectorizer.get_feature_names())
    return term_doc, c
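
Note that binary=True does not turn TfidfVectorizer into a plain binary model: term frequencies are clipped to 1 first, and idf weighting plus normalization still apply. A minimal sketch (same toy corpus as above):

from sklearn.feature_extraction.text import TfidfVectorizer

toy = ['good room good staff', 'bad room']
plain = TfidfVectorizer().fit_transform(toy).toarray()
clipped = TfidfVectorizer(binary=True).fit_transform(toy).toarray()
# in `clipped` the repeated 'good' no longer dominates row 0;
# the rows still differ from a pure binary BOW because idf is applied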

In [9]:
ngram_range = [((1,1), 'unigram'),
               ((1,2), '1-2gram'),
               ((1,3), '1-3gram')]
def cal_num_features(ngram_range):
    num_feat = {}
    for ng_par, ng_name in ngram_range:
        term_doc, c = binary_bow(maxf=None, ng=ng_par)
        num_feat[ng_name] = c
    return num_feat

# building the full vocabularies is expensive, so only do it
# once per session
if 'num_feat' not in locals():
    num_feat = cal_num_features(ngram_range)
num_feat


Out[9]:
{'1-2gram': 223688, '1-3gram': 599287, 'unigram': 25098}

In [10]:
xstr = lambda s: '' if s is None else str(s)

def make_models(bow, ngram_range, models, max_features=[None],
                chi2_k=[None], is_cache=True):
    with open('cache.p', 'rb') as f:
        cache = pickle.load(f)
    df = pd.DataFrame(columns=['bow', 'ucf_k', 'ngram_range',
                               'models', 'chi2_k', 'accuracy',
                               'num_feature'])
    # iterate over every combination of
    # [bow, max_features, ngram_range, models, chi2_k]
    iter_prod = itertools.product(bow, max_features, ngram_range,
                                  models, chi2_k)
    for i, el in enumerate(iter_prod):
        bow_func, bow_name = el[0]
        mf = el[1]
        ng_par, ng_name = el[2]
        model_obj, model_name = el[3]
        k = el[4]
        # selecting more features than the vectorizer keeps is meaningless
        if k and mf and k > mf:
            continue
        cache_id = bow_name + str(mf) + ng_name + model_name + xstr(k)

        if cache.get(cache_id) is None or not is_cache:
            term_doc, c = bow_func(maxf=mf, ng=ng_par)
            labels = hotel_pol['c'].tolist()
            if k is not None:
                # if k is greater than the number of extracted
                # features, drop this combination
                if k > c:
                    continue
                term_doc = SelectKBest(chi2, k=k).fit_transform(
                    term_doc, labels)
            acc = cv.cross_val_score(model_obj,
                                     term_doc,
                                     labels,
                                     cv=5).mean()
            gc.collect()
            cache[cache_id] = acc

        # record None as "all features" so the table stays numeric
        if mf is None: mf = num_feat[ng_name]
        if k is None: k = num_feat[ng_name]

        df.loc[i] = [bow_name, mf, ng_name, model_name,
                     k, cache[cache_id], num_feat[ng_name]]
    with open('cache.p', 'wb') as f:
        pickle.dump(cache, f)
    return df
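
make_models() walks the Cartesian product of the five option lists; a minimal sketch of how itertools.product emits the 5-tuples (toy values, not the real grid):

import itertools

grid = itertools.product(['Binary', 'TF'],  # bow
                         [100, None],       # max_features
                         ['unigram'],       # ngram_range
                         ['SVM'],           # models
                         [300])             # chi2_k
for combo in grid:
    print(combo)
# ('Binary', 100, 'unigram', 'SVM', 300)
# ('Binary', None, 'unigram', 'SVM', 300)
# ('TF', 100, 'unigram', 'SVM', 300)
# ('TF', None, 'unigram', 'SVM', 300)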

In [11]:
bow = [(binary_bow,'Binary'),
       (tf_bow,'TF'),
       (tfidf_bow,'TF-IDF')]

max_features = [100, 500, 1000, 3000, 5000, 10000, 
                13000, 15000, 20000, 25000, 30000, 
                40000, 50000, 70000, None]

ngram_range = [((1,1), 'unigram'),
               ((1,2), '1-2gram'),
               ((1,3), '1-3gram')]

models = [(MultinomialNB(), 'NaiveBayes'),
          (svm.LinearSVC(), 'SVM'),
          (KNeighborsClassifier(n_neighbors=40), '40NN'),
          (KNeighborsClassifier(n_neighbors=100), '100NN'),
          (KNeighborsClassifier(n_neighbors=300), '300NN'),
          (LogisticRegression(), 'LogisticRegression'),
          # (tree.DecisionTreeClassifier(), 'DecisionTree'),
          # (RandomForestClassifier(n_estimators=100), 'RandomForest'),
         ]

chi2_k = [300, 1000, 7000, 10000, 15000, 17000, 
          20000, 25000, 30000, 40000, 50000, None]
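
With the lists above, the full grid has 3 bag-of-words schemes × 15 max_features values × 3 n-gram ranges × 6 models × 12 chi2_k values = 9720 combinations; the k > mf filter inside make_models() (plus cache coverage) cuts this down to the 3618 rows counted below. A quick sanity check:

full_grid = (len(bow) * len(max_features) * len(ngram_range)
             * len(models) * len(chi2_k))
print(full_grid)  # 9720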

In [12]:
start = time.time()
df = make_models(bow, ngram_range, models,
                 max_features=max_features, chi2_k=chi2_k)
elapsed = time.time() - start

In [13]:
df.count()


Out[13]:
bow            3618
ucf_k          3618
ngram_range    3618
models         3618
chi2_k         3618
accuracy       3618
num_feature    3618
dtype: int64

In [14]:
df = df.reset_index(drop=True)
with open('models_dataframe.p', 'wb') as f:
    pickle.dump(df, f)
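
To reuse the results in a later session without re-running the sweep (assuming the pickle above exists):

with open('models_dataframe.p', 'rb') as f:
    df = pickle.load(f)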

In [19]:
df.reindex(np.random.permutation(df.index))


Out[19]:
bow ucf_k ngram_range models chi2_k accuracy num_feature
1253 TF 1000 1-2gram LogisticRegression 223688 0.83400 223688
3106 TF-IDF 40000 1-3gram 40NN 20000 0.63450 599287
156 Binary 20000 unigram SVM 15000 0.84600 25098
3593 TF-IDF 599287 1-3gram 100NN 599287 0.78775 599287
1467 TF 20000 1-3gram 40NN 17000 0.71475 599287
1127 Binary 223688 1-2gram LogisticRegression 17000 0.88650 223688
3561 TF-IDF 599287 1-3gram SVM 10000 0.91225 599287
3014 TF-IDF 40000 unigram 100NN 7000 0.50325 25098
2294 TF 223688 1-2gram 40NN 7000 0.74800 223688
3032 TF-IDF 40000 unigram LogisticRegression 15000 0.87475 25098
472 Binary 30000 unigram LogisticRegression 300 0.85975 25098
2554 TF-IDF 15000 1-3gram 300NN 599287 0.85275 599287
1836 TF 40000 1-2gram NaiveBayes 20000 0.92150 223688
2885 TF-IDF 30000 unigram LogisticRegression 1000 0.87350 25098
2674 TF-IDF 20000 1-3gram 40NN 20000 0.82350 599287
2981 TF-IDF 30000 1-3gram LogisticRegression 1000 0.88825 599287
1242 TF 1000 unigram NaiveBayes 25098 0.85275 25098
3582 TF-IDF 599287 1-3gram 100NN 300 0.66525 599287
2010 TF 50000 1-2gram 300NN 15000 0.68675 223688
1391 TF 20000 unigram LogisticRegression 1000 0.86300 25098
1375 TF 20000 unigram 100NN 1000 0.64900 25098
2085 TF 70000 unigram SVM 25098 0.83375 25098
401 Binary 25000 1-3gram 40NN 1000 0.79800 599287
2133 TF 70000 1-2gram SVM 223688 0.86250 223688
2515 TF-IDF 10000 1-3gram SVM 599287 0.87500 599287
2164 TF 70000 1-2gram LogisticRegression 20000 0.88475 223688
3355 TF-IDF 70000 1-2gram 100NN 223688 0.81225 223688
3067 TF-IDF 40000 1-2gram 100NN 223688 0.82850 223688
1188 Binary 599287 1-3gram 300NN 20000 0.76625 599287
3301 TF-IDF 70000 unigram 100NN 1000 0.79950 25098
... ... ... ... ... ... ... ...
3368 TF-IDF 70000 1-2gram LogisticRegression 15000 0.90250 223688
2917 TF-IDF 30000 1-2gram 100NN 1000 0.82000 223688
2763 TF-IDF 25000 1-2gram SVM 223688 0.88075 223688
2032 TF 50000 1-3gram SVM 7000 0.86500 599287
3240 TF-IDF 50000 1-3gram SVM 15000 0.92600 599287
2606 TF-IDF 20000 1-2gram NaiveBayes 7000 0.90975 223688
3378 TF-IDF 70000 1-3gram NaiveBayes 20000 0.92750 599287
1290 TF 5000 1-3gram NaiveBayes 599287 0.87075 599287
2007 TF 50000 1-2gram 300NN 1000 0.75550 223688
1203 Binary 599287 1-3gram LogisticRegression 40000 0.89625 599287
1053 Binary 25098 unigram LogisticRegression 300 0.85975 25098
180 Binary 20000 unigram 300NN 15000 0.51275 25098
2012 TF 50000 1-2gram 300NN 20000 0.67000 223688
1434 TF 20000 1-2gram 300NN 15000 0.65675 223688
1489 TF 20000 1-3gram LogisticRegression 10000 0.87950 599287
359 Binary 25000 1-2gram 40NN 223688 0.69425 223688
499 Binary 30000 1-2gram 40NN 10000 0.73675 223688
3611 TF-IDF 599287 1-3gram LogisticRegression 17000 0.89350 599287
1817 TF 40000 unigram 300NN 10000 0.54225 25098
2959 TF-IDF 30000 1-3gram 40NN 10000 0.64500 599287
2641 TF-IDF 20000 1-2gram 300NN 17000 0.65100 223688
3033 TF-IDF 40000 unigram LogisticRegression 17000 0.87475 25098
3581 TF-IDF 599287 1-3gram 40NN 599287 0.79350 599287
3605 TF-IDF 599287 1-3gram 300NN 599287 0.77900 599287
1809 TF 40000 unigram 100NN 10000 0.57475 25098
854 Binary 50000 1-3gram 300NN 20000 0.75125 599287
2260 TF 25098 unigram LogisticRegression 1000 0.86300 25098
2241 TF 25098 unigram 100NN 300 0.71650 25098
1211 TF 100 unigram LogisticRegression 25098 0.81400 25098
1891 TF 40000 1-3gram SVM 17000 0.87425 599287

3618 rows × 7 columns


In [20]:
sdf = df.reindex(np.random.permutation(df.index))
with open('modeling_table.tex', 'w') as f:
    f.write(sdf.head().to_latex())
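
Since head() of a permuted frame is effectively a random 5-row sample, the exported LaTeX table changes on every run; seeding the permutation (a hypothetical tweak, not in the original notebook) makes it reproducible:

rng = np.random.RandomState(0)  # fixed seed, chosen arbitrarily
sdf = df.reindex(rng.permutation(df.index))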
