In [4]:
import time

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import ttest_ind

np.warnings.filterwarnings('ignore', category=DeprecationWarning)
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# set Jupyter to display ALL output from a cell (not just last output)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

# set pandas and numpy options to make print format nicer
pd.set_option("display.width",100)
pd.set_option("display.max_columns",100)
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 500)
np.set_printoptions(linewidth=120, threshold=5000, edgeitems=50, suppress=True)

seed = 42

Read the movie-review data (data.tsv) and create train/test csv files


In [5]:
np.random.seed(seed=seed)

print('reading: data.tsv into revs dataframe...')
revs = pd.read_csv('data.tsv', sep=' ', quotechar='"', escapechar='\\')
print('revs dataframe:', revs.shape)

splits = pd.read_csv('splits.csv', sep='\t',
                     dtype={'split_1': int, 'split_2': int, 'split_3': int})
print('ids dataframe:', splits.shape)

# Keep (new_id, sentiment) pairs so held-out labels can be recovered later.
labels = revs[['new_id', 'sentiment']]

# For each split column: rows whose new_id appears in the column form the
# test set (label column dropped); everything else is the training set.
trains = []
tests = []
for i, col in enumerate(splits.columns):
    in_test = revs.new_id.isin(splits[col])
    trains.append(revs.loc[~in_test, :])
    tests.append(revs.loc[in_test, revs.columns != 'sentiment'])
    print('Split', i + 1, trains[i].shape, tests[i].shape)

print('Writing train, test, labels csv files...')
fold = 0
_ = trains[fold].to_csv('train.csv', index=False)
_ = tests[fold].to_csv('test.csv', index=False)
print('Files Saved')


reading: data.tsv into revs dataframe...
revs dataframe: (50000, 3)
ids dataframe: (25000, 3)
Split 1 (25000, 3) (25000, 2)
Split 2 (25000, 3) (25000, 2)
Split 3 (25000, 3) (25000, 2)
Writing train, test, labels csv files...
Files Saved

In [3]:
def prep_train_test(train, test):
    """Turn review text into dense TF-IDF feature matrices.

    Parameters
    ----------
    train, test : pd.DataFrame
        Must each contain a 'review' column; `train` must also contain
        'sentiment'.

    Returns
    -------
    X_train : np.ndarray  dense TF-IDF matrix for the training reviews
    y_train : pd.Series   training sentiment labels
    X_test  : np.ndarray  dense TF-IDF matrix for the test reviews
    vocab   : np.ndarray  feature names aligned with the matrix columns

    Notes
    -----
    Unlike the original version, the input frames are NOT mutated, so
    callers no longer need to pass `.copy()` (doing so remains harmless).
    """
    # Strip the HTML line breaks embedded in the raw reviews.
    train_reviews = train.review.str.replace('<br /><br />', ' ')
    test_reviews = test.review.str.replace('<br /><br />', ' ')

    # Small hand-picked stop list; intentionally shadows the (unused)
    # `stop_words` module imported at the top of the notebook.
    stop_words = ['the', 'with', 'he', 'she', 'also', 'made', 'had', 'out',
                  'in', 'his', 'hers', 'there', 'was', 'then']

    cv = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 2),
                         min_df=20, max_df=0.3)
    # .toarray() densifies the sparse matrix — required by ttest_ind later,
    # but memory-hungry for large corpora.
    X_train = cv.fit_transform(train_reviews).toarray()
    X_test = cv.transform(test_reviews).toarray()

    y_train = train.sentiment
    # get_feature_names() is deprecated in newer scikit-learn
    # (use get_feature_names_out() once the environment is upgraded).
    vocab = np.array(cv.get_feature_names())
    return X_train, y_train, X_test, vocab

Create vocab.txt and word_weights.csv files from t-test


In [5]:
%%time
split = 0
X_train, y_train, X_test, vocab = prep_train_test(revs.copy(), revs.copy())

t_test = ttest_ind(X_train[y_train==1, :], X_train[y_train==0, :])

voc_df = pd.DataFrame({'tstat': t_test.statistic, 'word': vocab})
voc_df['magn_tstat'] = voc_df.tstat.abs()
voc_df = voc_df.sort_values('magn_tstat',ascending=False)

voc_df = voc_df.head(2900)
voc_df['weight'] = np.power((voc_df.magn_tstat - voc_df.magn_tstat.min()), 1.2)
voc_df['weight'] = (voc_df['weight'] / voc_df.weight.max() * 21 * np.sign(voc_df.tstat)).round(4)

voc_df[['word','weight']].to_csv('word_weights.csv', index=False)
np.savetxt('vocab.txt',voc_df.word.values, fmt='%s')


CPU times: user 1min 25s, sys: 55.6 s, total: 2min 20s
Wall time: 2min 35s

Run models using vocab files


In [6]:
# Evaluate the L2 logistic-regression model on each of the three splits,
# restricting features to the slim vocabulary saved in vocab.txt.
# dtype=str replaces dtype=np.str: the np.str alias was deprecated in
# NumPy 1.20 and removed in 1.24.
vocab_slim = np.loadtxt('vocab.txt', dtype=str, delimiter='\n')

num_folds = len(splits.columns)
for fold in range(num_folds):
    start_time = time.time()
    X_train, y_train, X_test, vocab = prep_train_test(trains[fold].copy(), tests[fold].copy())
    # Recover the held-out labels by joining on new_id.
    y_test = pd.merge(tests[fold][['new_id']], labels, how='left', on='new_id')

    # Keep only the columns whose feature name is in the slim vocabulary.
    indices = np.where(np.in1d(vocab, vocab_slim))[0]
    X_train = X_train[:, indices].copy()
    X_test = X_test[:, indices].copy()

    model = LogisticRegression(penalty='l2', C=17, random_state=seed)
    _ = model.fit(X_train, y_train)
    probs = model.predict_proba(X_test)[:, 1]
    print('L2, Split:{}, AUC:{:<7.5}, Vocab:{}, RunTime:{:6.2f} secs'.format(
        fold, round(roc_auc_score(y_test.sentiment, probs), 5), X_train.shape[1], round(time.time()-start_time, 2)))
    df = pd.DataFrame({'new_id': tests[fold].new_id, 'prob': probs.round(5)})
    df.to_csv('Result_'+str(fold+1)+'.txt', index=False)
    print('Created Result_'+str(fold+1)+'.txt, rows=', df.shape[0])


L2, Split:0, AUC:0.96685, Vocab:2889, RunTime: 30.57 secs
Created Result_1.txt, rows= 25000
L2, Split:1, AUC:0.9658 , Vocab:2890, RunTime: 30.01 secs
Created Result_2.txt, rows= 25000
L2, Split:2, AUC:0.96612, Vocab:2888, RunTime: 30.50 secs
Created Result_3.txt, rows= 25000

Check Submission File (generated from mymain.py)


In [15]:
# Score each saved prediction file (mymain.py's submission plus the three
# per-fold results) against the true labels.
filenames = ['mysubmission.txt', 'Result_1.txt', 'Result_2.txt', 'Result_3.txt']
for fname in filenames:
    preds = pd.read_csv(fname)
    truth = pd.merge(preds[['new_id']], labels, how='left', on='new_id')
    print('model AUC:', round(roc_auc_score(truth.sentiment, preds.prob), 5))


model AUC: 0.96612
model AUC: 0.96685
model AUC: 0.9658
model AUC: 0.96612

Tune vocab size


In [19]:
# Grid over candidate vocabulary sizes: rebuild the slim vocabulary on the
# full data set for each size, then re-evaluate the L2 model on all three
# splits. (Unused `split = 0` and the dead commented-out print were removed;
# dtype=np.str -> dtype=str since the alias was removed in NumPy 1.24.)
for vocab_size in [1200, 2900, 2999]:
    print('Vocab:', vocab_size)
    X_train, y_train, X_test, vocab = prep_train_test(revs.copy(), revs.copy())

    # Rank features by |t-stat| of positive-vs-negative reviews.
    t_test = ttest_ind(X_train[y_train == 1, :], X_train[y_train == 0, :])

    voc_df = pd.DataFrame({'tstat': t_test.statistic, 'word': vocab})
    voc_df['magn_tstat'] = voc_df.tstat.abs()
    voc_df = voc_df.sort_values('magn_tstat', ascending=False)

    voc_df = voc_df.head(vocab_size)
    # Signed weight: magnitude rescaled to [0, 21] with a 1.2 power curve.
    voc_df['weight'] = np.power((voc_df.magn_tstat - voc_df.magn_tstat.min()), 1.2)
    voc_df['weight'] = (voc_df['weight'] / voc_df.weight.max() * 21 * np.sign(voc_df.tstat)).round(4)

    voc_df[['word', 'weight']].to_csv('word_weights.csv', index=False)
    np.savetxt('vocab.txt', voc_df.word.values, fmt='%s')
    vocab_slim = np.loadtxt('vocab.txt', dtype=str, delimiter='\n')

    num_folds = len(splits.columns)
    for fold in range(num_folds):
        start_time = time.time()
        X_train, y_train, X_test, vocab = prep_train_test(trains[fold].copy(), tests[fold].copy())
        y_test = pd.merge(tests[fold][['new_id']], labels, how='left', on='new_id')

        # Restrict the feature matrices to the slim vocabulary.
        indices = np.where(np.in1d(vocab, vocab_slim))[0]
        X_train = X_train[:, indices].copy()
        X_test = X_test[:, indices].copy()

        model = LogisticRegression(penalty='l2', C=17, random_state=seed)
        _ = model.fit(X_train, y_train)
        probs = model.predict_proba(X_test)[:, 1]
        print('L2, Split:{}, AUC:{:<7.5}, Vocab:{}, RunTime:{:6.2f} secs'.format(
            fold, round(roc_auc_score(y_test.sentiment, probs), 5), X_train.shape[1], round(time.time()-start_time, 2)))
        df = pd.DataFrame({'new_id': tests[fold].new_id, 'prob': probs.round(5)})
        df.to_csv('Result_'+str(fold+1)+'.txt', index=False)


Vocab: 1200
L2, Split:0, AUC:0.96163, Vocab:1199, RunTime: 27.54 secs
L2, Split:1, AUC:0.96014, Vocab:1200, RunTime: 27.24 secs
L2, Split:2, AUC:0.96069, Vocab:1200, RunTime: 26.71 secs
Vocab: 2900
L2, Split:0, AUC:0.96685, Vocab:2889, RunTime: 31.68 secs
L2, Split:1, AUC:0.9658 , Vocab:2890, RunTime: 29.71 secs
L2, Split:2, AUC:0.96612, Vocab:2888, RunTime: 29.53 secs
Vocab: 2999
L2, Split:0, AUC:0.96677, Vocab:2987, RunTime: 31.15 secs
L2, Split:1, AUC:0.96587, Vocab:2987, RunTime: 29.50 secs
L2, Split:2, AUC:0.96616, Vocab:2985, RunTime: 29.72 secs