In [4]:
    
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import ttest_ind
# Silence DeprecationWarning noise from the libraries imported in this cell.
# `np.warnings` was only an accidental re-export of the stdlib module and is no
# longer available in newer NumPy releases, so use the stdlib module directly.
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
# scikit-learn 0.22 made this module private (`_stop_words`) and 0.24 removed
# the public alias; fall back so the import cell still runs on current releases
# while the name `stop_words` stays bound either way.
try:
    from sklearn.feature_extraction import stop_words
except ImportError:
    from sklearn.feature_extraction import _stop_words as stop_words
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
# Make Jupyter display the value of EVERY top-level expression in a cell, not
# just the last one. This is presumably why later cells assign unwanted return
# values to `_` (e.g. `_ = model.fit(...)`): it keeps them out of the output.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 
# Widen pandas / numpy console output so long review text and wide arrays are
# printed without truncation, then fix the random seed shared by later cells.
for option_name, option_value in (
    ("display.width", 100),
    ("display.max_columns", 100),
    ("display.max_colwidth", 1000),
    ("display.max_rows", 500),
):
    pd.set_option(option_name, option_value)

np.set_printoptions(
    edgeitems=50,
    threshold=5000,
    linewidth=120,
    suppress=True,
)

seed = 42
    
In [5]:
    
# Load the labelled reviews and the per-split test ids, build one train/test
# pair per split column, and write the first split to train.csv / test.csv.
np.random.seed(seed=seed)
print('reading: data.tsv into revs dataframe...')
# The .tsv extension and the free-text `review` column call for a tab
# separator (as already used for splits.csv below); splitting on single spaces
# would break every review apart word by word.
revs = pd.read_csv('data.tsv', sep='\t', quotechar='"', escapechar='\\')
# Fail early, with a readable message, if the columns used by later cells are
# missing (e.g. because the file was parsed with the wrong separator).
required_cols = {'new_id', 'sentiment', 'review'}
missing_cols = required_cols - set(revs.columns)
assert not missing_cols, 'data.tsv is missing expected columns: {}'.format(sorted(missing_cols))
print('revs dataframe:', revs.shape)
splits = pd.read_csv('splits.csv',sep='\t', dtype={'split_1':int,'split_2':int, 'split_3':int,})
print('ids dataframe:', splits.shape)
trains = []
tests = []
# Ground-truth lookup (id -> sentiment) used later to score predictions.
labels = revs[['new_id','sentiment']]
for i, col in enumerate(splits.columns):
    # Each column of `splits` lists the ids held out as the test set of that
    # split; every other review goes to the matching training set.
    in_test = revs.new_id.isin(splits[col])
    trains.append(revs.loc[~in_test, :])
    # The test frame deliberately omits the `sentiment` label column.
    tests.append(revs.loc[in_test, revs.columns != 'sentiment'])
    print('Split', i+1, trains[i].shape, tests[i].shape)
print('Writing train, test, labels csv files...')
fold=0
# `_ =` keeps the call's return value out of the cell output.
_ = trains[fold].to_csv('train.csv', index=False)
_ = tests [fold].to_csv('test.csv',  index=False)
print('Files Saved')
    
    
In [3]:
    
def prep_train_test(train, test):
    """Turn review text into dense TF-IDF feature matrices.

    The vectorizer (unigrams and bigrams, terms kept only if they occur in at
    least 20 documents and in at most 30% of them, with a small custom
    stop-word list) is fitted on `train` only and then applied to `test`.

    Parameters
    ----------
    train : pandas.DataFrame
        Must provide a string `review` column and a `sentiment` column.
    test : pandas.DataFrame
        Must provide a string `review` column.

    Returns
    -------
    X_train : numpy.ndarray
        Dense TF-IDF matrix for `train` (rows follow `train` order).
    y_train : pandas.Series
        `train.sentiment`, unchanged.
    X_test : numpy.ndarray
        Dense TF-IDF matrix for `test`, same columns as `X_train`.
    vocab : numpy.ndarray of str
        Term for each feature column.

    Notes
    -----
    The inputs are not modified: the cleaned text is held in local Series
    instead of being written back into the caller's frames.
    """
    # Literal (non-regex) replacement of the doubled HTML line break.
    train_text = train.review.str.replace('<br /><br />', ' ', regex=False)
    test_text = test.review.str.replace('<br /><br />', ' ', regex=False)

    # Named differently from the module-level `stop_words` import to avoid
    # shadowing it inside this function.
    custom_stop_words = ['the','with','he','she','also','made','had','out','in','his','hers','there','was','then']
    cv = TfidfVectorizer(stop_words=custom_stop_words, ngram_range=(1,2), min_df=20, max_df=0.3)
    X_train = cv.fit_transform(train_text).toarray()
    X_test = cv.transform(test_text).toarray()

    y_train = train.sentiment

    # `get_feature_names` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for releases that predate it.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()
    # Force a unicode-string array so the dtype matches on both code paths.
    vocab = np.asarray(feature_names, dtype=str)
    return X_train, y_train, X_test, vocab
    
In [5]:
    
%%time
# Rank every TF-IDF term by how strongly it separates positive from negative
# reviews (two-sample t-test over the full data set), keep the 2900 strongest
# terms, and save them with a signed weight.
split = 0
X_train, y_train, X_test, vocab = prep_train_test(revs.copy(), revs.copy())

is_positive = (y_train == 1).values
is_negative = (y_train == 0).values
t_test = ttest_ind(X_train[is_positive, :], X_train[is_negative, :])

voc_df = (
    pd.DataFrame({'tstat': t_test.statistic, 'word': vocab})
    .assign(magn_tstat=lambda frame: frame.tstat.abs())
    .sort_values('magn_tstat', ascending=False)
    .head(2900)
)

# Weight = (|t| above the weakest kept term) ** 1.2, rescaled so the strongest
# term has magnitude 21, carrying the sign of the t-statistic.
raw_weight = np.power((voc_df.magn_tstat - voc_df.magn_tstat.min()), 1.2)
voc_df['weight'] = (raw_weight / raw_weight.max() * 21 * np.sign(voc_df.tstat)).round(4)

voc_df[['word','weight']].to_csv('word_weights.csv', index=False)
np.savetxt('vocab.txt', voc_df.word.values, fmt='%s')
    
    
In [6]:
    
# Train and score an L2 logistic regression on every split, restricted to the
# terms saved in vocab.txt, and write one Result_<k>.txt file per split.
#
# vocab.txt holds one term per line and bigrams contain spaces, so the file is
# read line by line. The previous `np.loadtxt(..., dtype=np.str,
# delimiter='\n')` no longer runs on current NumPy: the `np.str` alias was
# removed and a newline is rejected as a delimiter.
with open('vocab.txt') as vocab_file:
    vocab_slim = np.array(vocab_file.read().splitlines(), dtype=str)
num_folds = len(splits.columns)
for fold in range(num_folds):
    start_time = time.time()
    X_train, y_train, X_test, vocab = prep_train_test(trains[fold].copy(), tests[fold].copy())
    # Recover the true labels of the test rows, in test-row order.
    y_test = pd.merge(tests[fold][['new_id']], labels, how='left', on='new_id')
    # Keep only the feature columns whose term is in the saved vocabulary
    # (`np.isin` replaces the deprecated `np.in1d`).
    indices = np.where(np.isin(vocab, vocab_slim))[0]
    X_train = X_train[:, indices].copy()
    X_test  = X_test [:, indices].copy()

    model = LogisticRegression(penalty='l2',C=17, random_state=seed)
    _ = model.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the positive class.
    probs = model.predict_proba(X_test)[:,1]
    print('L2, Split:{}, AUC:{:<7.5}, Vocab:{}, RunTime:{:6.2f} secs'.format(
        fold, round(roc_auc_score(y_test.sentiment, probs),5), X_train.shape[1], round(time.time()-start_time,2)))
    df = pd.DataFrame({'new_id': tests[fold].new_id, 'prob': probs.round(5)})
    df.to_csv('Result_'+str(fold+1)+'.txt', index=False)
    print('Created Result_'+str(fold+1)+'.txt, rows=', df.shape[0])
    
    
In [15]:
    
# Re-score previously written prediction files against the true labels.
# NOTE(review): 'mysubmission.txt' is not written by any cell visible in this
# notebook - confirm it exists before running this cell on a fresh checkout.
filenames = ['mysubmission.txt','Result_1.txt', 'Result_2.txt', 'Result_3.txt']
for submission_path in filenames:
    submission = pd.read_csv(submission_path)
    # Left-merge keeps the submission's row order, so labels line up with probs.
    truth = submission[['new_id']].merge(labels, how='left', on='new_id')
    auc = roc_auc_score(truth.sentiment, submission.prob)
    print('model AUC:', round(auc, 5))
    
    
In [19]:
    
# Vocabulary-size sweep: for each candidate size, keep that many of the
# strongest terms (ranked by |t-statistic| over the full data set) and report
# the per-split AUC of an L2 logistic regression restricted to those terms.
#
# The full-data TF-IDF matrix and t-test do not depend on `vocab_size`, so they
# are computed once here instead of once per size.
X_train, y_train, X_test, vocab = prep_train_test(revs.copy(), revs.copy())
t_test = ttest_ind(X_train[y_train==1, :], X_train[y_train==0, :])
ranked_df = pd.DataFrame({'tstat': t_test.statistic, 'word': vocab})
ranked_df['magn_tstat'] = ranked_df.tstat.abs()
ranked_df = ranked_df.sort_values('magn_tstat', ascending=False)
num_folds = len(splits.columns)

for vocab_size in [1200, 2900, 2999]:
    print('Vocab:', vocab_size)
    # `.copy()` so adding the weight column never touches the shared ranking.
    voc_df = ranked_df.head(vocab_size).copy()
    # Weight = (|t| above the weakest kept term) ** 1.2, rescaled so the
    # strongest term has magnitude 21, carrying the sign of the t-statistic.
    voc_df['weight'] = np.power((voc_df.magn_tstat - voc_df.magn_tstat.min()), 1.2)
    voc_df['weight'] = (voc_df['weight'] / voc_df.weight.max() * 21 * np.sign(voc_df.tstat)).round(4)
    # Both files are overwritten on every iteration, so after the sweep they
    # describe the LAST vocabulary size in the list above.
    voc_df[['word','weight']].to_csv('word_weights.csv', index=False)
    np.savetxt('vocab.txt',voc_df.word.values, fmt='%s')
    # Use the selected terms directly rather than re-reading vocab.txt with
    # `np.loadtxt(..., dtype=np.str, delimiter='\n')`, which no longer runs on
    # current NumPy (alias removed, newline rejected as a delimiter).
    vocab_slim = voc_df.word.values.astype(str)
    for fold in range(num_folds):
        start_time = time.time()
        X_train, y_train, X_test, vocab = prep_train_test(trains[fold].copy(), tests[fold].copy())
        # Recover the true labels of the test rows, in test-row order.
        y_test = pd.merge(tests[fold][['new_id']], labels, how='left', on='new_id')
        # Keep only the feature columns whose term was selected
        # (`np.isin` replaces the deprecated `np.in1d`).
        indices = np.where(np.isin(vocab, vocab_slim))[0]
        X_train = X_train[:, indices].copy()
        X_test  = X_test [:, indices].copy()
        model = LogisticRegression(penalty='l2',C=17, random_state=seed)
        _ = model.fit(X_train, y_train)
        # Column 1 of predict_proba is the probability of the positive class.
        probs = model.predict_proba(X_test)[:,1]
        print('L2, Split:{}, AUC:{:<7.5}, Vocab:{}, RunTime:{:6.2f} secs'.format(
            fold, round(roc_auc_score(y_test.sentiment, probs),5), X_train.shape[1], round(time.time()-start_time,2)))
        df = pd.DataFrame({'new_id': tests[fold].new_id, 'prob': probs.round(5)})
        df.to_csv('Result_'+str(fold+1)+'.txt', index=False)