In [4]:
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import ttest_ind
# Silence DeprecationWarning noise from the libraries imported in this cell.
# `np.warnings` was only an accidental re-export of the stdlib module and no
# longer exists in current NumPy, so the stdlib module is used directly.
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
# Echo the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Wider and longer pandas reprs so long review text and wide frames print in full.
for option_name, option_value in (
    ("display.width", 100),
    ("display.max_columns", 100),
    ('display.max_colwidth', 1000),
    ('display.max_rows', 500),
):
    pd.set_option(option_name, option_value)

# Matching NumPy print settings; suppress=True turns off scientific notation.
np.set_printoptions(linewidth=120, threshold=5000, edgeitems=50, suppress=True)

# Seed used for NumPy's global RNG and for the estimators' random_state.
seed = 42
In [5]:
# Seed NumPy's global RNG before any data handling.
np.random.seed(seed=seed)

# --- Load the reviews --------------------------------------------------------
# NOTE(review): the file is named .tsv but is parsed as space-delimited with
# quoted fields - confirm this matches how data.tsv was written.
print('reading: data.tsv into revs dataframe...')
revs = pd.read_csv('data.tsv', sep=' ', quotechar='"', escapechar='\\')
print('revs dataframe:', revs.shape)

# --- Load the split definitions ----------------------------------------------
# Each column of splits.csv lists the new_id values held out as that split's test set.
splits = pd.read_csv('splits.csv', sep='\t', dtype={'split_1': int, 'split_2': int, 'split_3': int})
print('ids dataframe:', splits.shape)

# Ground-truth lookup used later to score predictions.
labels = revs[['new_id', 'sentiment']]

# --- Materialise one train/test pair per split -------------------------------
trains, tests = [], []
non_label_columns = revs.columns != 'sentiment'
for split_no, col in enumerate(splits.columns, start=1):
    held_out = revs.new_id.isin(splits[col])
    trains.append(revs.loc[~held_out, :])
    # Test frames drop the sentiment column so the label cannot leak into scoring.
    tests.append(revs.loc[held_out, non_label_columns])
    print('Split', split_no, trains[-1].shape, tests[-1].shape)

# --- Persist the first split -------------------------------------------------
# NOTE(review): the message mentions a labels file but only train.csv and
# test.csv are written here - confirm whether a labels file is expected.
print('Writing train, test, labels csv files...')
fold = 0
trains[fold].to_csv('train.csv', index=False)
tests[fold].to_csv('test.csv', index=False)
print('Files Saved')
In [3]:
def prep_train_test(train, test):
    """Turn raw review text into dense TF-IDF feature matrices.

    The vectorizer is fitted on ``train`` only and then applied to ``test``.
    Neither input frame is modified.

    Parameters
    ----------
    train : pandas.DataFrame
        Must provide a ``review`` text column and a ``sentiment`` column.
    test : pandas.DataFrame
        Must provide a ``review`` text column.

    Returns
    -------
    X_train : numpy.ndarray
        Dense TF-IDF matrix for ``train`` (rows follow ``train`` order).
    y_train : pandas.Series
        ``train.sentiment``.
    X_test : numpy.ndarray
        Dense TF-IDF matrix for ``test`` using the vocabulary fitted on ``train``.
    vocab : numpy.ndarray
        String array of feature names, aligned with the matrix columns.
    """
    # Named so it does not shadow the `stop_words` module imported at the top
    # of the notebook.
    custom_stop_words = ['the', 'with', 'he', 'she', 'also', 'made', 'had', 'out',
                         'in', 'his', 'hers', 'there', 'was', 'then']

    def _strip_line_breaks(reviews):
        # Literal (non-regex) replacement, so the result does not depend on the
        # pandas version's default for `regex`.
        return reviews.str.replace('<br /><br />', ' ', regex=False)

    # Work on derived Series instead of writing back into the caller's frames.
    train_text = _strip_line_breaks(train.review)
    test_text = _strip_line_breaks(test.review)

    cv = TfidfVectorizer(stop_words=custom_stop_words, ngram_range=(1, 2), min_df=20, max_df=0.3)
    X_train = cv.fit_transform(train_text).toarray()
    X_test = cv.transform(test_text).toarray()
    y_train = train.sentiment

    # Current scikit-learn only offers get_feature_names_out(); older releases
    # only offer get_feature_names(). Support both.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()
    # Force a unicode-string dtype (get_feature_names_out() yields an object
    # array) so downstream membership tests behave as they did before.
    vocab = np.array(feature_names, dtype=str)
    return X_train, y_train, X_test, vocab
In [5]:
%%time
# Rank every TF-IDF term by how strongly it separates positive from negative
# reviews, keep the top 2900, and write the term list and signed weights to disk.
split = 0

# Vectorise the full review set (the same frame is passed as both train and test).
X_train, y_train, X_test, vocab = prep_train_test(revs.copy(), revs.copy())

# Per-column two-sample t-test: sentiment == 1 rows versus sentiment == 0 rows.
t_test = ttest_ind(X_train[y_train==1, :], X_train[y_train==0, :])

voc_df = (
    pd.DataFrame({'tstat': t_test.statistic, 'word': vocab})
    .assign(magn_tstat=lambda frame: frame.tstat.abs())
    .sort_values('magn_tstat', ascending=False)
    .head(2900)
)

# Weight = (|t| shifted so the weakest kept term is 0) ** 1.2, rescaled so the
# strongest term has magnitude 21, then given the sign of the t-statistic.
raw_weight = np.power(voc_df.magn_tstat - voc_df.magn_tstat.min(), 1.2)
voc_df['weight'] = (raw_weight / raw_weight.max() * 21 * np.sign(voc_df.tstat)).round(4)

voc_df[['word','weight']].to_csv('word_weights.csv', index=False)
np.savetxt('vocab.txt', voc_df.word.values, fmt='%s')
In [6]:
# Evaluate an L2 logistic regression on each split, restricted to the trimmed
# vocabulary stored in vocab.txt, and write one Result_<k>.txt file per split.

# Read the vocabulary one term per line. Terms may contain spaces (bigrams), so
# the file cannot be split on whitespace, and current NumPy rejects
# np.loadtxt(delimiter='\n'); `np.str` is also gone from current NumPy.
# latin1 matches np.savetxt's default output encoding.
with open('vocab.txt', encoding='latin1') as vocab_file:
    vocab_slim = np.array([line.strip() for line in vocab_file if line.strip()], dtype=str)

num_folds = len(splits.columns)
for fold in range(num_folds):
    start_time = time.time()

    X_train, y_train, X_test, vocab = prep_train_test(trains[fold].copy(), tests[fold].copy())
    # Test frames carry no sentiment column, so look the labels up by new_id.
    y_test = pd.merge(tests[fold][['new_id']], labels, how='left', on='new_id')

    # Keep only the columns whose term is in the trimmed vocabulary
    # (np.isin replaces the deprecated np.in1d).
    indices = np.where(np.isin(vocab, vocab_slim))[0]
    X_train = X_train[:, indices].copy()
    X_test = X_test[:, indices].copy()

    model = LogisticRegression(penalty='l2', C=17, random_state=seed)
    _ = model.fit(X_train, y_train)
    probs = model.predict_proba(X_test)[:, 1]  # column 1 = second entry of model.classes_

    print('L2, Split:{}, AUC:{:<7.5}, Vocab:{}, RunTime:{:6.2f} secs'.format(
        fold, round(roc_auc_score(y_test.sentiment, probs),5), X_train.shape[1], round(time.time()-start_time,2)))

    df = pd.DataFrame({'new_id': tests[fold].new_id, 'prob': probs.round(5)})
    df.to_csv('Result_'+str(fold+1)+'.txt', index=False)
    print('Created Result_'+str(fold+1)+'.txt, rows=', df.shape[0])
In [15]:
# Re-score the saved prediction files against the ground-truth labels.
filenames = ['mysubmission.txt','Result_1.txt', 'Result_2.txt', 'Result_3.txt']
for filename in filenames:
    res = pd.read_csv(filename)
    # Left join keeps the prediction row order while attaching the true sentiment.
    y_test = res[['new_id']].merge(labels, on='new_id', how='left')
    auc = roc_auc_score(y_test.sentiment, res.prob)
    print('model AUC:', round(auc, 5))
In [19]:
# Sweep over vocabulary sizes: for each size, rebuild vocab.txt and
# word_weights.csv from the t-statistic ranking, then re-evaluate every split.
split = 0

# The full-data vectorisation and t-test do not depend on vocab_size, so they
# are computed once here instead of once per sweep iteration.
X_train, y_train, X_test, vocab = prep_train_test(revs.copy(), revs.copy())
t_test = ttest_ind(X_train[y_train==1, :], X_train[y_train==0, :])
ranked_terms = pd.DataFrame({'tstat': t_test.statistic, 'word': vocab})
ranked_terms['magn_tstat'] = ranked_terms.tstat.abs()
ranked_terms = ranked_terms.sort_values('magn_tstat', ascending=False)

num_folds = len(splits.columns)

for vocab_size in [1200, 2900, 2999]:
    print('Vocab:', vocab_size)

    # .copy() so the weight column is added to an independent frame rather than
    # to a slice of ranked_terms.
    voc_df = ranked_terms.head(vocab_size).copy()
    # Weight = (|t| shifted so the weakest kept term is 0) ** 1.2, rescaled so
    # the strongest term has magnitude 21, signed by the t-statistic.
    voc_df['weight'] = np.power((voc_df.magn_tstat - voc_df.magn_tstat.min()), 1.2)
    voc_df['weight'] = (voc_df['weight'] / voc_df.weight.max() * 21 * np.sign(voc_df.tstat)).round(4)
    voc_df[['word','weight']].to_csv('word_weights.csv', index=False)
    np.savetxt('vocab.txt', voc_df.word.values, fmt='%s')

    # Read the vocabulary back one term per line. Terms may contain spaces
    # (bigrams); current NumPy rejects np.loadtxt(delimiter='\n') and no longer
    # has `np.str`. latin1 matches np.savetxt's default output encoding.
    with open('vocab.txt', encoding='latin1') as vocab_file:
        vocab_slim = np.array([line.strip() for line in vocab_file if line.strip()], dtype=str)

    for fold in range(num_folds):
        start_time = time.time()

        X_train, y_train, X_test, vocab = prep_train_test(trains[fold].copy(), tests[fold].copy())
        # Test frames carry no sentiment column, so look the labels up by new_id.
        y_test = pd.merge(tests[fold][['new_id']], labels, how='left', on='new_id')

        # Keep only the columns whose term is in the trimmed vocabulary
        # (np.isin replaces the deprecated np.in1d).
        indices = np.where(np.isin(vocab, vocab_slim))[0]
        X_train = X_train[:, indices].copy()
        X_test = X_test[:, indices].copy()

        model = LogisticRegression(penalty='l2', C=17, random_state=seed)
        _ = model.fit(X_train, y_train)
        probs = model.predict_proba(X_test)[:, 1]

        print('L2, Split:{}, AUC:{:<7.5}, Vocab:{}, RunTime:{:6.2f} secs'.format(
            fold, round(roc_auc_score(y_test.sentiment, probs),5), X_train.shape[1], round(time.time()-start_time,2)))

        df = pd.DataFrame({'new_id': tests[fold].new_id, 'prob': probs.round(5)})
        df.to_csv('Result_'+str(fold+1)+'.txt', index=False)