In [1]:
%pylab inline
%load_ext autoreload
%autoreload 2
from __future__ import division
from collections import defaultdict, namedtuple
import cPickle as pickle
from datetime import datetime, timedelta
from functools import partial
import inspect
import json
import os
import re
import sys
import numpy as np
import pandas as pd
import seaborn as sn
import sklearn as sl
In [2]:
if os.name == 'nt':
    TRAIN_PATH = r'D:\train.csv'
    PTRAIN_PATH = r'D:\train_preprocessed_all.csv'
    TEST_PATH = r'D:\test.csv'
    GOOGNEWS_PATH = r'D:\GoogleNews-vectors-negative300.bin.gz'
    VOCAB_PATH = r'D:\big.txt'
else:
    TRAIN_PATH = r'/media/mtambos/speedy/train.csv'
    PTRAIN_PATH = r'/media/mtambos/speedy/train_preprocessed_all.csv'
    TEST_PATH = r'/media/mtambos/speedy/test.csv'
    GOOGNEWS_PATH = r'/media/mtambos/speedy/GoogleNews-vectors-negative300.bin.gz'
    VOCAB_PATH = r'/media/mtambos/speedy/big.txt'
#df_orig = pd.read_csv(TRAIN_PATH, index_col="ID")
df = pd.read_csv(PTRAIN_PATH, index_col="ID")
#df
In [25]:
bool_cols = ['VAR_0008', 'VAR_0009', 'VAR_0010',
             'VAR_0011', 'VAR_0012', 'VAR_0043',
             'VAR_0196', 'VAR_0226', 'VAR_0229',
             'VAR_0230', 'VAR_0232', 'VAR_0236',
             'VAR_0239']
bool_cols += [c for c in df.columns if df[c].dtype == bool]
bool_cols = list(set(bool_cols))
int_cols = ['VAR_0013', 'VAR_0045', 'VAR_0198',
            'VAR_0227', 'VAR_0231', 'VAR_0233',
            'VAR_0237', 'VAR_0241']
int_cols += [c for c in df.columns if df[c].dtype == int]
int_cols = list(set(int_cols))
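As a cross-check on the hand-picked VAR_* lists, pandas can report which columns it already parsed as bool/int; a sketch (assumes select_dtypes, pandas 0.14.1+; the auto_* names are just illustrative):
In [ ]:
# Columns pandas already typed as bool/int64, for comparison with the
# manually curated bool_cols/int_cols lists above.
auto_bool_cols = df.select_dtypes(include=['bool']).columns.tolist()
auto_int_cols = df.select_dtypes(include=['int64']).columns.tolist()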
In [5]:
neg_samples_count = len(df['target'][df['target']==0])
pos_samples_count = len(df['target'][df['target']==1])
print '%s negative samples; %.2f%% of total' % (neg_samples_count, neg_samples_count/len(df)*100)
print '%s positive samples; %.2f%% of total' % (pos_samples_count, pos_samples_count/len(df)*100)
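The same breakdown is available in a single call; a minimal alternative using value_counts (normalize=True yields fractions of the total):
In [ ]:
# Concise class-balance check: counts and percentages per class.
print df['target'].value_counts()
print df['target'].value_counts(normalize=True) * 100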
In [26]:
def clean_bools(bool_val):
    # Pass booleans through; coerce anything else, mapping failures to NaN.
    if isinstance(bool_val, bool):
        return bool_val
    try:
        return bool(bool_val)
    except (ValueError, TypeError):
        return np.nan
df[bool_cols] = df[bool_cols].applymap(clean_bools)
df[bool_cols] = df[bool_cols].astype(np.bool)
df[bool_cols]
Out[26]:
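One caveat with the cleaner above: bool() returns True for any non-empty string (and for NaN), so textual flags such as 'false' would silently become True. If the raw columns hold string flags, an explicit map is safer; a sketch with hypothetical spellings:
In [ ]:
# Hypothetical stricter cleaner: only explicitly listed spellings are
# accepted; everything else becomes NaN instead of a spurious True.
_BOOL_MAP = {True: True, False: False,
             'true': True, 'false': False}  # extend with the real spellings
def clean_bools_strict(val):
    if isinstance(val, basestring):
        val = val.strip().lower()
    return _BOOL_MAP.get(val, np.nan)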
In [27]:
def clean_ints(int_val):
    # Pass ints through; coerce anything else, mapping failures to NaN.
    if isinstance(int_val, int):
        return int_val
    try:
        return int(int_val)
    except (ValueError, TypeError):
        return np.nan
df[int_cols] = df[int_cols].applymap(clean_ints)
df[int_cols] = df[int_cols].astype(np.int)
df[int_cols]
Out[27]:
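Where the pandas version at hand is 0.17 or newer, pd.to_numeric does the same coercion vectorized; a sketch (the int_clean name is just illustrative):
In [ ]:
# Vectorized alternative (assumes pandas >= 0.17): errors='coerce'
# turns unparseable entries into NaN in one pass per column.
int_clean = df[int_cols].apply(lambda s: pd.to_numeric(s, errors='coerce'))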
In [28]:
# Per-column standard deviation; constant columns show up with std == 0.
X = df.std(skipna=True)
X.sort()  # in-place ascending sort (older pandas API; sort_values() in 0.17+)
In [30]:
zero_std_cols = X[X == 0].index.tolist()
zero_std_cols
Out[30]:
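A zero standard deviation means the column is constant and carries no information for a classifier; if these columns are to be removed, one drop suffices:
In [ ]:
# Constant (zero-std) columns carry no signal; drop them.
df = df.drop(zero_std_cols, axis=1)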
In [21]:
df.drop_duplicates(inplace=True)
In [32]:
cols_to_drop = set()
# Pairwise scan for boolean columns that exactly duplicate an earlier
# one; only the later column of each pair is marked for removal.
for i in range(len(bool_cols) - 1):
    c1 = bool_cols[i]
    print "Checking duplicates of column " + c1
    for j in range(i + 1, len(bool_cols)):
        c2 = bool_cols[j]
        if (df[c1] == df[c2]).all():
            cols_to_drop.add(c2)
print "Duplicates found: " + str(cols_to_drop)
In [35]:
df = df.drop(cols_to_drop, axis=1)
In [42]:
for c in cols_to_drop:
    bool_cols.remove(c)
In [38]:
cols_to_drop = set()
for i in range(len(int_cols) - 1):
    c1 = int_cols[i]
    for j in range(i + 1, len(int_cols)):
        c2 = int_cols[j]
        if (df[c1] == df[c2]).all():
            cols_to_drop.add(c2)
print "Duplicates found: " + str(cols_to_drop)
In [53]:
nan_cols = df[bool_cols + int_cols].isnull().any()
nan_cols[nan_cols]
Out[53]:
So there are no missing values to impute in the boolean and integer columns.
In [54]:
nan_cols = df[bool_cols + int_cols].isnull().all()
nan_cols[nan_cols]
Out[54]:
In [55]:
df.to_csv(PTRAIN_PATH)
In [56]:
with open('int_cols.pickle', 'wb') as fp:
    pickle.dump(int_cols, fp)
In [57]:
with open('bool_cols.pickle', 'wb') as fp:
    pickle.dump(bool_cols, fp)
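The matching load calls restore both lists in a downstream notebook:
In [ ]:
# Reload the persisted column lists elsewhere.
with open('int_cols.pickle', 'rb') as fp:
    int_cols = pickle.load(fp)
with open('bool_cols.pickle', 'rb') as fp:
    bool_cols = pickle.load(fp)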