In [38]:
%pylab inline
%load_ext autoreload
%autoreload 2
from __future__ import division
from collections import defaultdict, namedtuple
from datetime import datetime, timedelta
from functools import partial
import inspect
import json
import os
import re
import sys
import cPickle as pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
def analyze_str_columns(cols, df, only_percent=False):
    """Print target counts/percentages for each value of the given string columns."""
    print 'Total samples: %s' % len(df)
    for c in cols:
        print '##############################'
        VAR_df = df[[c, 'target']]
        unique_vals = VAR_df[c].unique()
        str_0 = []
        str_1 = []
        col_names = []
        for u in unique_vals:
            # NaNs are the only non-string values, so anything that
            # is not a str is treated as missing.
            if type(u) == str:
                col_mask = (VAR_df[c] == u)
            else:
                col_mask = VAR_df[c].isnull()
            str_0.append(len(VAR_df[col_mask & (VAR_df['target'] == 0)]))
            str_1.append(len(VAR_df[col_mask & (VAR_df['target'] == 1)]))
            col_names.append('%s_%s' % (c, u))
        VAR_df_counts = pd.DataFrame([str_0, str_1],
                                     columns=col_names,
                                     index=pd.Index([0, 1], name='target'))
        if not only_percent:
            print "------Counts-------"
            print VAR_df_counts
        print "----Percentages----"
        print VAR_df_counts / VAR_df_counts.sum() * 100
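To illustrate what analyze_str_columns reports, here is a minimal sketch on a made-up toy DataFrame (the toy_df name and its values are purely illustrative, not part of the original data):
# Hypothetical toy example: one string column plus the target.
toy_df = pd.DataFrame({'VAR_X': ['a', 'a', 'b', None],
                       'target': [0, 1, 1, 1]})
analyze_str_columns(['VAR_X'], toy_df)
# Prints, per value of VAR_X (with NaN as its own category),
# how many samples fall in target 0 vs. target 1, and the percentages.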
In [2]:
if os.name == 'nt':
    TRAIN_PATH = r'D:\train.csv'
    PTRAIN_PATH = r'D:\train_preprocessed_float_string.csv'
    TEST_PATH = r'D:\test.csv'
    GOOGNEWS_PATH = r'D:\GoogleNews-vectors-negative300.bin.gz'
    VOCAB_PATH = r'D:\big.txt'
else:
    TRAIN_PATH = r'/media/mtambos/speedy/train.csv'
    PTRAIN_PATH = r'/media/mtambos/speedy/train_preprocessed_float_string.csv'
    TEST_PATH = r'/media/mtambos/speedy/test.csv'
    GOOGNEWS_PATH = r'/media/mtambos/speedy/GoogleNews-vectors-negative300.bin.gz'
    VOCAB_PATH = r'/media/mtambos/speedy/big.txt'
df = pd.read_csv(PTRAIN_PATH, index_col="ID")
In [3]:
str_cols = [u'VAR_0001', u'VAR_0005', u'VAR_0044',
u'VAR_0200', u'VAR_0202', u'VAR_0214',
u'VAR_0216', u'VAR_0222', u'VAR_0237',
u'VAR_0274', u'VAR_0283', u'VAR_0305',
u'VAR_0325', u'VAR_0342', u'VAR_0352',
u'VAR_0353', u'VAR_0354', u'VAR_0404',
u'VAR_0466', u'VAR_0467', u'VAR_0493',
u'VAR_1934']
try:
    # Keep only the columns that exist in df and are still string-typed.
    str_cols = [c for c in str_cols if c in df.columns and df[c].dtype == np.object]
except NameError:
    pass
In [5]:
neg_samples_count = len(df['target'][df['target']==0])
pos_samples_count = len(df['target'][df['target']==1])
print '%s negative samples; %.2f%% of total' % (neg_samples_count, neg_samples_count/len(df)*100)
print '%s positive samples; %.2f%% of total' % (pos_samples_count, pos_samples_count/len(df)*100)
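The same class balance can be read off more directly with value_counts; a minimal equivalent sketch:
# Sketch: relative class frequencies in percent, one line.
print df['target'].value_counts(normalize=True) * 100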
In [4]:
def filter_str(str_cell):
    # Collapse runs of non-alphanumeric characters (and underscores) into single spaces.
    str_cell = re.sub(r'[\W_]+', ' ', str(str_cell))
    str_cell = str_cell.strip().lower()
    # Treat placeholder values as missing.
    if str_cell in ('1', '-1', '[]', 'nan', ''):
        return None
    else:
        return str_cell

df[str_cols] = df[str_cols].astype(np.str).applymap(filter_str)
df[str_cols]
Out[4]:
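For reference, a few illustrative inputs and outputs of filter_str (the example values are made up for illustration):
filter_str('Some_Value!')  # -> 'some value'
filter_str('[]')           # -> None (placeholder treated as missing)
filter_str(np.nan)         # -> None (str(nan) == 'nan')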
In [35]:
str_desc = df[str_cols].describe()
str_desc = pd.DataFrame(str_desc, columns=sorted(str_desc.columns, key=lambda c: str_desc.loc['std', c]))
str_desc
Out[35]:
Column VAR_0044 does not contain a single value after cleaning, so drop it.
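A quick way to confirm this before dropping (a hedged check, not one of the original cells): count the non-null entries left after filtering.
# Sanity check (sketch): every entry should be null after filter_str.
print df['VAR_0044'].notnull().sum()  # expected: 0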
In [25]:
df.drop('VAR_0044', axis=1, inplace=True)
str_cols.remove('VAR_0044')
In [26]:
analyze_str_columns(['VAR_0202', 'VAR_0216', 'VAR_0222', 'VAR_0466'], df)
In each of these columns the values are distributed almost identically across the two target classes, so they carry no discriminative information and can be dropped.
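As a more formal check than eyeballing the percentages, a chi-squared test of independence between each column and the target could be run; this is a sketch using scipy (not part of the original notebook), where a large p-value is consistent with the column being uninformative:
from scipy.stats import chi2_contingency

# Sketch: test independence of each candidate column and the target.
# A large p-value means no evidence of association.
for c in ['VAR_0202', 'VAR_0216', 'VAR_0222', 'VAR_0466']:
    contingency = pd.crosstab(df[c], df['target'])
    chi2, p_value, dof, expected = chi2_contingency(contingency)
    print '%s: p-value = %.4f' % (c, p_value)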
In [27]:
cols = ['VAR_0202', 'VAR_0216', 'VAR_0222', 'VAR_0466']
df.drop(cols, axis=1, inplace=True)
for c in cols:
    str_cols.remove(c)
del cols
In [33]:
encoder = LabelEncoder()
for col in str_cols:
    df[col] = encoder.fit_transform(df[col])
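Note that fit_transform refits the single encoder on every column, so each column's learned mapping is discarded. If the same label-to-integer mapping must later be applied to the test set, one fitted encoder per column can be kept instead; a minimal sketch (the encoders dict is an assumption, not in the original):
# Sketch: keep one fitted encoder per column so the test set
# can be transformed with the same label -> integer mapping.
encoders = {}
for col in str_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])
# Later, for the test set: test_df[col] = encoders[col].transform(test_df[col])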
In [36]:
df.to_csv(PTRAIN_PATH)
In [39]:
with open('deleted_str_cols.pickle', 'wb') as fp:
    pickle.dump(['VAR_0044', 'VAR_0202', 'VAR_0216', 'VAR_0222', 'VAR_0466'], fp)