In [2]:
import os
import re
import pickle
import time
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, vstack
%matplotlib inline
# Custom modules
import const
import func
In [3]:
# Show which raw train/test files this notebook is configured to read.
# Call-form print matches the aggregation loops further down and parses on
# both Python 2 and Python 3.
print(const.TRAIN_FILES)
print(const.TEST_FILES)
In [4]:
# Read the lookup table and key it by its 'name_dat' column. Later cells use its
# 'line_V2', 'station_V2' and 'col_num' columns.
lut = pd.read_csv(const.LOOK_UP_TABLE).set_index('name_dat')
lut.head(3)
Out[4]:
In [5]:
# Read the response from the first training file; later cells use y.Response.
train_csv = const.TRAIN_FILES[0] + '.csv'
y = func.read_last_column(train_csv)
y.head(3)
Out[5]:
In [6]:
# Load the sample IDs of the train and test sets and stack them (train first).
# NOTE(review): unlike the response cell, these paths are passed without a '.csv'
# suffix -- confirm how func.read_first_column resolves the file name.
ID_train = func.read_first_column(const.TRAIN_FILES[0])
ID_test = func.read_first_column(const.TEST_FILES[0])
ID = pd.concat([ID_train, ID_test], axis=0)
print(ID.shape)
ID.head(3)
Out[6]:
In [7]:
# Load the detrended numeric feature matrix from its pickle under DATA_PATH.
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
num_data_path = os.path.join(const.DATA_PATH, 'feat_set_numeric_detrended.pkl')
with open(num_data_path, 'rb') as pkl_file:
    num_data = pickle.load(pkl_file)
In [8]:
# Dimensions of the feature matrix, followed by its repr.
print(num_data.shape)
num_data
Out[8]:
In [143]:
# One output column per distinct 'line_V2' value in the lookup table.
# The '_sum_num_dev' names are reused for both the sum and the mean table below.
line_V2s = lut['line_V2'].unique()
line_cols = ['L{}_sum_num_dev'.format(str(line_id)) for line_id in line_V2s]
In [144]:
# Per-sample sum and mean of the detrended numeric features, grouped by line_V2.
# Results are assigned positionally, so the rows of num_data are assumed to follow
# the order of ID.Id (train, then test) -- TODO confirm.
feat_num_sum = pd.DataFrame(columns=line_cols, index=ID.Id)
feat_num_mean = pd.DataFrame(columns=line_cols, index=ID.Id)
for line_V2, col in zip(line_V2s, line_cols):
    print('Analyzing line {}'.format(line_V2))
    # Column positions of this line's features inside num_data. col_num can contain
    # NaN (the original filtered them with np.isnan), which makes the values float;
    # cast to int so they are valid matrix indices.
    num_cols = lut.loc[lut['line_V2'] == line_V2, 'col_num'].dropna().values.astype(int)
    line_data = num_data[:, num_cols]  # slice once, reuse for both statistics
    feat_num_sum[col] = line_data.sum(1).A1
    if line_data.shape[1] > 0:
        feat_num_mean[col] = line_data.mean(1).A1
    else:
        # A mean over zero columns is undefined; mirror the guard used in the
        # station loop and keep the column as float NaN.
        feat_num_mean[col] = np.nan
In [148]:
# Persist the per-line sums to CSV; the index column is written under the label 'ID'.
sum_line_csv = os.path.join(const.DATA_PATH, 'feat_set_numeric_detrended_sum_lineV2.csv')
feat_num_sum.to_csv(sum_line_csv, index_label='ID')
In [149]:
# Treat exact zeros as missing in both tables, then attach the response as column 'R'.
# NOTE(review): the assignment aligns y.Response on the index (these frames are
# indexed by ID.Id) -- confirm y carries the same index; unmatched rows become NaN.
for frame in (feat_num_sum, feat_num_mean):
    frame.replace(0, np.nan, inplace=True)
    frame['R'] = y.Response
In [150]:
# Peek at ten random rows; the fixed seed keeps the displayed rows stable across re-runs.
feat_num_sum.sample(10, random_state=0)
Out[150]:
In [151]:
# Column-wise average of the per-line mean table within each response class 'R'.
feat_num_mean.groupby(by='R').mean()
Out[151]:
In [152]:
# Column-wise average of the per-line sum table within each response class 'R'.
feat_num_sum.groupby(by='R').mean()
Out[152]:
In [153]:
# Overlay normalised histograms of the per-line deviation sums for R == 0 (green)
# and R == 1 (red), one panel per line column.
data0 = feat_num_sum[feat_num_sum['R'] == 0]
data1 = feat_num_sum[feat_num_sum['R'] == 1]
n_bins = 50
n_grid_cols = 4
# Ceil division: enough rows for every line, so more than 16 lines cannot
# index past the end of the axes grid.
n_grid_rows = max(1, -(-len(line_cols) // n_grid_cols))
f, ax = plt.subplots(n_grid_rows, n_grid_cols,
                     figsize=(16, 4 * n_grid_rows), squeeze=False)
f.suptitle('Sum of deviation from mean per line V2')
for i, line_V2 in enumerate(line_cols):
    # Floor division keeps the row index an int on Python 3 as well as Python 2.
    panel = ax[i // n_grid_cols, i % n_grid_cols]
    panel.set_title('Line V2: {}'.format(line_V2))
    # Restrict the histogram to [min/2, max/2] of the column.
    ran = [feat_num_sum[line_V2].min() / 2, feat_num_sum[line_V2].max() / 2]
    if not np.all(np.isfinite(ran)):
        continue  # column holds no values at all; np.histogram needs a finite range
    freq0, bins = np.histogram(data0[line_V2].dropna().values,
                               bins=n_bins, density=True, range=ran)
    freq1, bins = np.histogram(data1[line_V2].dropna().values,
                               bins=n_bins, density=True, range=ran)
    width = np.diff(bins)
    # Anchor every bar at the LEFT edge of its bin; starting at bins[1:] drew the
    # whole histogram shifted to the right.
    panel.bar(bins[:-1], freq0, alpha=0.5, color='g', width=width, align='edge')
    panel.bar(bins[:-1], freq1, alpha=0.5, color='r', width=width, align='edge')
In [154]:
# Overlay normalised histograms of the per-line deviation means for R == 0 (green)
# and R == 1 (red), one panel per line column.
data0 = feat_num_mean[feat_num_mean['R'] == 0]
data1 = feat_num_mean[feat_num_mean['R'] == 1]
n_bins = 50
n_grid_cols = 4
# Ceil division: enough rows for every line, so more than 16 lines cannot
# index past the end of the axes grid.
n_grid_rows = max(1, -(-len(line_cols) // n_grid_cols))
f, ax = plt.subplots(n_grid_rows, n_grid_cols,
                     figsize=(16, 4 * n_grid_rows), squeeze=False)
f.suptitle('Mean of deviation from mean per line V2')
for i, line_V2 in enumerate(line_cols):
    # Floor division keeps the row index an int on Python 3 as well as Python 2.
    panel = ax[i // n_grid_cols, i % n_grid_cols]
    panel.set_title('Line V2: {}'.format(line_V2))
    # Restrict the histogram to [min/2, max/2] of the column.
    ran = [feat_num_mean[line_V2].min() / 2, feat_num_mean[line_V2].max() / 2]
    if not np.all(np.isfinite(ran)):
        continue  # column holds no values at all; np.histogram needs a finite range
    freq0, bins = np.histogram(data0[line_V2].dropna().values,
                               bins=n_bins, density=True, range=ran)
    freq1, bins = np.histogram(data1[line_V2].dropna().values,
                               bins=n_bins, density=True, range=ran)
    width = np.diff(bins)
    # Anchor every bar at the LEFT edge of its bin; starting at bins[1:] drew the
    # whole histogram shifted to the right.
    panel.bar(bins[:-1], freq0, alpha=0.5, color='g', width=width, align='edge')
    panel.bar(bins[:-1], freq1, alpha=0.5, color='r', width=width, align='edge')
In [155]:
# One output column per distinct 'station_V2' value in the lookup table.
station_V2s = lut['station_V2'].unique()
station_cols = ['S{}_sum_num_dev'.format(str(station_id)) for station_id in station_V2s]
In [165]:
# Per-sample sum and mean of the detrended numeric features, grouped by station_V2.
# NOTE: this rebinds feat_num_sum / feat_num_mean, replacing the per-line tables
# built earlier in the notebook.
feat_num_sum = pd.DataFrame(columns=station_cols, index=ID.Id)
feat_num_mean = pd.DataFrame(columns=station_cols, index=ID.Id)
for station_V2, col in zip(station_V2s, station_cols):
    print('Analyzing station V2 {}'.format(station_V2))
    # Column positions of this station's features inside num_data. col_num can
    # contain NaN (the original filtered them with np.isnan), which makes the values
    # float; cast to int so they are valid matrix indices.
    num_cols = lut.loc[lut['station_V2'] == station_V2, 'col_num'].dropna().values.astype(int)
    station_data = num_data[:, num_cols]  # slice once, reuse below
    feat_num_sum[col] = station_data.sum(1).A1
    if station_data.shape[1] > 0:
        feat_num_mean[col] = station_data.mean(1).A1
    else:
        # No numeric features for this station: keep the column as float NaN
        # rather than leaving the untyped placeholder column in place.
        feat_num_mean[col] = np.nan
In [1]:
# Preview the first rows only, like the other inspection cells, instead of
# rendering the whole table.
feat_num_sum.head(3)
In [166]:
# Persist the per-station sums to CSV; the index column is written under the label 'ID'.
sum_station_csv = os.path.join(const.DATA_PATH, 'feat_set_numeric_detrended_sum_stationV2.csv')
feat_num_sum.to_csv(sum_station_csv, index_label='ID')
In [167]:
# Treat exact zeros as missing in both tables, then attach the response as column 'R'.
# NOTE(review): the assignment aligns y.Response on the index (these frames are
# indexed by ID.Id) -- confirm y carries the same index; unmatched rows become NaN.
for frame in (feat_num_sum, feat_num_mean):
    frame.replace(0, np.nan, inplace=True)
    frame['R'] = y.Response
In [168]:
# One figure per station column: overlaid normalised histograms of the deviation
# sums for R == 0 (green) and R == 1 (red).
data0 = feat_num_sum[feat_num_sum['R'] == 0]
data1 = feat_num_sum[feat_num_sum['R'] == 1]
n_bins = 50
for station_V2 in station_cols:
    ran = [feat_num_sum[station_V2].min(), feat_num_sum[station_V2].max()]
    if not np.all(np.isfinite(ran)):
        # A station whose sums are all missing has no finite range, which
        # np.histogram cannot bin.
        print('Skipping {}: no values to plot'.format(station_V2))
        continue
    fig, axis = plt.subplots(figsize=(8, 8))
    freq0, bins = np.histogram(data0[station_V2].dropna().values,
                               bins=n_bins, density=True, range=ran)
    freq1, bins = np.histogram(data1[station_V2].dropna().values,
                               bins=n_bins, density=True, range=ran)
    width = np.diff(bins)
    # Anchor every bar at the LEFT edge of its bin; starting at bins[1:] drew the
    # whole histogram shifted to the right.
    axis.bar(bins[:-1], freq0, alpha=0.5, color='g', width=width, align='edge')
    axis.bar(bins[:-1], freq1, alpha=0.5, color='r', width=width, align='edge')
    axis.set_title('Deviation Sums Station V2: {}'.format(station_V2))
    plt.show()
    # Release the figure so a long station list does not accumulate open figures.
    plt.close(fig)