In [ ]:
import sys
sys.path.append("d:/Kaggle_ws/Bosch/src")
In [ ]:
from include.dataset_fnames import generate_station_data_fname
from include.feature_lists import numeric_features
In [96]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [ ]:
import pandas as pd
import numpy as np
In [ ]:
# Build the file name of the numeric *test* data for station L0S01 and echo it.
# generate_station_data_fname comes from include.dataset_fnames; its result is
# handed to pd.read_csv in the next cell, so it is presumably a CSV path.
s01_fname = generate_station_data_fname(
    station_id="L0S01", sample_type='test', data_type='numeric'
)
# Single-argument print(...) produces the same output on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(s01_fname)
In [ ]:
# Load the L0S01 numeric test data into a DataFrame, using the 'Id' column as the index.
s01_df = pd.read_csv(s01_fname, index_col=['Id'])
In [ ]:
# Boolean Series indexed by column name: True where the column has at least one null value.
t = s01_df.isnull().any()
In [ ]:
# Print the name of every L0S01 column that contains at least one null value.
# `t` is the boolean Series built above (column name -> "has any null"), so
# masking it with itself keeps only the True entries, in the original order.
for f in t[t].index:
    # print(...) with one argument works on both Python 2 and Python 3.
    print(f)
In [ ]:
# For every station listed in numeric_features (visited in sorted order), load
# its numeric training data and print the columns that contain null values.
for station_id in sorted(numeric_features):
    # One formatted string keeps the output identical on Python 2 and Python 3.
    print("Station: {}".format(station_id))
    s_fname = generate_station_data_fname(
        station_id=station_id, sample_type='train', data_type='numeric'
    )
    s_df = pd.read_csv(s_fname, index_col=['Id'])

    # Column name -> True when that column has at least one null value.
    null_state = s_df.isnull().any()
    for ft in null_state[null_state].index:
        print(ft)

    # Release this station's objects before the next file is read, so two
    # stations' DataFrames are never referenced at the same time.
    del s_fname
    del s_df
    del null_state
In [105]:
def impute_feature(df, feature):
    """Print the correlation matrix of *df* and plot the distribution of *feature*.

    Despite the name, no imputation is performed yet: the function only
    prints and plots, and returns None.

    Args:
        df: DataFrame holding the station data; it is not modified.
        feature: Name of the column in *df* whose distribution is plotted.
    """
    # Work on a copy so the caller's frame stays untouched.
    new_df = df.copy()
    print(new_df.corr())
    # Plot the requested column rather than a hard-coded 'L0_S1_F24'. The caller
    # passes columns that contain nulls, so drop them before plotting.
    sns.distplot(new_df[feature].dropna())
In [103]:
def find_missing_features(station_id):
    """Print the columns of a station's numeric training data that contain nulls.

    The station's file is located with generate_station_data_fname and loaded
    with the 'Id' column as index. Each column that has at least one null
    value is printed and then passed to impute_feature together with the
    loaded frame.

    Args:
        station_id: Station identifier forwarded to
            generate_station_data_fname (e.g. 'L0S01').
    """
    # One formatted string keeps the output identical on Python 2 and Python 3.
    print("Station: {}".format(station_id))
    s_fname = generate_station_data_fname(
        station_id=station_id, sample_type='train', data_type='numeric'
    )
    s_df = pd.read_csv(s_fname, index_col=['Id'])

    # Column name -> True when that column has at least one null value.
    null_state = s_df.isnull().any()
    for ft in null_state[null_state].index:
        print(ft)
        impute_feature(s_df, ft)
    # No explicit `del` of the locals: they go out of scope on return.
In [106]:
# Run the missing-value report (and per-feature inspection) for station L0S01.
find_missing_features('L0S01')
In [ ]:
# Print pandas' summary of the L0S01 test frame loaded earlier.
s01_df.info()
In [ ]:
from include.feature_lists import *
In [ ]:
def p(l):
    """Print the length of *l* followed by *l* itself, and return the length.

    Args:
        l: Any sized object (the notebook passes feature-name lists).

    Returns:
        len(l)
    """
    size = len(l)
    # One formatted string gives "<len> <contents>" on both Python 2 and
    # Python 3; print(a, b) would print a tuple on Python 2.
    print("{} {}".format(size, l))
    return size
In [ ]:
# Print each listed station's numeric feature list (via p) and accumulate the
# total number of features across them in `l`.
# L1S24_numeric and L1S25_numeric are commented out in the original and stay
# excluded from the total.
l = 0
for f in (
    L0S01_numeric, L0S09_numeric, L0S10_numeric, L0S21_numeric, L0S22_numeric,
    # L1S24_numeric, L1S25_numeric,
    L3S29_numeric, L3S30_numeric, L3S33_numeric, L3S35_numeric, L3S36_numeric,
    L3S40_numeric, L3S43_numeric,
    L3S48_numeric, L3S49_numeric,
):
    l += p(f)
# Single-argument print(...) works on both Python 2 and Python 3.
print(l)
In [ ]:
# Load the numeric training data for station L3S48, using the 'Id' column as the index.
s48_df = pd.read_csv(generate_station_data_fname(station_id='L3S48', sample_type='train', data_type='numeric'), index_col=['Id'])
In [ ]:
# Print the name of every L3S48 column that contains at least one null value.
# null_state maps column name -> True when that column has any null.
null_state = s48_df.isnull().any()
for ft in null_state[null_state].index:
    # print(...) with one argument works on both Python 2 and Python 3.
    print(ft)
In [ ]: