In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
import sys
# Extend the module search path so the `include.*` imports in the next cell resolve.
# NOTE(review): this is a hard-coded absolute Windows path; the notebook will only
# run on a machine with this exact directory layout — consider a configurable root.
sys.path.append('d:/Kaggle_ws/Bosch/src')
In [3]:
from include.feature_lists import numeric_missing_features
from include.dataset_fnames import generate_station_data_fname
In [4]:
# Station analysed by the cells below.
station_id = 'L0S01'
# Display the entry registered for this station in numeric_missing_features
# (presumably the numeric features that contain missing values — confirm in
# include.feature_lists).
numeric_missing_features[station_id]
Out[4]:
In [5]:
# Path of the training numeric data file for the selected station.
fname = generate_station_data_fname(
    station_id=station_id,
    sample_type='train',
    data_type='numeric',
)
# Columns to load: the 'Id' key followed by the station's entries from
# numeric_missing_features.
features = ['Id'] + numeric_missing_features[station_id]
In [6]:
# Read only the selected columns and use 'Id' as the row index.
station_df = pd.read_csv(
    fname,
    index_col='Id',
    usecols=features,
)
In [7]:
# Keep only the rows where L0_S1_F24 has a value.
station_df = station_df.dropna(subset=['L0_S1_F24'])
In [8]:
# Plot the distribution of the L0_S1_F24 values currently held in station_df.
sns.distplot(station_df['L0_S1_F24'])
plt.show()
In [10]:
# Import from the public package path; the `sklearn.neighbors.kde` submodule is a
# private implementation detail that newer scikit-learn releases no longer expose.
from sklearn.neighbors import KernelDensity

# KernelDensity.fit expects a 2-D (n_samples, n_features) array, so the single
# feature column is reshaped into one column before fitting.
kde = KernelDensity(kernel='gaussian', bandwidth=0.001).fit(
    station_df['L0_S1_F24'].values.reshape(-1, 1)
)
In [ ]:
# kde.sample() with no arguments draws a single sample of shape (1, 1), which
# gives no distribution to plot. Draw many samples (seeded so the figure is
# reproducible) and flatten them to the 1-D input distplot expects.
kde_samples = kde.sample(n_samples=1000, random_state=0)
sns.distplot(kde_samples.ravel())
plt.show()
In [10]:
from sklearn.model_selection import GridSearchCV
In [21]:
# Turn the L0_S1_F24 values into a single-column 2-D array (one row per value).
X = station_df['L0_S1_F24'].values.reshape(-1, 1)
In [ ]:
# Cross-validated search over KernelDensity bandwidths; the grid currently
# holds a single candidate value.
params = dict(bandwidth=[0.1])
grid = GridSearchCV(estimator=KernelDensity(), param_grid=params)
grid.fit(X)
In [ ]:
# Report the bandwidth of the best estimator found by the grid search.
print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
In [ ]:
# Display 20 logarithmically spaced values from 10**-1 to 10**1.
# NOTE(review): presumably a candidate bandwidth grid for the search — the result
# is not assigned or used by any visible cell.
np.logspace(-1, 1, 20)
In [11]:
from include.feature_lists import numeric_features
In [12]:
# Path of the training numeric data file for the selected station. Use the
# shared `station_id` variable (instead of repeating the literal 'L0S01') so the
# file name and the feature list below can never refer to different stations.
fname = generate_station_data_fname(
    station_id=station_id,
    sample_type='train',
    data_type='numeric',
)
# Columns to load: the 'Id' key, the station's entries from numeric_features,
# and the 'time' column used for ordering and grouping further down.
features = ['Id'] + numeric_features[station_id] + ['time']
In [13]:
# Reload station_df with the new column selection, indexed by 'Id'.
# This replaces the previously loaded (and filtered) frame.
station_df = pd.read_csv(
    fname,
    index_col='Id',
    usecols=features,
)
In [ ]:
In [ ]:
In [14]:
# Order the rows by 'time' in place; mergesort is stable, so rows sharing a
# time value keep their relative order.
station_df.sort_values(by='time', kind='mergesort', inplace=True)
# Record each row's position in the sorted order.
station_df['row'] = range(len(station_df))
In [15]:
# Display the rows whose L0_S1_F24 value is missing.
station_df.loc[station_df['L0_S1_F24'].isnull()]
Out[15]:
In [70]:
# Display the 20 rows positioned from x-10 up to (but excluding) x+10.
# NOTE(review): `x` is not defined in any visible cell, so this raises NameError
# on a fresh kernel — presumably it was the position of a missing-value row; confirm.
station_df.iloc[x-10:x+10]
Out[70]:
In [16]:
# Scatter L0_S1_F24 against time for the 200 rows surrounding position idx,
# with a red vertical line at the time of row idx itself.
idx = 218256
window = station_df.iloc[idx - 100:idx + 100]
X = window['time']
y = window['L0_S1_F24']
plt.scatter(X, y, marker='o')
plt.axvline(x=station_df.iloc[idx]['time'], color='r')
plt.show()
In [17]:
# Same view as the previous figure, centred on a different row position:
# L0_S1_F24 against time for rows idx-100 .. idx+99, red line at row idx.
idx = 614509
window = station_df.iloc[idx - 100:idx + 100]
X = window['time']
y = window['L0_S1_F24']
plt.scatter(X, y, marker='o')
plt.axvline(x=station_df.iloc[idx]['time'], color='r')
plt.show()
In [18]:
# Collect every row that shares its 'time' value with the row at position idx.
idx = 614509
time = station_df.iloc[idx]['time']
X = station_df.loc[station_df['time'] == time]
In [19]:
# Discard the rows of X that have no L0_S1_F24 value, then display the rest.
X = X.dropna(subset=['L0_S1_F24'])
X
Out[19]:
In [141]:
# Histogram of the L0_S1_F24 values in X using 50 bins.
plt.hist(X['L0_S1_F24'], bins=50)
plt.show()
In [20]:
# Mean of every column for each distinct 'time' value.
s = station_df.groupby('time').mean()
# Preview the first few per-time means of L0_S1_F24.
s['L0_S1_F24'].head()
Out[20]:
In [21]:
# Per-time mean of L0_S1_F24. The column is selected before aggregating so only
# that column is averaged, rather than averaging every column and discarding
# all but one; the result is the same Series (indexed by 'time').
data = station_df.groupby('time')['L0_S1_F24'].mean()
In [22]:
# Per-time median of L0_S1_F24. The column is selected before aggregating so
# only that column is processed, rather than taking the median of every column
# and discarding all but one; the result is the same Series (indexed by 'time').
data2 = station_df.groupby('time')['L0_S1_F24'].median()
In [36]:
# Display the per-time mean of L0_S1_F24 held in `data`.
data
Out[36]:
In [31]:
# Per-time mean (blue) and median (green) of L0_S1_F24 on one set of axes.
for series, colour in ((data, 'b'), (data2, 'g')):
    plt.plot(series, color=colour)
# ['L0_S2_F24']
# Red vertical lines at the times of the two inspected row positions.
for row_pos in (218256, 614509):
    plt.axvline(x=station_df.iloc[row_pos]['time'], color='r')
# plt.xlim(250,500)
# plt.ylim(-0.1,0)
plt.show()
In [38]:
%%time
from collections import Counter
mode = []
for i, f in station_df.groupby(['time']):
c = Counter(f['L0_S1_F24'])
mode.append(c.most_common(1)[0][0])
# print mode
In [39]:
# Display the sizes of the mean, median and mode results side by side.
(len(data), len(data2), len(mode))
Out[39]:
In [40]:
# Show the first five mean, median and mode values for a quick comparison.
# Written as print(...) calls with a single argument, which behave the same on
# Python 2 and Python 3 and match the print(...) call used earlier in the notebook.
print(data.values[0:5])
print(data2.values[0:5])
print(mode[0:5])
In [55]:
# Wrap the per-time mean Series in a one-column DataFrame.
data3 = data.to_frame()
In [57]:
# Replace the L0_S1_F24 column of data3 with the values collected in `mode`.
# NOTE(review): relies on `mode` having one entry per row of data3, in the same
# order as its index — confirm.
data3['L0_S1_F24'] = mode
In [58]:
# Display data3 after its L0_S1_F24 column was overwritten with `mode`.
data3
Out[58]:
In [60]:
# Two stacked panels: `data` (blue) on top and `data3` (yellow) below, each
# with red vertical lines at the times of the two inspected row positions.
# plt.plot(data2.values, color='g')
for panel, values, colour in ((1, data, 'b'), (2, data3, 'y')):
    plt.subplot(2, 1, panel)
    plt.plot(values, color=colour)
    for row_pos in (218256, 614509):
        plt.axvline(x=station_df.iloc[row_pos]['time'], color='r')
plt.show()
In [ ]: