In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import sys
sys.path.append('d:/Kaggle_ws/Bosch/src')

In [3]:
from include.feature_lists import numeric_missing_features
from include.dataset_fnames import generate_station_data_fname

In [4]:
station_id = 'L0S01'
numeric_missing_features[station_id]


Out[4]:
['L0_S1_F24']

In [5]:
fname = generate_station_data_fname(station_id=station_id, sample_type='train', data_type='numeric')
features = ['Id'] + numeric_missing_features[station_id]

In [6]:
station_df = pd.read_csv(fname, usecols=features, index_col='Id')

In [7]:
station_df=station_df[station_df['L0_S1_F24'].notnull()]

In [8]:
sns.distplot(station_df['L0_S1_F24'])
plt.show()



In [10]:
from sklearn.neighbors.kde import KernelDensity

kde = KernelDensity(kernel='gaussian', bandwidth=0.001).fit(station_df['L0_S1_F24'])


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-10-e50757f4679d> in <module>()
      1 from sklearn.neighbors.kde import KernelDensity
      2 
----> 3 kde = KernelDensity(kernel='gaussian', bandwidth=0.001).fit(station_df['L0_S1_F24'])

d:\Anaconda\envs\Deep2\lib\site-packages\sklearn\neighbors\kde.pyc in fit(self, X, y)
    123         """
    124         algorithm = self._choose_algorithm(self.algorithm, self.metric)
--> 125         X = check_array(X, order='C', dtype=DTYPE)
    126 
    127         kwargs = self.metric_params

d:\Anaconda\envs\Deep2\lib\site-packages\sklearn\utils\validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    408                     "Reshape your data either using array.reshape(-1, 1) if "
    409                     "your data has a single feature or array.reshape(1, -1) "
--> 410                     "if it contains a single sample.".format(array))
    411             array = np.atleast_2d(array)
    412             # To ensure that array flags are maintained

ValueError: Expected 2D array, got 1D array instead:
array=[-0.271  0.057  0.147 ...,  0.057 -0.01   0.042].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [ ]:
sns.distplot(kde.sample())
plt.show()

In [10]:
from sklearn.model_selection import GridSearchCV

In [21]:
X = np.reshape(station_df['L0_S1_F24'].values,(len(station_df['L0_S1_F24'].values),1))

In [ ]:
params = {'bandwidth': [0.1]}
grid = GridSearchCV(KernelDensity(), params)
grid.fit(X)

In [ ]:
print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

In [ ]:
np.logspace(-1, 1, 20)

Time?


In [11]:
from include.feature_lists import numeric_features

In [12]:
fname = generate_station_data_fname(station_id='L0S01', sample_type='train', data_type='numeric')
features = ['Id'] + numeric_features[station_id] + ['time']

In [13]:
station_df = pd.read_csv(fname, usecols=features, index_col='Id')

In [ ]:


In [ ]:


In [14]:
station_df.sort_values(['time'], axis=0, inplace=True, kind='mergesort')
station_df['row'] = range(len(station_df))

In [15]:
station_df[station_df['L0_S1_F24'].isnull()]


Out[15]:
L0_S1_F24 L0_S1_F28 time row
Id
2037603 NaN -0.408 572.21 218256
2224683 NaN -0.408 1574.48 614509

In [70]:
station_df.iloc[x-10:x+10]


Out[70]:
L0_S1_F24 L0_S1_F28 time row
Id
1638489 -0.007 -0.034 572.21 218246
1640762 0.011 0.012 572.21 218247
1647144 0.021 -0.064 572.21 218248
1700624 0.013 -0.036 572.21 218249
1704087 0.021 0.023 572.21 218250
1705605 -0.010 -0.031 572.21 218251
1708060 -0.007 0.005 572.21 218252
1717073 0.021 -0.087 572.21 218253
1723331 0.013 -0.041 572.21 218254
1779193 0.024 0.003 572.21 218255
2037603 NaN -0.408 572.21 218256
1138173 0.006 -0.051 572.22 218257
1149615 0.003 -0.068 572.22 218258
1151570 0.011 -0.036 572.22 218259
1195801 0.013 -0.020 572.22 218260
1210992 -0.012 -0.079 572.22 218261
1214708 -0.012 -0.013 572.22 218262
1228831 0.006 -0.041 572.22 218263
1292662 0.006 0.020 572.22 218264
1296460 0.029 -0.005 572.22 218265

In [16]:
idx = 218256
X = station_df.iloc[idx-100:idx+100]['time']
y = station_df.iloc[idx-100:idx+100]['L0_S1_F24']
plt.scatter(X, y, marker='o')
plt.axvline(x=station_df.iloc[idx]['time'], color='r')
plt.show()



In [17]:
idx = 614509
X = station_df.iloc[idx-100:idx+100]['time']
y = station_df.iloc[idx-100:idx+100]['L0_S1_F24']
plt.scatter(X, y, marker='o')
plt.axvline(x=station_df.iloc[idx]['time'], color='r')
plt.show()



In [18]:
idx = 614509
time = station_df.iloc[idx]['time']
X = station_df[station_df['time']==time]

In [19]:
X = X[X['L0_S1_F24'].notnull()]
X


Out[19]:
L0_S1_F24 L0_S1_F28 time row
Id
206429 -0.128 0.078 1574.48 614492
446133 -0.097 0.243 1574.48 614493
588606 -0.128 0.203 1574.48 614494
599906 -0.081 0.003 1574.48 614495
655151 -0.102 0.178 1574.48 614496
658278 -0.097 0.211 1574.48 614497
664971 -0.112 -0.038 1574.48 614498
672238 -0.092 0.095 1574.48 614499
748657 -0.112 0.076 1574.48 614500
750935 -0.092 -0.027 1574.48 614501
812008 -0.135 -0.037 1574.48 614502
884114 -0.087 -0.030 1574.48 614503
890709 -0.102 -0.040 1574.48 614504
895169 -0.092 -0.071 1574.48 614505
1568653 -0.117 0.034 1574.48 614506
1651955 -0.092 0.080 1574.48 614507
2212278 -0.076 -0.005 1574.48 614508

In [141]:
plt.hist(X['L0_S1_F24'],50)
plt.show()



In [20]:
s=station_df.groupby(['time']).mean()
s['L0_S1_F24'].head()


Out[20]:
time
0.01   -0.199286
0.02   -0.209474
0.06   -0.218667
0.07   -0.201769
0.08   -0.201111
Name: L0_S1_F24, dtype: float64

In [21]:
data = station_df.groupby(['time']).mean()
data = data['L0_S1_F24']

In [22]:
data2 = station_df.groupby(['time']).median()
data2 = data2['L0_S1_F24']

In [36]:
data


Out[36]:
time
0.01      -0.199286
0.02      -0.209474
0.06      -0.218667
0.07      -0.201769
0.08      -0.201111
0.09      -0.215400
0.10      -0.207000
0.23      -0.199250
0.24      -0.206500
0.25      -0.209077
0.26      -0.198067
0.27      -0.204294
0.28      -0.204250
0.29      -0.203600
0.30      -0.201857
0.31      -0.210500
0.32      -0.204867
0.38      -0.194000
0.39      -0.207000
0.40      -0.206067
0.41      -0.206083
0.42      -0.208231
0.43      -0.205938
0.44      -0.201471
0.45      -0.176381
0.46      -0.203706
0.47      -0.207900
0.48      -0.200333
0.49      -0.207154
0.50      -0.210400
             ...   
1711.11    0.006429
1711.12   -0.000500
1711.13    0.000545
1711.16   -0.004200
1711.17   -0.001636
1711.18    0.000867
1711.19   -0.000118
1711.20   -0.004296
1711.21    0.006429
1711.22   -0.001222
1711.25   -0.001667
1711.26    0.002522
1711.27   -0.002400
1711.28   -0.005000
1711.29    0.001316
1711.30    0.006176
1711.31    0.000667
1711.50   -0.007000
1711.51   -0.007400
1711.52    0.002111
1713.59   -0.004261
1713.60   -0.000118
1713.61    0.004600
1713.62   -0.004250
1713.66    0.000200
1713.67   -0.008700
1713.68   -0.006769
1713.69    0.003929
1713.70   -0.004857
1713.71    0.012000
Name: L0_S1_F24, Length: 39654, dtype: float64

In [31]:
plt.plot(data, color='b')
plt.plot(data2, color='g')
# ['L0_S2_F24']
plt.axvline(x=station_df.iloc[218256]['time'], color='r')
plt.axvline(x=station_df.iloc[614509]['time'], color='r')
# plt.xlim(250,500)
# plt.ylim(-0.1,0)

plt.show()



In [38]:
%%time
from collections import Counter
mode = []
for i, f in station_df.groupby(['time']):
    c = Counter(f['L0_S1_F24'])
    mode.append(c.most_common(1)[0][0])
# print mode


Wall time: 8.07 s

In [39]:
len(data), len(data2), len(mode),


Out[39]:
(39654, 39654, 39654)

In [40]:
print data.values[0:5]
print data2.values[0:5]
print mode[0:5]


[-0.19928571 -0.20947368 -0.21866667 -0.20176923 -0.20111111]
[-0.197  -0.21   -0.2175 -0.199  -0.204 ]
[-0.215, -0.22, -0.20399999999999999, -0.192, -0.20699999999999999]

In [55]:
data3 = pd.DataFrame(data)

In [57]:
data3['L0_S1_F24'] = mode

In [58]:
data3


Out[58]:
L0_S1_F24
time
0.01 -0.215
0.02 -0.220
0.06 -0.204
0.07 -0.192
0.08 -0.207
0.09 -0.204
0.10 -0.194
0.23 -0.222
0.24 -0.210
0.25 -0.210
0.26 -0.199
0.27 -0.197
0.28 -0.184
0.29 -0.204
0.30 -0.204
0.31 -0.192
0.32 -0.212
0.38 -0.204
0.39 -0.192
0.40 -0.210
0.41 -0.204
0.42 -0.220
0.43 -0.220
0.44 -0.204
0.45 -0.184
0.46 -0.197
0.47 -0.215
0.48 -0.217
0.49 -0.228
0.50 -0.230
... ...
1711.11 0.011
1711.12 -0.012
1711.13 0.006
1711.16 -0.004
1711.17 0.006
1711.18 -0.007
1711.19 -0.004
1711.20 -0.004
1711.21 0.031
1711.22 0.001
1711.25 0.006
1711.26 0.001
1711.27 0.016
1711.28 -0.017
1711.29 0.019
1711.30 0.006
1711.31 0.024
1711.50 -0.007
1711.51 -0.015
1711.52 0.021
1713.59 0.011
1713.60 0.006
1713.61 -0.002
1713.62 0.001
1713.66 -0.022
1713.67 -0.035
1713.68 -0.010
1713.69 0.008
1713.70 -0.012
1713.71 0.016

39654 rows × 1 columns


In [60]:
plt.subplot(2,1,1)
plt.plot(data, color='b')
plt.axvline(x=station_df.iloc[218256]['time'], color='r')
plt.axvline(x=station_df.iloc[614509]['time'], color='r')
# plt.plot(data2.values, color='g')
plt.subplot(2,1,2)
plt.plot(data3, color='y')

plt.axvline(x=station_df.iloc[218256]['time'], color='r')
plt.axvline(x=station_df.iloc[614509]['time'], color='r')
plt.show()



In [ ]: