notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:

    
import sys
sys.path.append('d:/Kaggle_ws/Bosch/src')



In [3]:

    
from include.feature_lists import numeric_missing_features
from include.dataset_fnames import generate_station_data_fname



In [4]:

    
station_id = 'L0S01'
numeric_missing_features[station_id]









    Out[4]:





['L0_S1_F24']



In [5]:

    
fname = generate_station_data_fname(station_id=station_id, sample_type='train', data_type='numeric')
features = ['Id'] + numeric_missing_features[station_id]



In [6]:

    
station_df = pd.read_csv(fname, usecols=features, index_col='Id')



In [7]:

    
station_df=station_df[station_df['L0_S1_F24'].notnull()]



In [8]:

    
sns.distplot(station_df['L0_S1_F24'])
plt.show()



In [10]:

    
from sklearn.neighbors.kde import KernelDensity

kde = KernelDensity(kernel='gaussian', bandwidth=0.001).fit(station_df['L0_S1_F24'])









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-10-e50757f4679d> in <module>()
      1 from sklearn.neighbors.kde import KernelDensity
      2 
----> 3 kde = KernelDensity(kernel='gaussian', bandwidth=0.001).fit(station_df['L0_S1_F24'])

d:\Anaconda\envs\Deep2\lib\site-packages\sklearn\neighbors\kde.pyc in fit(self, X, y)
    123         """
    124         algorithm = self._choose_algorithm(self.algorithm, self.metric)
--> 125         X = check_array(X, order='C', dtype=DTYPE)
    126 
    127         kwargs = self.metric_params

d:\Anaconda\envs\Deep2\lib\site-packages\sklearn\utils\validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    408                     "Reshape your data either using array.reshape(-1, 1) if "
    409                     "your data has a single feature or array.reshape(1, -1) "
--> 410                     "if it contains a single sample.".format(array))
    411             array = np.atleast_2d(array)
    412             # To ensure that array flags are maintained

ValueError: Expected 2D array, got 1D array instead:
array=[-0.271  0.057  0.147 ...,  0.057 -0.01   0.042].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.



In [ ]:

    
sns.distplot(kde.sample())
plt.show()



In [10]:

    
from sklearn.model_selection import GridSearchCV



In [21]:

    
X = np.reshape(station_df['L0_S1_F24'].values,(len(station_df['L0_S1_F24'].values),1))



In [ ]:

    
params = {'bandwidth': [0.1]}
grid = GridSearchCV(KernelDensity(), params)
grid.fit(X)



In [ ]:

    
print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))



In [ ]:

    
np.logspace(-1, 1, 20)

Time?



In [11]:

    
from include.feature_lists import numeric_features



In [12]:

    
fname = generate_station_data_fname(station_id='L0S01', sample_type='train', data_type='numeric')
features = ['Id'] + numeric_features[station_id] + ['time']



In [13]:

    
station_df = pd.read_csv(fname, usecols=features, index_col='Id')



In [ ]:



In [ ]:



In [14]:

    
station_df.sort_values(['time'], axis=0, inplace=True, kind='mergesort')
station_df['row'] = range(len(station_df))



In [15]:

    
station_df[station_df['L0_S1_F24'].isnull()]



In [70]:

    
station_df.iloc[x-10:x+10]



In [16]:

    
idx = 218256
X = station_df.iloc[idx-100:idx+100]['time']
y = station_df.iloc[idx-100:idx+100]['L0_S1_F24']
plt.scatter(X, y, marker='o')
plt.axvline(x=station_df.iloc[idx]['time'], color='r')
plt.show()



In [17]:

    
idx = 614509
X = station_df.iloc[idx-100:idx+100]['time']
y = station_df.iloc[idx-100:idx+100]['L0_S1_F24']
plt.scatter(X, y, marker='o')
plt.axvline(x=station_df.iloc[idx]['time'], color='r')
plt.show()



In [18]:

    
idx = 614509
time = station_df.iloc[idx]['time']
X = station_df[station_df['time']==time]



In [19]:

    
X = X[X['L0_S1_F24'].notnull()]
X



In [141]:

    
plt.hist(X['L0_S1_F24'],50)
plt.show()



In [20]:

    
s=station_df.groupby(['time']).mean()
s['L0_S1_F24'].head()









    Out[20]:





time
0.01   -0.199286
0.02   -0.209474
0.06   -0.218667
0.07   -0.201769
0.08   -0.201111
Name: L0_S1_F24, dtype: float64



In [21]:

    
data = station_df.groupby(['time']).mean()
data = data['L0_S1_F24']



In [22]:

    
data2 = station_df.groupby(['time']).median()
data2 = data2['L0_S1_F24']



In [36]:

    
data









    Out[36]:





time
0.01      -0.199286
0.02      -0.209474
0.06      -0.218667
0.07      -0.201769
0.08      -0.201111
0.09      -0.215400
0.10      -0.207000
0.23      -0.199250
0.24      -0.206500
0.25      -0.209077
0.26      -0.198067
0.27      -0.204294
0.28      -0.204250
0.29      -0.203600
0.30      -0.201857
0.31      -0.210500
0.32      -0.204867
0.38      -0.194000
0.39      -0.207000
0.40      -0.206067
0.41      -0.206083
0.42      -0.208231
0.43      -0.205938
0.44      -0.201471
0.45      -0.176381
0.46      -0.203706
0.47      -0.207900
0.48      -0.200333
0.49      -0.207154
0.50      -0.210400
             ...   
1711.11    0.006429
1711.12   -0.000500
1711.13    0.000545
1711.16   -0.004200
1711.17   -0.001636
1711.18    0.000867
1711.19   -0.000118
1711.20   -0.004296
1711.21    0.006429
1711.22   -0.001222
1711.25   -0.001667
1711.26    0.002522
1711.27   -0.002400
1711.28   -0.005000
1711.29    0.001316
1711.30    0.006176
1711.31    0.000667
1711.50   -0.007000
1711.51   -0.007400
1711.52    0.002111
1713.59   -0.004261
1713.60   -0.000118
1713.61    0.004600
1713.62   -0.004250
1713.66    0.000200
1713.67   -0.008700
1713.68   -0.006769
1713.69    0.003929
1713.70   -0.004857
1713.71    0.012000
Name: L0_S1_F24, Length: 39654, dtype: float64



In [31]:

    
plt.plot(data, color='b')
plt.plot(data2, color='g')
# ['L0_S2_F24']
plt.axvline(x=station_df.iloc[218256]['time'], color='r')
plt.axvline(x=station_df.iloc[614509]['time'], color='r')
# plt.xlim(250,500)
# plt.ylim(-0.1,0)

plt.show()



In [38]:

    
%%time
from collections import Counter
mode = []
for i, f in station_df.groupby(['time']):
    c = Counter(f['L0_S1_F24'])
    mode.append(c.most_common(1)[0][0])
# print mode









    



Wall time: 8.07 s



In [39]:

    
len(data), len(data2), len(mode),









    Out[39]:





(39654, 39654, 39654)



In [40]:

    
print data.values[0:5]
print data2.values[0:5]
print mode[0:5]









    



[-0.19928571 -0.20947368 -0.21866667 -0.20176923 -0.20111111]
[-0.197  -0.21   -0.2175 -0.199  -0.204 ]
[-0.215, -0.22, -0.20399999999999999, -0.192, -0.20699999999999999]



In [55]:

    
data3 = pd.DataFrame(data)



In [57]:

    
data3['L0_S1_F24'] = mode



In [58]:

    
data3









    Out[58]:







  
    
      
      L0_S1_F24
    
    
      time
      
    
  
  
    
      0.01
      -0.215
    
    
      0.02
      -0.220
    
    
      0.06
      -0.204
    
    
      0.07
      -0.192
    
    
      0.08
      -0.207
    
    
      0.09
      -0.204
    
    
      0.10
      -0.194
    
    
      0.23
      -0.222
    
    
      0.24
      -0.210
    
    
      0.25
      -0.210
    
    
      0.26
      -0.199
    
    
      0.27
      -0.197
    
    
      0.28
      -0.184
    
    
      0.29
      -0.204
    
    
      0.30
      -0.204
    
    
      0.31
      -0.192
    
    
      0.32
      -0.212
    
    
      0.38
      -0.204
    
    
      0.39
      -0.192
    
    
      0.40
      -0.210
    
    
      0.41
      -0.204
    
    
      0.42
      -0.220
    
    
      0.43
      -0.220
    
    
      0.44
      -0.204
    
    
      0.45
      -0.184
    
    
      0.46
      -0.197
    
    
      0.47
      -0.215
    
    
      0.48
      -0.217
    
    
      0.49
      -0.228
    
    
      0.50
      -0.230
    
    
      ...
      ...
    
    
      1711.11
      0.011
    
    
      1711.12
      -0.012
    
    
      1711.13
      0.006
    
    
      1711.16
      -0.004
    
    
      1711.17
      0.006
    
    
      1711.18
      -0.007
    
    
      1711.19
      -0.004
    
    
      1711.20
      -0.004
    
    
      1711.21
      0.031
    
    
      1711.22
      0.001
    
    
      1711.25
      0.006
    
    
      1711.26
      0.001
    
    
      1711.27
      0.016
    
    
      1711.28
      -0.017
    
    
      1711.29
      0.019
    
    
      1711.30
      0.006
    
    
      1711.31
      0.024
    
    
      1711.50
      -0.007
    
    
      1711.51
      -0.015
    
    
      1711.52
      0.021
    
    
      1713.59
      0.011
    
    
      1713.60
      0.006
    
    
      1713.61
      -0.002
    
    
      1713.62
      0.001
    
    
      1713.66
      -0.022
    
    
      1713.67
      -0.035
    
    
      1713.68
      -0.010
    
    
      1713.69
      0.008
    
    
      1713.70
      -0.012
    
    
      1713.71
      0.016
    
  

39654 rows × 1 columns



In [60]:

    
plt.subplot(2,1,1)
plt.plot(data, color='b')
plt.axvline(x=station_df.iloc[218256]['time'], color='r')
plt.axvline(x=station_df.iloc[614509]['time'], color='r')
# plt.plot(data2.values, color='g')
plt.subplot(2,1,2)
plt.plot(data3, color='y')

plt.axvline(x=station_df.iloc[218256]['time'], color='r')
plt.axvline(x=station_df.iloc[614509]['time'], color='r')
plt.show()



In [ ]:

	L0_S1_F24	L0_S1_F28	time	row
Id
2037603	NaN	-0.408	572.21	218256
2224683	NaN	-0.408	1574.48	614509

	L0_S1_F24	L0_S1_F28	time	row
Id
1638489	-0.007	-0.034	572.21	218246
1640762	0.011	0.012	572.21	218247
1647144	0.021	-0.064	572.21	218248
1700624	0.013	-0.036	572.21	218249
1704087	0.021	0.023	572.21	218250
1705605	-0.010	-0.031	572.21	218251
1708060	-0.007	0.005	572.21	218252
1717073	0.021	-0.087	572.21	218253
1723331	0.013	-0.041	572.21	218254
1779193	0.024	0.003	572.21	218255
2037603	NaN	-0.408	572.21	218256
1138173	0.006	-0.051	572.22	218257
1149615	0.003	-0.068	572.22	218258
1151570	0.011	-0.036	572.22	218259
1195801	0.013	-0.020	572.22	218260
1210992	-0.012	-0.079	572.22	218261
1214708	-0.012	-0.013	572.22	218262
1228831	0.006	-0.041	572.22	218263
1292662	0.006	0.020	572.22	218264
1296460	0.029	-0.005	572.22	218265

	L0_S1_F24	L0_S1_F28	time	row
Id
206429	-0.128	0.078	1574.48	614492
446133	-0.097	0.243	1574.48	614493
588606	-0.128	0.203	1574.48	614494
599906	-0.081	0.003	1574.48	614495
655151	-0.102	0.178	1574.48	614496
658278	-0.097	0.211	1574.48	614497
664971	-0.112	-0.038	1574.48	614498
672238	-0.092	0.095	1574.48	614499
748657	-0.112	0.076	1574.48	614500
750935	-0.092	-0.027	1574.48	614501
812008	-0.135	-0.037	1574.48	614502
884114	-0.087	-0.030	1574.48	614503
890709	-0.102	-0.040	1574.48	614504
895169	-0.092	-0.071	1574.48	614505
1568653	-0.117	0.034	1574.48	614506
1651955	-0.092	0.080	1574.48	614507
2212278	-0.076	-0.005	1574.48	614508

	L0_S1_F24
time
0.01	-0.215
0.02	-0.220
0.06	-0.204
0.07	-0.192
0.08	-0.207
0.09	-0.204
0.10	-0.194
0.23	-0.222
0.24	-0.210
0.25	-0.210
0.26	-0.199
0.27	-0.197
0.28	-0.184
0.29	-0.204
0.30	-0.204
0.31	-0.192
0.32	-0.212
0.38	-0.204
0.39	-0.192
0.40	-0.210
0.41	-0.204
0.42	-0.220
0.43	-0.220
0.44	-0.204
0.45	-0.184
0.46	-0.197
0.47	-0.215
0.48	-0.217
0.49	-0.228
0.50	-0.230
...	...
1711.11	0.011
1711.12	-0.012
1711.13	0.006
1711.16	-0.004
1711.17	0.006
1711.18	-0.007
1711.19	-0.004
1711.20	-0.004
1711.21	0.031
1711.22	0.001
1711.25	0.006
1711.26	0.001
1711.27	0.016
1711.28	-0.017
1711.29	0.019
1711.30	0.006
1711.31	0.024
1711.50	-0.007
1711.51	-0.015
1711.52	0.021
1713.59	0.011
1713.60	0.006
1713.61	-0.002
1713.62	0.001
1713.66	-0.022
1713.67	-0.035
1713.68	-0.010
1713.69	0.008
1713.70	-0.012
1713.71	0.016