In [1]:
import numpy as np
import pandas as pd
In [2]:
from os import listdir
from os.path import isfile, join
# Directory that holds the per-station training CSVs (machine-specific absolute path).
mypath = "D:/Kaggle_ws/Bosch/src/stations/train"
# Keep regular files only; sub-directories are skipped.
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# Substring test instead of `find(...) > 0`: find() returns 0 when the name
# *starts* with the marker, so the old comparison silently dropped such files.
only_notnull_date = [f for f in onlyfiles if "notnull_date" in f]
In [3]:
# Print the schema / non-null counts of every station's date file, in name order.
# NOTE: each file is read in full; the recorded run was interrupted by hand
# (KeyboardInterrupt) while reading L3S29_notnull_date.csv.
for f in sorted(only_notnull_date):
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(f)
    date_df = pd.read_csv(join(mypath, f))
    date_df.info()
L0S00_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 673862 entries, 0 to 673861
Data columns (total 13 columns):
Id 673862 non-null int64
L0_S0_D1 673862 non-null float64
L0_S0_D3 673862 non-null float64
L0_S0_D5 673862 non-null float64
L0_S0_D7 673862 non-null float64
L0_S0_D9 673862 non-null float64
L0_S0_D11 673862 non-null float64
L0_S0_D13 673862 non-null float64
L0_S0_D15 673862 non-null float64
L0_S0_D17 673862 non-null float64
L0_S0_D19 673862 non-null float64
L0_S0_D21 673862 non-null float64
L0_S0_D23 673862 non-null float64
dtypes: float64(12), int64(1)
memory usage: 66.8 MB
L0S01_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 673904 entries, 0 to 673903
Data columns (total 3 columns):
Id 673904 non-null int64
L0_S1_D26 673902 non-null float64
L0_S1_D30 673904 non-null float64
dtypes: float64(2), int64(1)
memory usage: 15.4 MB
L0S02_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339774 entries, 0 to 339773
Data columns (total 10 columns):
Id 339774 non-null int64
L0_S2_D34 339774 non-null float64
L0_S2_D38 339774 non-null float64
L0_S2_D42 339774 non-null float64
L0_S2_D46 339774 non-null float64
L0_S2_D50 339774 non-null float64
L0_S2_D54 339774 non-null float64
L0_S2_D58 339774 non-null float64
L0_S2_D62 339774 non-null float64
L0_S2_D66 339774 non-null float64
dtypes: float64(9), int64(1)
memory usage: 25.9 MB
L0S03_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334708 entries, 0 to 334707
Data columns (total 10 columns):
Id 334708 non-null int64
L0_S3_D70 334708 non-null float64
L0_S3_D74 334708 non-null float64
L0_S3_D78 334708 non-null float64
L0_S3_D82 334708 non-null float64
L0_S3_D86 334708 non-null float64
L0_S3_D90 334708 non-null float64
L0_S3_D94 334708 non-null float64
L0_S3_D98 334708 non-null float64
L0_S3_D102 334708 non-null float64
dtypes: float64(9), int64(1)
memory usage: 25.5 MB
L0S04_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335295 entries, 0 to 335294
Data columns (total 3 columns):
Id 335295 non-null int64
L0_S4_D106 335295 non-null float64
L0_S4_D111 335243 non-null float64
dtypes: float64(2), int64(1)
memory usage: 7.7 MB
L0S05_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339512 entries, 0 to 339511
Data columns (total 3 columns):
Id 339512 non-null int64
L0_S5_D115 339512 non-null float64
L0_S5_D117 339512 non-null float64
dtypes: float64(2), int64(1)
memory usage: 7.8 MB
L0S06_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338988 entries, 0 to 338987
Data columns (total 6 columns):
Id 338988 non-null int64
L0_S6_D120 338988 non-null float64
L0_S6_D124 338988 non-null float64
L0_S6_D127 338988 non-null float64
L0_S6_D130 338988 non-null float64
L0_S6_D134 338988 non-null float64
dtypes: float64(5), int64(1)
memory usage: 15.5 MB
L0S07_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335698 entries, 0 to 335697
Data columns (total 6 columns):
Id 335698 non-null int64
L0_S7_D137 335698 non-null float64
L0_S7_D139 335698 non-null float64
L0_S7_D140 335698 non-null float64
L0_S7_D141 335698 non-null float64
L0_S7_D143 335698 non-null float64
dtypes: float64(5), int64(1)
memory usage: 15.4 MB
L0S08_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 673881 entries, 0 to 673880
Data columns (total 5 columns):
Id 673881 non-null int64
L0_S8_D145 673881 non-null float64
L0_S8_D147 673881 non-null float64
L0_S8_D148 673881 non-null float64
L0_S8_D150 673881 non-null float64
dtypes: float64(4), int64(1)
memory usage: 25.7 MB
L0S09_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225679 entries, 0 to 225678
Data columns (total 14 columns):
Id 225679 non-null int64
L0_S9_D152 225679 non-null float64
L0_S9_D157 225659 non-null float64
L0_S9_D162 225678 non-null float64
L0_S9_D167 225671 non-null float64
L0_S9_D172 225678 non-null float64
L0_S9_D177 225659 non-null float64
L0_S9_D182 225659 non-null float64
L0_S9_D187 225659 non-null float64
L0_S9_D192 225659 non-null float64
L0_S9_D197 225659 non-null float64
L0_S9_D202 225659 non-null float64
L0_S9_D207 225659 non-null float64
L0_S9_D212 225659 non-null float64
dtypes: float64(13), int64(1)
memory usage: 24.1 MB
L0S10_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224540 entries, 0 to 224539
Data columns (total 14 columns):
Id 224540 non-null int64
L0_S10_D216 224540 non-null float64
L0_S10_D221 224523 non-null float64
L0_S10_D226 224540 non-null float64
L0_S10_D231 224527 non-null float64
L0_S10_D236 224540 non-null float64
L0_S10_D241 224523 non-null float64
L0_S10_D246 224523 non-null float64
L0_S10_D251 224523 non-null float64
L0_S10_D256 224523 non-null float64
L0_S10_D261 224523 non-null float64
L0_S10_D266 224523 non-null float64
L0_S10_D271 224523 non-null float64
L0_S10_D276 224523 non-null float64
dtypes: float64(13), int64(1)
memory usage: 24.0 MB
L0S11_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225452 entries, 0 to 225451
Data columns (total 14 columns):
Id 225452 non-null int64
L0_S11_D280 225452 non-null float64
L0_S11_D284 225452 non-null float64
L0_S11_D288 225452 non-null float64
L0_S11_D292 225452 non-null float64
L0_S11_D296 225452 non-null float64
L0_S11_D300 225452 non-null float64
L0_S11_D304 225452 non-null float64
L0_S11_D308 225452 non-null float64
L0_S11_D312 225452 non-null float64
L0_S11_D316 225452 non-null float64
L0_S11_D320 225452 non-null float64
L0_S11_D324 225452 non-null float64
L0_S11_D328 225452 non-null float64
dtypes: float64(13), int64(1)
memory usage: 24.1 MB
L0S12_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242061 entries, 0 to 242060
Data columns (total 13 columns):
Id 242061 non-null int64
L0_S12_D331 242061 non-null float64
L0_S12_D333 242061 non-null float64
L0_S12_D335 242061 non-null float64
L0_S12_D337 242061 non-null float64
L0_S12_D339 242061 non-null float64
L0_S12_D341 242061 non-null float64
L0_S12_D343 242061 non-null float64
L0_S12_D345 242061 non-null float64
L0_S12_D347 242061 non-null float64
L0_S12_D349 242061 non-null float64
L0_S12_D351 242061 non-null float64
L0_S12_D353 242061 non-null float64
dtypes: float64(12), int64(1)
memory usage: 24.0 MB
L0S13_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242065 entries, 0 to 242064
Data columns (total 3 columns):
Id 242065 non-null int64
L0_S13_D355 242065 non-null float64
L0_S13_D357 242065 non-null float64
dtypes: float64(2), int64(1)
memory usage: 5.5 MB
L0S14_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120625 entries, 0 to 120624
Data columns (total 10 columns):
Id 120625 non-null int64
L0_S14_D360 120625 non-null float64
L0_S14_D364 120625 non-null float64
L0_S14_D368 120625 non-null float64
L0_S14_D372 120625 non-null float64
L0_S14_D376 120625 non-null float64
L0_S14_D380 120625 non-null float64
L0_S14_D384 120625 non-null float64
L0_S14_D388 120625 non-null float64
L0_S14_D392 120625 non-null float64
dtypes: float64(9), int64(1)
memory usage: 9.2 MB
L0S15_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121445 entries, 0 to 121444
Data columns (total 10 columns):
Id 121445 non-null int64
L0_S15_D395 121445 non-null float64
L0_S15_D398 121445 non-null float64
L0_S15_D401 121445 non-null float64
L0_S15_D404 121445 non-null float64
L0_S15_D407 121445 non-null float64
L0_S15_D410 121445 non-null float64
L0_S15_D413 121445 non-null float64
L0_S15_D416 121445 non-null float64
L0_S15_D419 121445 non-null float64
dtypes: float64(9), int64(1)
memory usage: 9.3 MB
L0S16_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119139 entries, 0 to 119138
Data columns (total 3 columns):
Id 119139 non-null int64
L0_S16_D423 119139 non-null float64
L0_S16_D428 119139 non-null float64
dtypes: float64(2), int64(1)
memory usage: 2.7 MB
L0S17_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123027 entries, 0 to 123026
Data columns (total 3 columns):
Id 123027 non-null int64
L0_S17_D432 123027 non-null float64
L0_S17_D434 123027 non-null float64
dtypes: float64(2), int64(1)
memory usage: 2.8 MB
L0S18_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121081 entries, 0 to 121080
Data columns (total 6 columns):
Id 121081 non-null int64
L0_S18_D437 121081 non-null float64
L0_S18_D441 121081 non-null float64
L0_S18_D444 121081 non-null float64
L0_S18_D447 121081 non-null float64
L0_S18_D451 121081 non-null float64
dtypes: float64(5), int64(1)
memory usage: 5.5 MB
L0S19_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121027 entries, 0 to 121026
Data columns (total 6 columns):
Id 121027 non-null int64
L0_S19_D454 121027 non-null float64
L0_S19_D456 121027 non-null float64
L0_S19_D457 121027 non-null float64
L0_S19_D458 121027 non-null float64
L0_S19_D460 121027 non-null float64
dtypes: float64(5), int64(1)
memory usage: 5.5 MB
L0S20_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242111 entries, 0 to 242110
Data columns (total 5 columns):
Id 242111 non-null int64
L0_S20_D462 242111 non-null float64
L0_S20_D464 242111 non-null float64
L0_S20_D465 242111 non-null float64
L0_S20_D467 242111 non-null float64
dtypes: float64(4), int64(1)
memory usage: 9.2 MB
L0S21_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81409 entries, 0 to 81408
Data columns (total 16 columns):
Id 81409 non-null int64
L0_S21_D469 81409 non-null float64
L0_S21_D474 81368 non-null float64
L0_S21_D479 81409 non-null float64
L0_S21_D484 81373 non-null float64
L0_S21_D489 81409 non-null float64
L0_S21_D494 81368 non-null float64
L0_S21_D499 81368 non-null float64
L0_S21_D504 81368 non-null float64
L0_S21_D509 81368 non-null float64
L0_S21_D514 81368 non-null float64
L0_S21_D519 81368 non-null float64
L0_S21_D524 81368 non-null float64
L0_S21_D529 81368 non-null float64
L0_S21_D534 81368 non-null float64
L0_S21_D539 81368 non-null float64
dtypes: float64(15), int64(1)
memory usage: 9.9 MB
L0S22_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80601 entries, 0 to 80600
Data columns (total 16 columns):
Id 80601 non-null int64
L0_S22_D543 80600 non-null float64
L0_S22_D548 80591 non-null float64
L0_S22_D553 80599 non-null float64
L0_S22_D558 80595 non-null float64
L0_S22_D563 80599 non-null float64
L0_S22_D568 80591 non-null float64
L0_S22_D573 80591 non-null float64
L0_S22_D578 80591 non-null float64
L0_S22_D583 80591 non-null float64
L0_S22_D588 80591 non-null float64
L0_S22_D593 80591 non-null float64
L0_S22_D598 80591 non-null float64
L0_S22_D603 80591 non-null float64
L0_S22_D608 80591 non-null float64
L0_S22_D613 80591 non-null float64
dtypes: float64(15), int64(1)
memory usage: 9.8 MB
L0S23_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80290 entries, 0 to 80289
Data columns (total 16 columns):
Id 80290 non-null int64
L0_S23_D617 80290 non-null float64
L0_S23_D621 80290 non-null float64
L0_S23_D625 80290 non-null float64
L0_S23_D629 80290 non-null float64
L0_S23_D633 80290 non-null float64
L0_S23_D637 80290 non-null float64
L0_S23_D641 80290 non-null float64
L0_S23_D645 80290 non-null float64
L0_S23_D649 80290 non-null float64
L0_S23_D653 80290 non-null float64
L0_S23_D657 80290 non-null float64
L0_S23_D661 80290 non-null float64
L0_S23_D665 80290 non-null float64
L0_S23_D669 80290 non-null float64
L0_S23_D673 80290 non-null float64
dtypes: float64(15), int64(1)
memory usage: 9.8 MB
L1S24_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183727 entries, 0 to 183726
Columns: 289 entries, Id to L1_S24_D1851
dtypes: float64(288), int64(1)
memory usage: 405.1 MB
L1S25_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83658 entries, 0 to 83657
Columns: 334 entries, Id to L1_S25_D3035
dtypes: float64(333), int64(1)
memory usage: 213.2 MB
L2S26_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227011 entries, 0 to 227010
Data columns (total 27 columns):
Id 227011 non-null int64
L2_S26_D3037 227011 non-null float64
L2_S26_D3041 227011 non-null float64
L2_S26_D3044 227011 non-null float64
L2_S26_D3048 227011 non-null float64
L2_S26_D3052 227011 non-null float64
L2_S26_D3056 227011 non-null float64
L2_S26_D3059 227011 non-null float64
L2_S26_D3063 227011 non-null float64
L2_S26_D3066 227011 non-null float64
L2_S26_D3070 227011 non-null float64
L2_S26_D3074 227011 non-null float64
L2_S26_D3078 227011 non-null float64
L2_S26_D3081 227010 non-null float64
L2_S26_D3084 227010 non-null float64
L2_S26_D3087 227010 non-null float64
L2_S26_D3090 227010 non-null float64
L2_S26_D3093 227010 non-null float64
L2_S26_D3096 227010 non-null float64
L2_S26_D3100 227011 non-null float64
L2_S26_D3103 227011 non-null float64
L2_S26_D3107 227011 non-null float64
L2_S26_D3110 227011 non-null float64
L2_S26_D3114 227011 non-null float64
L2_S26_D3118 227011 non-null float64
L2_S26_D3122 227011 non-null float64
L2_S26_D3126 227011 non-null float64
dtypes: float64(26), int64(1)
memory usage: 46.8 MB
L2S27_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120729 entries, 0 to 120728
Data columns (total 27 columns):
Id 120729 non-null int64
L2_S27_D3130 120729 non-null float64
L2_S27_D3134 120729 non-null float64
L2_S27_D3137 120729 non-null float64
L2_S27_D3141 120729 non-null float64
L2_S27_D3145 120729 non-null float64
L2_S27_D3149 120729 non-null float64
L2_S27_D3152 120729 non-null float64
L2_S27_D3156 120729 non-null float64
L2_S27_D3159 120729 non-null float64
L2_S27_D3163 120729 non-null float64
L2_S27_D3167 120729 non-null float64
L2_S27_D3171 120729 non-null float64
L2_S27_D3174 120729 non-null float64
L2_S27_D3177 120729 non-null float64
L2_S27_D3180 120729 non-null float64
L2_S27_D3183 120729 non-null float64
L2_S27_D3186 120729 non-null float64
L2_S27_D3189 120729 non-null float64
L2_S27_D3193 120729 non-null float64
L2_S27_D3196 120729 non-null float64
L2_S27_D3200 120729 non-null float64
L2_S27_D3203 120729 non-null float64
L2_S27_D3207 120729 non-null float64
L2_S27_D3211 120729 non-null float64
L2_S27_D3215 120729 non-null float64
L2_S27_D3219 120729 non-null float64
dtypes: float64(26), int64(1)
memory usage: 24.9 MB
L2S28_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9583 entries, 0 to 9582
Data columns (total 27 columns):
Id 9583 non-null int64
L2_S28_D3223 9583 non-null float64
L2_S28_D3227 9583 non-null float64
L2_S28_D3230 9583 non-null float64
L2_S28_D3234 9583 non-null float64
L2_S28_D3238 9583 non-null float64
L2_S28_D3242 9583 non-null float64
L2_S28_D3245 9583 non-null float64
L2_S28_D3249 9583 non-null float64
L2_S28_D3252 9583 non-null float64
L2_S28_D3256 9583 non-null float64
L2_S28_D3260 9583 non-null float64
L2_S28_D3264 9583 non-null float64
L2_S28_D3267 9583 non-null float64
L2_S28_D3270 9583 non-null float64
L2_S28_D3273 9583 non-null float64
L2_S28_D3276 9583 non-null float64
L2_S28_D3279 9583 non-null float64
L2_S28_D3282 9583 non-null float64
L2_S28_D3286 9583 non-null float64
L2_S28_D3289 9583 non-null float64
L2_S28_D3293 9583 non-null float64
L2_S28_D3296 9583 non-null float64
L2_S28_D3300 9583 non-null float64
L2_S28_D3304 9583 non-null float64
L2_S28_D3308 9583 non-null float64
L2_S28_D3312 9583 non-null float64
dtypes: float64(26), int64(1)
memory usage: 2.0 MB
L3S29_notnull_date.csv
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-3-b1a0a79366be> in <module>()
1 for f in sorted(only_notnull_date):
2 print f
----> 3 date_df = pd.read_csv(join(mypath, f))
4 date_df.info()
d:\Anaconda\envs\Deep2\lib\site-packages\pandas\io\parsers.pyc in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
653 skip_blank_lines=skip_blank_lines)
654
--> 655 return _read(filepath_or_buffer, kwds)
656
657 parser_f.__name__ = name
d:\Anaconda\envs\Deep2\lib\site-packages\pandas\io\parsers.pyc in _read(filepath_or_buffer, kwds)
409
410 try:
--> 411 data = parser.read(nrows)
412 finally:
413 parser.close()
d:\Anaconda\envs\Deep2\lib\site-packages\pandas\io\parsers.pyc in read(self, nrows)
980 raise ValueError('skipfooter not supported for iteration')
981
--> 982 ret = self._engine.read(nrows)
983
984 if self.options.get('as_recarray'):
d:\Anaconda\envs\Deep2\lib\site-packages\pandas\io\parsers.pyc in read(self, nrows)
1717 def read(self, nrows=None):
1718 try:
-> 1719 data = self._reader.read(nrows)
1720 except StopIteration:
1721 if self._first_chunk:
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.read (pandas\_libs\parsers.c:10862)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_low_memory (pandas\_libs\parsers.c:11138)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_rows (pandas\_libs\parsers.c:12175)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_column_data (pandas\_libs\parsers.c:14136)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_tokens (pandas\_libs\parsers.c:14858)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_with_dtype (pandas\_libs\parsers.c:15629)()
d:\Anaconda\envs\Deep2\lib\site-packages\pandas\core\dtypes\common.pyc in is_integer_dtype(arr_or_dtype)
735
736
--> 737 def is_integer_dtype(arr_or_dtype):
738 """
739 Check whether the provided array or dtype is of an integer dtype.
KeyboardInterrupt:
In [3]:
# Load the L0S01 date features, using the Id column as the row index.
l0s01_df = pd.read_csv(join(mypath,'L0S01_notnull_date.csv'), index_col=['Id'])
In [6]:
last_column_name = l0s01_df.columns[-1]
In [7]:
last_column_name
Out[7]:
'L0_S1_D30'
In [8]:
new_df = pd.DataFrame(columns=['Station', 'Time'], index=l0s01_df.index)
In [9]:
# Tag every row with the station name, and store the station's last date
# column (last_column_name) under 'Time'; assignment aligns on the Id index.
new_df['Station'] = 'L0S01'
new_df['Time'] = l0s01_df[last_column_name]
In [63]:
new_df.head()
Out[63]:
Station
Time
Id
4
L0S01
82.24
7
L0S01
1618.70
9
L0S01
1149.20
11
L0S01
602.64
13
L0S01
1331.66
In [10]:
new_df.sort_values(['Time']).head()
Out[10]:
Station
Time
Id
596800
L0S01
0.01
108193
L0S01
0.01
587799
L0S01
0.01
669517
L0S01
0.01
737264
L0S01
0.01
In [11]:
# Show the rows that are missing a value in each of the two date columns.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(l0s01_df[l0s01_df['L0_S1_D26'].isnull()])
print(l0s01_df[l0s01_df['L0_S1_D30'].isnull()])
L0_S1_D26 L0_S1_D30
Id
2037603 NaN 572.21
2224683 NaN 1574.48
Empty DataFrame
Columns: [L0_S1_D26, L0_S1_D30]
Index: []
In [12]:
# Row-mean imputation via a double transpose: after .T each Id is a column, so
# fillna() with the Id-indexed row means fills every gap with its own row's mean.
# The recorded run of this cell took 1min 39s of wall time.
%time l0s01_df_copy = l0s01_df.T.fillna(l0s01_df.mean(axis=1)).T
Wall time: 1min 39s
In [13]:
# Verify the imputed copy: both selections should come back empty.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(l0s01_df_copy[l0s01_df_copy['L0_S1_D26'].isnull()])
print(l0s01_df_copy[l0s01_df_copy['L0_S1_D30'].isnull()])
Empty DataFrame
Columns: [L0_S1_D26, L0_S1_D30]
Index: []
Empty DataFrame
Columns: [L0_S1_D26, L0_S1_D30]
Index: []
In [14]:
row_means = l0s01_df.mean(axis=1)
In [18]:
# Fill each column's gaps with the per-row means computed beforehand (row_means).
# This updates l0s01_df in place.
for i in range(l0s01_df.shape[1]):
    # Positional indexing keeps this correct even if column labels repeat.
    # Assigning the filled column back is preferred over fillna(inplace=True)
    # on the slice, which may not always write through to the frame.
    l0s01_df.iloc[:, i] = l0s01_df.iloc[:, i].fillna(row_means)
In [19]:
# Re-check after the column-wise fill: both selections should come back empty.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(l0s01_df[l0s01_df['L0_S1_D26'].isnull()])
print(l0s01_df[l0s01_df['L0_S1_D30'].isnull()])
Empty DataFrame
Columns: [L0_S1_D26, L0_S1_D30]
Index: []
Empty DataFrame
Columns: [L0_S1_D26, L0_S1_D30]
Index: []
In [ ]:
In [27]:
l0s04_df = pd.read_csv(join(mypath,'L0S04_notnull_date.csv'), index_col=['Id'])
In [26]:
# Preview the rows whose L0_S4_D111 value is missing.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(l0s04_df[l0s04_df['L0_S4_D111'].isnull()].head())
Empty DataFrame
Columns: [L0_S4_D106, L0_S4_D111]
Index: []
In [28]:
def fillna_for_date_features(date_df):
    """Fill every missing value with the mean of the other values in its row.

    Parameters
    ----------
    date_df : pandas.DataFrame
        Frame whose columns are numeric date features.

    Returns
    -------
    pandas.DataFrame
        The same ``date_df`` object. It is modified in place and returned so
        the call can be used in an assignment.

    Notes
    -----
    Row means are computed once, before any value is filled, so every gap in
    a row receives the same value. A row with no observed value has a NaN
    mean and therefore stays NaN.
    """
    row_means = date_df.mean(axis=1)
    # Iterate by position rather than by label so duplicate column names
    # are each handled exactly once.
    for i in range(date_df.shape[1]):
        date_df.iloc[:, i] = date_df.iloc[:, i].fillna(row_means)
    return date_df
In [31]:
l0s04_df = fillna_for_date_features(l0s04_df)
Wall time: 18 ms
In [30]:
# After filling, no row should still be missing L0_S4_D111.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(l0s04_df[l0s04_df['L0_S4_D111'].isnull()].head())
Empty DataFrame
Columns: [L0_S4_D106, L0_S4_D111]
Index: []
In [ ]:
Content source: zakkum42/Bosch
Similar notebooks: