In [1]:
import numpy as np
import pandas as pd

In [2]:
from os import listdir
from os.path import isfile, join

# Directory that is scanned for the station "notnull_date" CSV files.
mypath = "D:/Kaggle_ws/Bosch/src/stations/train"
# Keep regular files only; sub-directories are skipped.
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# Substring membership instead of `find(...) > 0`: find() returns 0 when the
# name *starts* with "notnull_date", so the old test silently dropped such files.
only_notnull_date = [f for f in onlyfiles if "notnull_date" in f]

In [3]:
# Print each selected file name followed by the DataFrame.info() summary of
# its contents. Every file is read completely; the recorded run was
# interrupted by hand while reading L3S29.
for f in sorted(only_notnull_date):
    # Single-argument print(): same output on Python 2, and valid on Python 3.
    print(f)
    date_df = pd.read_csv(join(mypath, f))
    date_df.info()


L0S00_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 673862 entries, 0 to 673861
Data columns (total 13 columns):
Id           673862 non-null int64
L0_S0_D1     673862 non-null float64
L0_S0_D3     673862 non-null float64
L0_S0_D5     673862 non-null float64
L0_S0_D7     673862 non-null float64
L0_S0_D9     673862 non-null float64
L0_S0_D11    673862 non-null float64
L0_S0_D13    673862 non-null float64
L0_S0_D15    673862 non-null float64
L0_S0_D17    673862 non-null float64
L0_S0_D19    673862 non-null float64
L0_S0_D21    673862 non-null float64
L0_S0_D23    673862 non-null float64
dtypes: float64(12), int64(1)
memory usage: 66.8 MB
L0S01_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 673904 entries, 0 to 673903
Data columns (total 3 columns):
Id           673904 non-null int64
L0_S1_D26    673902 non-null float64
L0_S1_D30    673904 non-null float64
dtypes: float64(2), int64(1)
memory usage: 15.4 MB
L0S02_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339774 entries, 0 to 339773
Data columns (total 10 columns):
Id           339774 non-null int64
L0_S2_D34    339774 non-null float64
L0_S2_D38    339774 non-null float64
L0_S2_D42    339774 non-null float64
L0_S2_D46    339774 non-null float64
L0_S2_D50    339774 non-null float64
L0_S2_D54    339774 non-null float64
L0_S2_D58    339774 non-null float64
L0_S2_D62    339774 non-null float64
L0_S2_D66    339774 non-null float64
dtypes: float64(9), int64(1)
memory usage: 25.9 MB
L0S03_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334708 entries, 0 to 334707
Data columns (total 10 columns):
Id            334708 non-null int64
L0_S3_D70     334708 non-null float64
L0_S3_D74     334708 non-null float64
L0_S3_D78     334708 non-null float64
L0_S3_D82     334708 non-null float64
L0_S3_D86     334708 non-null float64
L0_S3_D90     334708 non-null float64
L0_S3_D94     334708 non-null float64
L0_S3_D98     334708 non-null float64
L0_S3_D102    334708 non-null float64
dtypes: float64(9), int64(1)
memory usage: 25.5 MB
L0S04_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335295 entries, 0 to 335294
Data columns (total 3 columns):
Id            335295 non-null int64
L0_S4_D106    335295 non-null float64
L0_S4_D111    335243 non-null float64
dtypes: float64(2), int64(1)
memory usage: 7.7 MB
L0S05_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339512 entries, 0 to 339511
Data columns (total 3 columns):
Id            339512 non-null int64
L0_S5_D115    339512 non-null float64
L0_S5_D117    339512 non-null float64
dtypes: float64(2), int64(1)
memory usage: 7.8 MB
L0S06_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338988 entries, 0 to 338987
Data columns (total 6 columns):
Id            338988 non-null int64
L0_S6_D120    338988 non-null float64
L0_S6_D124    338988 non-null float64
L0_S6_D127    338988 non-null float64
L0_S6_D130    338988 non-null float64
L0_S6_D134    338988 non-null float64
dtypes: float64(5), int64(1)
memory usage: 15.5 MB
L0S07_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335698 entries, 0 to 335697
Data columns (total 6 columns):
Id            335698 non-null int64
L0_S7_D137    335698 non-null float64
L0_S7_D139    335698 non-null float64
L0_S7_D140    335698 non-null float64
L0_S7_D141    335698 non-null float64
L0_S7_D143    335698 non-null float64
dtypes: float64(5), int64(1)
memory usage: 15.4 MB
L0S08_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 673881 entries, 0 to 673880
Data columns (total 5 columns):
Id            673881 non-null int64
L0_S8_D145    673881 non-null float64
L0_S8_D147    673881 non-null float64
L0_S8_D148    673881 non-null float64
L0_S8_D150    673881 non-null float64
dtypes: float64(4), int64(1)
memory usage: 25.7 MB
L0S09_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225679 entries, 0 to 225678
Data columns (total 14 columns):
Id            225679 non-null int64
L0_S9_D152    225679 non-null float64
L0_S9_D157    225659 non-null float64
L0_S9_D162    225678 non-null float64
L0_S9_D167    225671 non-null float64
L0_S9_D172    225678 non-null float64
L0_S9_D177    225659 non-null float64
L0_S9_D182    225659 non-null float64
L0_S9_D187    225659 non-null float64
L0_S9_D192    225659 non-null float64
L0_S9_D197    225659 non-null float64
L0_S9_D202    225659 non-null float64
L0_S9_D207    225659 non-null float64
L0_S9_D212    225659 non-null float64
dtypes: float64(13), int64(1)
memory usage: 24.1 MB
L0S10_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224540 entries, 0 to 224539
Data columns (total 14 columns):
Id             224540 non-null int64
L0_S10_D216    224540 non-null float64
L0_S10_D221    224523 non-null float64
L0_S10_D226    224540 non-null float64
L0_S10_D231    224527 non-null float64
L0_S10_D236    224540 non-null float64
L0_S10_D241    224523 non-null float64
L0_S10_D246    224523 non-null float64
L0_S10_D251    224523 non-null float64
L0_S10_D256    224523 non-null float64
L0_S10_D261    224523 non-null float64
L0_S10_D266    224523 non-null float64
L0_S10_D271    224523 non-null float64
L0_S10_D276    224523 non-null float64
dtypes: float64(13), int64(1)
memory usage: 24.0 MB
L0S11_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225452 entries, 0 to 225451
Data columns (total 14 columns):
Id             225452 non-null int64
L0_S11_D280    225452 non-null float64
L0_S11_D284    225452 non-null float64
L0_S11_D288    225452 non-null float64
L0_S11_D292    225452 non-null float64
L0_S11_D296    225452 non-null float64
L0_S11_D300    225452 non-null float64
L0_S11_D304    225452 non-null float64
L0_S11_D308    225452 non-null float64
L0_S11_D312    225452 non-null float64
L0_S11_D316    225452 non-null float64
L0_S11_D320    225452 non-null float64
L0_S11_D324    225452 non-null float64
L0_S11_D328    225452 non-null float64
dtypes: float64(13), int64(1)
memory usage: 24.1 MB
L0S12_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242061 entries, 0 to 242060
Data columns (total 13 columns):
Id             242061 non-null int64
L0_S12_D331    242061 non-null float64
L0_S12_D333    242061 non-null float64
L0_S12_D335    242061 non-null float64
L0_S12_D337    242061 non-null float64
L0_S12_D339    242061 non-null float64
L0_S12_D341    242061 non-null float64
L0_S12_D343    242061 non-null float64
L0_S12_D345    242061 non-null float64
L0_S12_D347    242061 non-null float64
L0_S12_D349    242061 non-null float64
L0_S12_D351    242061 non-null float64
L0_S12_D353    242061 non-null float64
dtypes: float64(12), int64(1)
memory usage: 24.0 MB
L0S13_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242065 entries, 0 to 242064
Data columns (total 3 columns):
Id             242065 non-null int64
L0_S13_D355    242065 non-null float64
L0_S13_D357    242065 non-null float64
dtypes: float64(2), int64(1)
memory usage: 5.5 MB
L0S14_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120625 entries, 0 to 120624
Data columns (total 10 columns):
Id             120625 non-null int64
L0_S14_D360    120625 non-null float64
L0_S14_D364    120625 non-null float64
L0_S14_D368    120625 non-null float64
L0_S14_D372    120625 non-null float64
L0_S14_D376    120625 non-null float64
L0_S14_D380    120625 non-null float64
L0_S14_D384    120625 non-null float64
L0_S14_D388    120625 non-null float64
L0_S14_D392    120625 non-null float64
dtypes: float64(9), int64(1)
memory usage: 9.2 MB
L0S15_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121445 entries, 0 to 121444
Data columns (total 10 columns):
Id             121445 non-null int64
L0_S15_D395    121445 non-null float64
L0_S15_D398    121445 non-null float64
L0_S15_D401    121445 non-null float64
L0_S15_D404    121445 non-null float64
L0_S15_D407    121445 non-null float64
L0_S15_D410    121445 non-null float64
L0_S15_D413    121445 non-null float64
L0_S15_D416    121445 non-null float64
L0_S15_D419    121445 non-null float64
dtypes: float64(9), int64(1)
memory usage: 9.3 MB
L0S16_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119139 entries, 0 to 119138
Data columns (total 3 columns):
Id             119139 non-null int64
L0_S16_D423    119139 non-null float64
L0_S16_D428    119139 non-null float64
dtypes: float64(2), int64(1)
memory usage: 2.7 MB
L0S17_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123027 entries, 0 to 123026
Data columns (total 3 columns):
Id             123027 non-null int64
L0_S17_D432    123027 non-null float64
L0_S17_D434    123027 non-null float64
dtypes: float64(2), int64(1)
memory usage: 2.8 MB
L0S18_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121081 entries, 0 to 121080
Data columns (total 6 columns):
Id             121081 non-null int64
L0_S18_D437    121081 non-null float64
L0_S18_D441    121081 non-null float64
L0_S18_D444    121081 non-null float64
L0_S18_D447    121081 non-null float64
L0_S18_D451    121081 non-null float64
dtypes: float64(5), int64(1)
memory usage: 5.5 MB
L0S19_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121027 entries, 0 to 121026
Data columns (total 6 columns):
Id             121027 non-null int64
L0_S19_D454    121027 non-null float64
L0_S19_D456    121027 non-null float64
L0_S19_D457    121027 non-null float64
L0_S19_D458    121027 non-null float64
L0_S19_D460    121027 non-null float64
dtypes: float64(5), int64(1)
memory usage: 5.5 MB
L0S20_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242111 entries, 0 to 242110
Data columns (total 5 columns):
Id             242111 non-null int64
L0_S20_D462    242111 non-null float64
L0_S20_D464    242111 non-null float64
L0_S20_D465    242111 non-null float64
L0_S20_D467    242111 non-null float64
dtypes: float64(4), int64(1)
memory usage: 9.2 MB
L0S21_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81409 entries, 0 to 81408
Data columns (total 16 columns):
Id             81409 non-null int64
L0_S21_D469    81409 non-null float64
L0_S21_D474    81368 non-null float64
L0_S21_D479    81409 non-null float64
L0_S21_D484    81373 non-null float64
L0_S21_D489    81409 non-null float64
L0_S21_D494    81368 non-null float64
L0_S21_D499    81368 non-null float64
L0_S21_D504    81368 non-null float64
L0_S21_D509    81368 non-null float64
L0_S21_D514    81368 non-null float64
L0_S21_D519    81368 non-null float64
L0_S21_D524    81368 non-null float64
L0_S21_D529    81368 non-null float64
L0_S21_D534    81368 non-null float64
L0_S21_D539    81368 non-null float64
dtypes: float64(15), int64(1)
memory usage: 9.9 MB
L0S22_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80601 entries, 0 to 80600
Data columns (total 16 columns):
Id             80601 non-null int64
L0_S22_D543    80600 non-null float64
L0_S22_D548    80591 non-null float64
L0_S22_D553    80599 non-null float64
L0_S22_D558    80595 non-null float64
L0_S22_D563    80599 non-null float64
L0_S22_D568    80591 non-null float64
L0_S22_D573    80591 non-null float64
L0_S22_D578    80591 non-null float64
L0_S22_D583    80591 non-null float64
L0_S22_D588    80591 non-null float64
L0_S22_D593    80591 non-null float64
L0_S22_D598    80591 non-null float64
L0_S22_D603    80591 non-null float64
L0_S22_D608    80591 non-null float64
L0_S22_D613    80591 non-null float64
dtypes: float64(15), int64(1)
memory usage: 9.8 MB
L0S23_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80290 entries, 0 to 80289
Data columns (total 16 columns):
Id             80290 non-null int64
L0_S23_D617    80290 non-null float64
L0_S23_D621    80290 non-null float64
L0_S23_D625    80290 non-null float64
L0_S23_D629    80290 non-null float64
L0_S23_D633    80290 non-null float64
L0_S23_D637    80290 non-null float64
L0_S23_D641    80290 non-null float64
L0_S23_D645    80290 non-null float64
L0_S23_D649    80290 non-null float64
L0_S23_D653    80290 non-null float64
L0_S23_D657    80290 non-null float64
L0_S23_D661    80290 non-null float64
L0_S23_D665    80290 non-null float64
L0_S23_D669    80290 non-null float64
L0_S23_D673    80290 non-null float64
dtypes: float64(15), int64(1)
memory usage: 9.8 MB
L1S24_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183727 entries, 0 to 183726
Columns: 289 entries, Id to L1_S24_D1851
dtypes: float64(288), int64(1)
memory usage: 405.1 MB
L1S25_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83658 entries, 0 to 83657
Columns: 334 entries, Id to L1_S25_D3035
dtypes: float64(333), int64(1)
memory usage: 213.2 MB
L2S26_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227011 entries, 0 to 227010
Data columns (total 27 columns):
Id              227011 non-null int64
L2_S26_D3037    227011 non-null float64
L2_S26_D3041    227011 non-null float64
L2_S26_D3044    227011 non-null float64
L2_S26_D3048    227011 non-null float64
L2_S26_D3052    227011 non-null float64
L2_S26_D3056    227011 non-null float64
L2_S26_D3059    227011 non-null float64
L2_S26_D3063    227011 non-null float64
L2_S26_D3066    227011 non-null float64
L2_S26_D3070    227011 non-null float64
L2_S26_D3074    227011 non-null float64
L2_S26_D3078    227011 non-null float64
L2_S26_D3081    227010 non-null float64
L2_S26_D3084    227010 non-null float64
L2_S26_D3087    227010 non-null float64
L2_S26_D3090    227010 non-null float64
L2_S26_D3093    227010 non-null float64
L2_S26_D3096    227010 non-null float64
L2_S26_D3100    227011 non-null float64
L2_S26_D3103    227011 non-null float64
L2_S26_D3107    227011 non-null float64
L2_S26_D3110    227011 non-null float64
L2_S26_D3114    227011 non-null float64
L2_S26_D3118    227011 non-null float64
L2_S26_D3122    227011 non-null float64
L2_S26_D3126    227011 non-null float64
dtypes: float64(26), int64(1)
memory usage: 46.8 MB
L2S27_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120729 entries, 0 to 120728
Data columns (total 27 columns):
Id              120729 non-null int64
L2_S27_D3130    120729 non-null float64
L2_S27_D3134    120729 non-null float64
L2_S27_D3137    120729 non-null float64
L2_S27_D3141    120729 non-null float64
L2_S27_D3145    120729 non-null float64
L2_S27_D3149    120729 non-null float64
L2_S27_D3152    120729 non-null float64
L2_S27_D3156    120729 non-null float64
L2_S27_D3159    120729 non-null float64
L2_S27_D3163    120729 non-null float64
L2_S27_D3167    120729 non-null float64
L2_S27_D3171    120729 non-null float64
L2_S27_D3174    120729 non-null float64
L2_S27_D3177    120729 non-null float64
L2_S27_D3180    120729 non-null float64
L2_S27_D3183    120729 non-null float64
L2_S27_D3186    120729 non-null float64
L2_S27_D3189    120729 non-null float64
L2_S27_D3193    120729 non-null float64
L2_S27_D3196    120729 non-null float64
L2_S27_D3200    120729 non-null float64
L2_S27_D3203    120729 non-null float64
L2_S27_D3207    120729 non-null float64
L2_S27_D3211    120729 non-null float64
L2_S27_D3215    120729 non-null float64
L2_S27_D3219    120729 non-null float64
dtypes: float64(26), int64(1)
memory usage: 24.9 MB
L2S28_notnull_date.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9583 entries, 0 to 9582
Data columns (total 27 columns):
Id              9583 non-null int64
L2_S28_D3223    9583 non-null float64
L2_S28_D3227    9583 non-null float64
L2_S28_D3230    9583 non-null float64
L2_S28_D3234    9583 non-null float64
L2_S28_D3238    9583 non-null float64
L2_S28_D3242    9583 non-null float64
L2_S28_D3245    9583 non-null float64
L2_S28_D3249    9583 non-null float64
L2_S28_D3252    9583 non-null float64
L2_S28_D3256    9583 non-null float64
L2_S28_D3260    9583 non-null float64
L2_S28_D3264    9583 non-null float64
L2_S28_D3267    9583 non-null float64
L2_S28_D3270    9583 non-null float64
L2_S28_D3273    9583 non-null float64
L2_S28_D3276    9583 non-null float64
L2_S28_D3279    9583 non-null float64
L2_S28_D3282    9583 non-null float64
L2_S28_D3286    9583 non-null float64
L2_S28_D3289    9583 non-null float64
L2_S28_D3293    9583 non-null float64
L2_S28_D3296    9583 non-null float64
L2_S28_D3300    9583 non-null float64
L2_S28_D3304    9583 non-null float64
L2_S28_D3308    9583 non-null float64
L2_S28_D3312    9583 non-null float64
dtypes: float64(26), int64(1)
memory usage: 2.0 MB
L3S29_notnull_date.csv
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-3-b1a0a79366be> in <module>()
      1 for f in sorted(only_notnull_date):
      2     print f
----> 3     date_df = pd.read_csv(join(mypath, f))
      4     date_df.info()

d:\Anaconda\envs\Deep2\lib\site-packages\pandas\io\parsers.pyc in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    653                     skip_blank_lines=skip_blank_lines)
    654 
--> 655         return _read(filepath_or_buffer, kwds)
    656 
    657     parser_f.__name__ = name

d:\Anaconda\envs\Deep2\lib\site-packages\pandas\io\parsers.pyc in _read(filepath_or_buffer, kwds)
    409 
    410     try:
--> 411         data = parser.read(nrows)
    412     finally:
    413         parser.close()

d:\Anaconda\envs\Deep2\lib\site-packages\pandas\io\parsers.pyc in read(self, nrows)
    980                 raise ValueError('skipfooter not supported for iteration')
    981 
--> 982         ret = self._engine.read(nrows)
    983 
    984         if self.options.get('as_recarray'):

d:\Anaconda\envs\Deep2\lib\site-packages\pandas\io\parsers.pyc in read(self, nrows)
   1717     def read(self, nrows=None):
   1718         try:
-> 1719             data = self._reader.read(nrows)
   1720         except StopIteration:
   1721             if self._first_chunk:

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.read (pandas\_libs\parsers.c:10862)()

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_low_memory (pandas\_libs\parsers.c:11138)()

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_rows (pandas\_libs\parsers.c:12175)()

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_column_data (pandas\_libs\parsers.c:14136)()

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_tokens (pandas\_libs\parsers.c:14858)()

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_with_dtype (pandas\_libs\parsers.c:15629)()

d:\Anaconda\envs\Deep2\lib\site-packages\pandas\core\dtypes\common.pyc in is_integer_dtype(arr_or_dtype)
    735 
    736 
--> 737 def is_integer_dtype(arr_or_dtype):
    738     """
    739     Check whether the provided array or dtype is of an integer dtype.

KeyboardInterrupt: 

In [3]:
# Load the L0S01 date features, using the `Id` column as the row index.
l0s01_df = pd.read_csv(join(mypath,'L0S01_notnull_date.csv'), index_col=['Id'])

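Item-Station-Time table: one row per item `Id`, holding the station name and the value of that station's last date column as `Time`.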


In [6]:
# Label of the right-most column of the L0S01 frame; it is the source of `Time` below.
last_column_name = l0s01_df.columns[-1]

In [7]:
# Display the selected column label.
last_column_name


Out[7]:
'L0_S1_D30'

In [8]:
# Empty Station/Time frame that shares the `Id` index of the L0S01 data.
new_df = pd.DataFrame(columns=['Station', 'Time'], index=l0s01_df.index)

In [9]:
# Every row gets the same constant station label.
new_df['Station'] = 'L0S01'
# Time is the value of the last L0S01 column, matched on the shared `Id` index.
new_df['Time'] = l0s01_df[last_column_name]

In [63]:
# Preview the first rows of the Station/Time table.
new_df.head()


Out[63]:
Station Time
Id
4 L0S01 82.24
7 L0S01 1618.70
9 L0S01 1149.20
11 L0S01 602.64
13 L0S01 1331.66

In [10]:
# Same table ordered by ascending Time; show the rows with the smallest values.
new_df.sort_values(['Time']).head()


Out[10]:
Station Time
Id
596800 L0S01 0.01
108193 L0S01 0.01
587799 L0S01 0.01
669517 L0S01 0.01
737264 L0S01 0.01

Dealing with null values


In [11]:
# Show the rows in which each L0S01 column is null.
# print() with a single parenthesised argument gives the same output on
# Python 2 and also runs on Python 3 (the bare print statement does not).
print(l0s01_df[l0s01_df['L0_S1_D26'].isnull()])
print(l0s01_df[l0s01_df['L0_S1_D30'].isnull()])


         L0_S1_D26  L0_S1_D30
Id                           
2037603        NaN     572.21
2224683        NaN    1574.48
Empty DataFrame
Columns: [L0_S1_D26, L0_S1_D30]
Index: []

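Transpose method (slow): transpose so that each row becomes a column, fill the NaNs with the row means, then transpose back.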


In [12]:
# Row-mean fill via transposition: after `.T` every `Id` is a column, so
# `fillna` with the Series of row means (keyed by `Id`) fills each original
# row with its own mean; the second `.T` restores the original orientation.
# `%time` reports the wall time (1min 39s in the recorded run).
%time l0s01_df_copy = l0s01_df.T.fillna(l0s01_df.mean(axis=1)).T


Wall time: 1min 39s

In [13]:
# Verify the transposed fill: list any rows of the filled copy that are still null.
# print() with a single parenthesised argument gives the same output on
# Python 2 and also runs on Python 3 (the bare print statement does not).
print(l0s01_df_copy[l0s01_df_copy['L0_S1_D26'].isnull()])
print(l0s01_df_copy[l0s01_df_copy['L0_S1_D30'].isnull()])


Empty DataFrame
Columns: [L0_S1_D26, L0_S1_D30]
Index: []
Empty DataFrame
Columns: [L0_S1_D26, L0_S1_D30]
Index: []

Explicit row-mean assignment: compute the row means once, then fill each column's NaNs from them.


In [14]:
# Mean of each row across the L0S01 columns, indexed by `Id`
# (pandas skips NaNs in `mean` by default).
row_means = l0s01_df.mean(axis=1)

In [18]:
# Walk the columns by position (so duplicate column labels are handled) and
# replace each one with a version whose NaNs are taken from `row_means`,
# matched on the shared `Id` index. Assigning the filled column back is
# preferred over `fillna(..., inplace=True)` on the `.iloc` selection, which
# may not always write through to the frame.
for position in range(len(l0s01_df.columns)):
    column = l0s01_df.iloc[:, position]
    l0s01_df.iloc[:, position] = column.fillna(row_means)

In [19]:
# Verify the column-wise fill: list any L0S01 rows that are still null.
# print() with a single parenthesised argument gives the same output on
# Python 2 and also runs on Python 3 (the bare print statement does not).
print(l0s01_df[l0s01_df['L0_S1_D26'].isnull()])
print(l0s01_df[l0s01_df['L0_S1_D30'].isnull()])


Empty DataFrame
Columns: [L0_S1_D26, L0_S1_D30]
Index: []
Empty DataFrame
Columns: [L0_S1_D26, L0_S1_D30]
Index: []

In [ ]:


In [27]:
# Load the L0S04 date features, using the `Id` column as the row index.
l0s04_df = pd.read_csv(join(mypath,'L0S04_notnull_date.csv'), index_col=['Id'])

In [26]:
# Show the first rows in which L0_S4_D111 is null.
# print() with a single parenthesised argument gives the same output on
# Python 2 and also runs on Python 3 (the bare print statement does not).
print(l0s04_df[l0s04_df['L0_S4_D111'].isnull()].head())


Empty DataFrame
Columns: [L0_S4_D106, L0_S4_D111]
Index: []

In [28]:
def fillna_for_date_features(date_df, copy=False):
    """Replace every NaN with the mean of the non-null values in its own row.

    Parameters
    ----------
    date_df : pandas.DataFrame
        Frame whose columns can be averaged row-wise with ``mean(axis=1)``.
    copy : bool, default False
        With ``False`` (the original behaviour) ``date_df`` itself is modified
        and returned. With ``True`` the fill is done on a copy and the frame
        passed in is left untouched.

    Returns
    -------
    pandas.DataFrame
        The filled frame: ``date_df`` itself when ``copy`` is false, otherwise
        a new frame.

    Notes
    -----
    The row means are computed once, before any value is filled. A row that
    has no non-null value has a NaN mean and therefore stays NaN.
    """
    target = date_df.copy() if copy else date_df
    row_means = target.mean(axis=1)
    # Address columns by position so that duplicate column labels are handled.
    for position in range(target.shape[1]):
        column = target.iloc[:, position]
        # Columns without NaN would be rewritten unchanged; skip them.
        if column.isnull().any():
            target.iloc[:, position] = column.fillna(row_means)
    return target

In [31]:
# Fill the NaNs of the L0S04 frame with their row means. The function modifies
# the frame it is given and returns it, so this rebinds the name to the same object.
l0s04_df = fillna_for_date_features(l0s04_df)


Wall time: 18 ms

In [30]:
# Verify the fill: list any rows in which L0_S4_D111 is still null.
# print() with a single parenthesised argument gives the same output on
# Python 2 and also runs on Python 3 (the bare print statement does not).
print(l0s04_df[l0s04_df['L0_S4_D111'].isnull()].head())


Empty DataFrame
Columns: [L0_S4_D106, L0_S4_D111]
Index: []

In [ ]: