The data has many NaN values. Usually people fill them with the most common value for a categorical variable, or with the mean of some subclass for a continuous variable, but I will try to fill the missing values using machine learning, treating each column that has missing values as the prediction target.


In [1]:
#import fillna 
import sys
sys.path += ["src"]
import graphlab as gl

In [2]:
import fill_na_graphlab

In [3]:
#import dataframe to fill

In [4]:
import os
import pandas as pd
import numpy as np
# from sklearn.ensemble import RandomForestRegressor

In [5]:
#clf = RandomForestRegressor(n_estimators=100, n_jobs=3)

In [6]:
# Load the pre-processed weather table from the data directory.
weather_csv_path = os.path.join("data", "weather_modified_3.csv")
weather = pd.read_csv(weather_csv_path)

In [13]:
# Build the working frame: the original table without the date column and
# without the weather-code columns listed below.
# The axis is passed by keyword: the positional form `drop(labels, 1)` is
# rejected by pandas 2.x, while `axis=1` is accepted by every pandas version.
weather_temp = weather.drop(
    [
        "date",
        "TSSN",
        "SG",
        "PRFG",
        "GR",
        "VCFG",
        "GS",
        "SQ",
        "BLDU",
        "PL",
        "DU",
        "FU",
        "FZDZ",
        "BLSN",
        "MIFG",
        "BCFG",
        "FZRA",
    ],
    axis=1,
)

In [14]:
# Preview the first five rows of the working frame.
weather_temp.head(n=5)


Out[14]:
station_nbr tmax tmin tavg depart dewpoint wetbulb heat cool sunrise ... DZ BR FG TS RA FG+ TSRA FZFG SN days
0 1 52 31 42 NaN 36 40 23 0 NaN ... 0 1 1 0 1 0 0 1 0 0
1 2 48 33 41 16 37 39 24 0 436 ... 0 0 0 0 1 0 0 0 0 0
2 3 55 34 45 9 24 36 20 0 455 ... 0 0 0 0 0 0 0 0 0 0
3 4 63 47 55 4 28 43 10 0 448 ... 0 0 0 0 0 0 0 0 0 0
4 6 63 34 49 0 31 43 16 0 447 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 31 columns


In [15]:
# Variance of every column, displayed from smallest to largest.
# The values are reordered by position instead of calling the in-place
# Series.sort(), which current pandas releases no longer provide; indexing
# with the argsort result works on old and new pandas alike.
column_variances = weather_temp.var()
column_variances = column_variances.iloc[np.argsort(column_variances.values)]
column_variances


Out[15]:
FZFG               0.010131
UP                 0.011085
DZ                 0.023638
VCTS               0.026919
sealevel           0.036532
FG+                0.039133
SN                 0.046719
TSRA               0.049958
FG                 0.059648
HZ                 0.067018
TS                 0.083557
preciptotal        0.118162
RA                 0.192189
BR                 0.213887
snowfall           0.249671
stnpressure        1.551055
avgspeed          15.472094
resultspeed       17.577701
station_nbr       33.275974
depart            58.472925
cool              61.384033
resultdir         93.618431
heat             191.652813
wetbulb          289.604567
tmin             366.303442
tavg             368.423355
dewpoint         376.379194
tmax             393.921316
sunrise         2950.082229
sunset          3031.578059
days           10323.527239
dtype: float64

In [21]:
# Report the number of missing values per column; columns without gaps are skipped.
for column in weather_temp.columns:
    # Vectorised count of nulls instead of summing the Series element by element.
    n_missing = weather_temp[column].isnull().sum()
    if n_missing > 0:
        # %-formatting prints the same "count column" line under Python 2 and Python 3.
        print("%d %s" % (n_missing, column))


906 tmax
908 tmin
1469 tavg
11511 depart
666 dewpoint
1252 wetbulb
1469 heat
1469 cool
9656 sunrise
9656 sunset
7224 snowfall
860 preciptotal
929 stnpressure
1724 sealevel
589 resultspeed
589 resultdir
875 avgspeed

In [23]:
# Show the column names of the working frame as a plain list.
weather_temp.columns.tolist()


Out[23]:
['station_nbr',
 'tmax',
 'tmin',
 'tavg',
 'depart',
 'dewpoint',
 'wetbulb',
 'heat',
 'cool',
 'sunrise',
 'sunset',
 'snowfall',
 'preciptotal',
 'stnpressure',
 'sealevel',
 'resultspeed',
 'resultdir',
 'avgspeed',
 'HZ',
 'UP',
 'VCTS',
 'DZ',
 'BR',
 'FG',
 'TS',
 'RA',
 'FG+',
 'TSRA',
 'FZFG',
 'SN',
 'days']

In [6]:
# Columns passed to the fill routine as `features`.
# 'station_nbr' is left out on purpose (it is commented out in the original list).
features = [
    # measurement columns
    'tmax', 'tmin', 'tavg', 'depart', 'dewpoint', 'wetbulb', 'heat', 'cool',
    'sunrise', 'sunset', 'snowfall', 'preciptotal', 'stnpressure', 'sealevel',
    'resultspeed', 'resultdir', 'avgspeed',
    # weather-code columns
    'HZ', 'UP', 'VCTS', 'DZ', 'BR', 'FG', 'TS', 'RA', 'FG+', 'TSRA', 'FZFG', 'SN',
    'days',
]

In [ ]:
#Let's find columns that have only positive values

In [33]:
# Convert the pandas working frame into a GraphLab SFrame for the fill routine.
a = gl.SFrame(weather_temp)
for column in a.column_names():
    # Replace the missing entries of every column with NaN.
    # NOTE(review): presumably fill_missed_all looks for NaN rather than None
    # when locating gaps — confirm against fill_na_graphlab.
    a[column] = a[column].fillna(np.nan)

In [37]:
# Re-import the helper module so that edits to its source are picked up without
# restarting the kernel (`reload` is the Python 2 builtin).
reload(fill_na_graphlab)

# Run the fill routine on the SFrame `a` with the `features` column list.
# NOTE(review): presumably each column with gaps is predicted by a model trained
# on the other columns (the log below shows boosted-tree regressions) — confirm
# the exact behaviour in fill_na_graphlab.fill_missed_all.
weather_result = fill_na_graphlab.fill_missed_all(a, features, verbose=True)


tmax 906
tmin 908
tavg 1469
depart 11511
dewpoint 666
wetbulb 1252
heat 1469
cool 1469
snowfall 7224
preciptotal 860
stnpressure 929
sealevel 1724
resultspeed 589
resultdir 589
avgspeed 875
HZ 0
FU 0
UP 0
VCTS 0
DZ 0
BR 0
FG 0
BCFG 0
DU 0
FZRA 0
TS 0
RA 0
PL 0
GS 0
GR 0
FZDZ 0
VCFG 0
PRFG 0
FG+ 0
TSRA 0
FZFG 0
BLDU 0
MIFG 0
SQ 0
BLSN 0
SN 0
SG 0
month 0
day 0
day_length
PROGRESS: Boosted trees regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 19928
PROGRESS: Number of features          : 29
PROGRESS: Number of unpacked features : 29
PROGRESS: Starting Boosted Trees
PROGRESS: --------------------------------------------------------
PROGRESS:   Iter        RMSE Elapsed time
PROGRESS:      0   1.593e+01        0.38s
PROGRESS:      1   1.299e+01        0.57s
PROGRESS:      2   1.126e+01        0.87s
PROGRESS:      3   1.029e+01        1.17s
PROGRESS:      4   9.770e+00        1.49s
PROGRESS:      5   9.484e+00        1.76s
PROGRESS:      6   9.336e+00        2.05s
PROGRESS:      7   9.255e+00        2.28s
PROGRESS:      8   9.212e+00        2.53s
PROGRESS:      9   9.174e+00        2.78s
 9656
['HZ', 'FU', 'UP', 'VCTS', 'DZ', 'BR', 'FG', 'BCFG', 'DU', 'FZRA', 'TS', 'RA', 'PL', 'GS', 'GR', 'FZDZ', 'VCFG', 'PRFG', 'FG+', 'TSRA', 'FZFG', 'BLDU', 'MIFG', 'SQ', 'BLSN', 'SN', 'SG', 'month', 'day']
filling resultdir na = 589
PROGRESS: Boosted trees regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 19928
PROGRESS: Number of features          : 31
PROGRESS: Number of unpacked features : 31
PROGRESS: Starting Boosted Trees
PROGRESS: --------------------------------------------------------
PROGRESS:   Iter        RMSE Elapsed time
PROGRESS:      0   5.796e+00        0.38s
PROGRESS:      1   4.885e+00        0.68s
PROGRESS:      2   4.357e+00        0.96s
PROGRESS:      3   4.064e+00        1.21s
PROGRESS:      4   3.898e+00        1.53s
PROGRESS:      5   3.800e+00        1.86s
PROGRESS:      6   3.749e+00        2.13s
PROGRESS:      7   3.714e+00        2.41s
PROGRESS:      8   3.699e+00        2.70s
PROGRESS:      9   3.686e+00        3.07s
                    Model summary                       
--------------------------------------------------------
Class                         : BoostedTreesRegression

Number of examples            : 19928
Number of feature columns     : 29
Number of unpacked features   : 29

Number of trees               : 10
Max tree depth                : 6
Train RMSE                    : 9.1744
Validation RMSE               : None
Training time (sec)           : 2.8141

None
filling resultspeed na = 589

                    Model summary                       
--------------------------------------------------------
Class                         : BoostedTreesRegression

Number of examples            : 19928
Number of feature columns     : 31
Number of unpacked features   : 31

Number of trees               : 10
Max tree depth                : 6
Train RMSE                    : 3.6864
Validation RMSE               : None
Training time (sec)           : 3.1361

None
filling dewpoint na = 666
PROGRESS: Boosted trees regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 19851
PROGRESS: Number of features          : 33
PROGRESS: Number of unpacked features : 33
PROGRESS: Starting Boosted Trees
PROGRESS: --------------------------------------------------------
PROGRESS:   Iter        RMSE Elapsed time
PROGRESS:      0   3.640e+01        0.58s
PROGRESS:      1   2.674e+01        1.10s
PROGRESS:      2   2.032e+01        1.60s
PROGRESS:      3   1.622e+01        2.10s
PROGRESS:      4   1.363e+01        2.53s
PROGRESS:      5   1.213e+01        2.90s
PROGRESS:      6   1.126e+01        3.23s
PROGRESS:      7   1.077e+01        3.55s
PROGRESS:      8   1.047e+01        3.80s
PROGRESS:      9   1.027e+01        4.11s

PROGRESS: Boosted trees regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 19657
PROGRESS: Number of features          : 35
PROGRESS: Number of unpacked features : 35
PROGRESS: Starting Boosted Trees
PROGRESS: --------------------------------------------------------
PROGRESS:   Iter        RMSE Elapsed time
PROGRESS:      0   4.121e-01        0.40s
PROGRESS:      1   3.405e-01        0.73s
PROGRESS:      2   2.974e-01        1.05s
PROGRESS:      3   2.728e-01        1.48s
PROGRESS:      4   2.575e-01        1.90s
PROGRESS:      5   2.479e-01        2.12s
PROGRESS:      6   2.428e-01        2.38s
PROGRESS:      7   2.378e-01        2.75s
PROGRESS:      8   2.346e-01        2.99s
PROGRESS:      9   2.323e-01        3.25s
                    Model summary                       
--------------------------------------------------------
Class                         : BoostedTreesRegression

Number of examples            : 19851
Number of feature columns     : 33
Number of unpacked features   : 33

Number of trees               : 10
Max tree depth                : 6
Train RMSE                    : 10.269
Validation RMSE               : None
Training time (sec)           : 4.1757

None
filling preciptotal na = 860
PROGRESS: Boosted trees regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 19642
PROGRESS: Number of features          : 37
PROGRESS: Number of unpacked features : 37
PROGRESS: Starting Boosted Trees
PROGRESS: --------------------------------------------------------
PROGRESS:   Iter        RMSE Elapsed time
PROGRESS:      0   5.981e+00        0.57s
PROGRESS:      1   4.325e+00        0.89s
PROGRESS:      2   3.209e+00        1.29s
PROGRESS:      3   2.477e+00        1.66s
PROGRESS:      4   2.016e+00        1.94s
PROGRESS:      5   1.742e+00        2.22s
PROGRESS:      6   1.582e+00        2.50s
PROGRESS:      7   1.494e+00        2.79s
PROGRESS:      8   1.444e+00        3.10s
PROGRESS:      9   1.412e+00        3.32s
                    Model summary                       
--------------------------------------------------------
Class                         : BoostedTreesRegression

Number of examples            : 19657
Number of feature columns     : 35
Number of unpacked features   : 35

Number of trees               : 10
Max tree depth                : 6
Train RMSE                    : 0.2323
Validation RMSE               : None
Training time (sec)           : 3.2954

None
filling avgspeed na = 875
PROGRESS: Boosted trees regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 19611
PROGRESS: Number of features          : 39
PROGRESS: Number of unpacked features : 39
PROGRESS: Starting Boosted Trees
PROGRESS: --------------------------------------------------------
PROGRESS:   Iter        RMSE Elapsed time
PROGRESS:      0   5.201e+01        0.40s
PROGRESS:      1   3.682e+01        0.66s
PROGRESS:      2   2.630e+01        0.91s
PROGRESS:      3   1.909e+01        1.20s
PROGRESS:      4   1.423e+01        1.46s
PROGRESS:      5   1.103e+01        1.76s
PROGRESS:      6   9.010e+00        2.04s
PROGRESS:      7   7.780e+00        2.33s
PROGRESS:      8   7.036e+00        2.68s
PROGRESS:      9   6.610e+00        3.10s
                    Model summary                       
--------------------------------------------------------
Class                         : BoostedTreesRegression

Number of examples            : 19642
Number of feature columns     : 37
Number of unpacked features   : 37

Number of trees               : 10
Max tree depth                : 6
Train RMSE                    : 1.4118
Validation RMSE               : None
Training time (sec)           : 3.358

None
filling tmax na = 906

                    Model summary                       
--------------------------------------------------------
Class                         : BoostedTreesRegression

Number of examples            : 19611
Number of feature columns     : 39
Number of unpacked features   : 39

Number of trees               : 10
Max tree depth                : 6
Train RMSE                    : 6.6096
Validation RMSE               : None
Training time (sec)           : 3.161

None
filling tmin na = 908
PROGRESS: Boosted trees regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 19609
PROGRESS: Number of features          : 41
PROGRESS: Number of unpacked features : 41
PROGRESS: Starting Boosted Trees
PROGRESS: --------------------------------------------------------
PROGRESS:   Iter        RMSE Elapsed time
PROGRESS:      0   3.776e+01        0.57s
PROGRESS:      1   2.665e+01        1.01s
PROGRESS:      2   1.893e+01        1.41s
PROGRESS:      3   1.360e+01        1.79s
PROGRESS:      4   9.969e+00        2.23s
PROGRESS:      5   7.552e+00        2.61s
PROGRESS:      6   5.995e+00        2.96s
PROGRESS:      7   5.024e+00        3.32s
PROGRESS:      8   4.445e+00        3.62s
PROGRESS:      9   4.101e+00        3.90s

                    Model summary                       
--------------------------------------------------------
Class                         : BoostedTreesRegression

Number of examples            : 19609
Number of feature columns     : 41
Number of unpacked features   : 41

Number of trees               : 10
Max tree depth                : 6
Train RMSE                    : 4.1007
Validation RMSE               : None
Training time (sec)           : 3.944

None
filling stnpressure na = 929
PROGRESS: Boosted trees regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 19588
PROGRESS: Number of features          : 43
PROGRESS: Number of unpacked features : 43
PROGRESS: Starting Boosted Trees
PROGRESS: --------------------------------------------------------
PROGRESS:   Iter        RMSE Elapsed time
PROGRESS:      0   2.010e+01        0.25s
PROGRESS:      1   1.410e+01        0.45s
PROGRESS:      2   9.897e+00        0.68s
PROGRESS:      3   6.968e+00        1.01s
PROGRESS:      4   4.924e+00        1.20s
PROGRESS:      5   3.510e+00        1.50s
PROGRESS:      6   2.536e+00        1.85s
PROGRESS:      7   1.877e+00        2.20s
PROGRESS:      8   1.441e+00        2.44s
PROGRESS:      9   1.166e+00        2.82s

PROGRESS: Boosted trees regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 19265
PROGRESS: Number of features          : 45
PROGRESS: Number of unpacked features : 45
PROGRESS: Starting Boosted Trees
PROGRESS: --------------------------------------------------------
PROGRESS:   Iter        RMSE Elapsed time
PROGRESS:      0   3.927e+01        0.36s
PROGRESS:      1   2.753e+01        0.60s
PROGRESS:      2   1.931e+01        0.84s
PROGRESS:      3   1.355e+01        1.16s
PROGRESS:      4   9.522e+00        1.43s
PROGRESS:      5   6.708e+00        1.76s
PROGRESS:      6   4.748e+00        2.02s
PROGRESS:      7   3.391e+00        2.37s
PROGRESS:      8   2.460e+00        2.98s
PROGRESS:      9   1.833e+00        3.50s
                    Model summary                       
--------------------------------------------------------
Class                         : BoostedTreesRegression

Number of examples            : 19588
Number of feature columns     : 43
Number of unpacked features   : 43

Number of trees               : 10
Max tree depth                : 6
Train RMSE                    : 1.1659
Validation RMSE               : None
Training time (sec)           : 2.8679

None
filling wetbulb na = 1252

                    Model summary                       
--------------------------------------------------------
Class                         : BoostedTreesRegression

Number of examples            : 19265
Number of feature columns     : 45
Number of unpacked features   : 45

Number of trees               : 10
Max tree depth                : 6
Train RMSE                    : 1.8327
Validation RMSE               : None
Training time (sec)           : 3.571

None
filling cool na = 1469
PROGRESS: Boosted trees regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 19048
PROGRESS: Number of features          : 47
PROGRESS: Number of unpacked features : 47
PROGRESS: Starting Boosted Trees
PROGRESS: --------------------------------------------------------
PROGRESS:   Iter        RMSE Elapsed time
PROGRESS:      0   6.705e+00        0.58s
PROGRESS:      1   4.715e+00        0.95s
PROGRESS:      2   3.318e+00        1.33s
PROGRESS:      3   2.338e+00        1.70s
PROGRESS:      4   1.651e+00        1.96s
PROGRESS:      5   1.171e+00        2.27s
PROGRESS:      6   8.366e-01        2.68s
PROGRESS:      7   6.073e-01        3.07s
PROGRESS:      8   4.504e-01        3.45s
PROGRESS:      9   3.465e-01        3.83s

                    Model summary                       
--------------------------------------------------------
Class                         : BoostedTreesRegression

Number of examples            : 19048
Number of feature columns     : 47
Number of unpacked features   : 47

Number of trees               : 10
Max tree depth                : 6
Train RMSE                    : 0.3465
Validation RMSE               : None
Training time (sec)           : 3.8813

None
filling heat na = 1469
PROGRESS: Boosted trees regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 19048
PROGRESS: Number of features          : 49
PROGRESS: Number of unpacked features : 49
PROGRESS: Starting Boosted Trees
PROGRESS: --------------------------------------------------------
PROGRESS:   Iter        RMSE Elapsed time
PROGRESS:      0   1.170e+01        0.57s
PROGRESS:      1   8.246e+00        0.91s
PROGRESS:      2   5.823e+00        1.23s
PROGRESS:      3   4.116e+00        1.51s
PROGRESS:      4   2.922e+00        1.95s
PROGRESS:      5   2.091e+00        2.36s
PROGRESS:      6   1.517e+00        2.83s
PROGRESS:      7   1.123e+00        3.17s
PROGRESS:      8   8.519e-01        3.46s
PROGRESS:      9   6.732e-01        3.75s

PROGRESS: WARNING: Detected extremely low variance for feature(s) 'heatNAN' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
PROGRESS: Boosted trees regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 19048
PROGRESS: Number of features          : 51
PROGRESS: Number of unpacked features : 51
PROGRESS: Starting Boosted Trees
PROGRESS: --------------------------------------------------------
PROGRESS:   Iter        RMSE Elapsed time
PROGRESS:      0   4.461e+01        0.37s
PROGRESS:      1   3.125e+01        0.62s
PROGRESS:      2   2.188e+01        0.86s
PROGRESS:      3   1.533e+01        1.12s
PROGRESS:      4   1.073e+01        1.43s
PROGRESS:      5   7.519e+00        1.74s
PROGRESS:      6   5.267e+00        2.07s
PROGRESS:      7   3.690e+00        2.34s
PROGRESS:      8   2.586e+00        2.58s
PROGRESS:      9   1.813e+00        2.87s
                    Model summary                       
--------------------------------------------------------
Class                         : BoostedTreesRegression

Number of examples            : 19048
Number of feature columns     : 49
Number of unpacked features   : 49

Number of trees               : 10
Max tree depth                : 6
Train RMSE                    : 0.6732
Validation RMSE               : None
Training time (sec)           : 3.7935

None
filling tavg na = 1469

                    Model summary                       
--------------------------------------------------------
Class                         : BoostedTreesRegression

Number of examples            : 19048
Number of feature columns     : 51
Number of unpacked features   : 51

Number of trees               : 10
Max tree depth                : 6
Train RMSE                    : 1.8126
Validation RMSE               : None
Training time (sec)           : 2.9332

None
filling sealevel na = 1724
PROGRESS: Boosted trees regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 18793
PROGRESS: Number of features          : 53
PROGRESS: Number of unpacked features : 53
PROGRESS: Starting Boosted Trees
PROGRESS: --------------------------------------------------------
PROGRESS:   Iter        RMSE Elapsed time
PROGRESS:      0   2.066e+01        0.25s
PROGRESS:      1   1.446e+01        0.37s
PROGRESS:      2   1.012e+01        0.48s
PROGRESS:      3   7.088e+00        0.60s
PROGRESS:      4   4.964e+00        0.74s
PROGRESS:      5   3.477e+00        0.90s
PROGRESS:      6   2.437e+00        1.07s
PROGRESS:      7   1.709e+00        1.51s
PROGRESS:      8   1.201e+00        1.92s
PROGRESS:      9   8.468e-01        2.41s

                    Model summary                       
--------------------------------------------------------
Class                         : BoostedTreesRegression

Number of examples            : 18793
Number of feature columns     : 53
Number of unpacked features   : 53

Number of trees               : 10
Max tree depth                : 6
Train RMSE                    : 0.8468
Validation RMSE               : None
Training time (sec)           : 2.4866

None
filling snowfall na = 7224
PROGRESS: Boosted trees regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 13293
PROGRESS: Number of features          : 55
PROGRESS: Number of unpacked features : 55
PROGRESS: Starting Boosted Trees
PROGRESS: --------------------------------------------------------
PROGRESS:   Iter        RMSE Elapsed time
PROGRESS:      0   4.949e-01        0.70s
PROGRESS:      1   3.666e-01        1.37s
PROGRESS:      2   2.766e-01        1.96s
PROGRESS:      3   2.121e-01        2.43s
PROGRESS:      4   1.666e-01        2.99s
PROGRESS:      5   1.335e-01        3.43s
PROGRESS:      6   1.098e-01        3.86s
PROGRESS:      7   9.371e-02        4.19s
PROGRESS:      8   8.113e-02        4.50s
PROGRESS:      9   7.239e-02        4.93s

                    Model summary                       
--------------------------------------------------------
Class                         : BoostedTreesRegression

Number of examples            : 13293
Number of feature columns     : 55
Number of unpacked features   : 55

Number of trees               : 10
Max tree depth                : 6
Train RMSE                    : 0.0724
Validation RMSE               : None
Training time (sec)           : 4.9776

None
filling day_length na = 9656
PROGRESS: Boosted trees regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 10861
PROGRESS: Number of features          : 57
PROGRESS: Number of unpacked features : 57
PROGRESS: Starting Boosted Trees
PROGRESS: --------------------------------------------------------
PROGRESS:   Iter        RMSE Elapsed time
PROGRESS:      0   5.254e+02        0.25s
PROGRESS:      1   3.683e+02        0.33s
PROGRESS:      2   2.584e+02        0.56s
PROGRESS:      3   1.815e+02        0.78s
PROGRESS:      4   1.277e+02        1.00s
PROGRESS:      5   9.015e+01        1.25s
PROGRESS:      6   6.392e+01        1.54s
PROGRESS:      7   4.565e+01        1.81s
PROGRESS:      8   3.314e+01        2.13s
PROGRESS:      9   2.460e+01        2.46s

                    Model summary                       
--------------------------------------------------------
Class                         : BoostedTreesRegression

Number of examples            : 10861
Number of feature columns     : 57
Number of unpacked features   : 57

Number of trees               : 10
Max tree depth                : 6
Train RMSE                    : 24.6023
Validation RMSE               : None
Training time (sec)           : 2.5114

None
filling depart na = 11511
PROGRESS: WARNING: Detected extremely low variance for feature(s) 'heatNAN', 'tavgNAN' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
PROGRESS: Boosted trees regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 9006
PROGRESS: Number of features          : 59
PROGRESS: Number of unpacked features : 59
PROGRESS: Starting Boosted Trees
PROGRESS: --------------------------------------------------------
PROGRESS:   Iter        RMSE Elapsed time
PROGRESS:      0   6.587e+00        0.49s
PROGRESS:      1   5.855e+00        0.80s
PROGRESS:      2   5.246e+00        1.13s
PROGRESS:      3   4.838e+00        1.43s
PROGRESS:      4   4.546e+00        1.74s
PROGRESS:      5   4.259e+00        2.04s
PROGRESS:      6   4.116e+00        2.35s
PROGRESS:      7   3.926e+00        2.65s
PROGRESS:      8   3.785e+00        2.93s
PROGRESS:      9   3.669e+00        3.27s

                    Model summary                       
--------------------------------------------------------
Class                         : BoostedTreesRegression

Number of examples            : 9006
Number of feature columns     : 59
Number of unpacked features   : 59

Number of trees               : 10
Max tree depth                : 6
Train RMSE                    : 3.6689
Validation RMSE               : None
Training time (sec)           : 3.3023

None

In [39]:
# Preview the first rows of the filled result.
weather_result.head()


Out[39]:
station_nbr tmax tmin tavg depart dewpoint wetbulb heat cool snowfall preciptotal stnpressure
2 48.0 33.0 41.0 16.0 37.0 39.0 24.0 0.0 0.0 0.07 28.82
3 55.0 34.0 45.0 9.0 24.0 36.0 20.0 0.0 0.0 0.0 29.77
4 63.0 47.0 55.0 4.0 28.0 43.0 10.0 0.0 0.0 0.0 29.79
6 63.0 34.0 49.0 0.0 31.0 43.0 16.0 0.0 0.0 0.0 29.95
11 72.0 48.0 60.0 7.0 54.0 56.0 5.0 0.0 0.0 0.0 30.15
14 50.0 34.0 42.0 5.0 25.0 35.0 23.0 0.0 0.0 0.0 29.13
15 48.0 26.0 37.0 16.0 35.0 38.0 28.0 0.0 0.0 0.09 29.53
18 59.0 40.0 50.0 4.0 28.0 40.0 15.0 0.0 0.0 0.0 29.98
19 38.0 25.0 32.0 10.0 26.0 30.0 33.0 0.0 0.5 0.12 29.06
2 46.0 28.0 37.0 12.0 24.0 32.0 28.0 0.0 0.01 0.01 28.51
sealevel resultspeed resultdir avgspeed HZ FU UP VCTS DZ BR FG BCFG DU FZRA TS RA PL
29.91 9.1 23.0 11.3 0 0 0 0 0 0 0 0 0 0 0 1 0
30.47 9.9 31.0 10.0 0 0 0 0 0 0 0 0 0 0 0 0 0
30.48 8.0 35.0 8.2 0 0 0 0 0 0 0 0 0 0 0 0 0
30.47 14.0 36.0 13.8 0 0 0 0 0 0 0 0 0 0 0 0 0
30.18 4.6 23.0 4.8 0 0 0 0 0 1 1 0 0 0 0 0 0
30.52 11.4 32.0 11.3 0 0 0 0 0 0 0 0 0 0 0 0 0
29.89 2.5 17.0 3.8 0 0 0 0 0 1 1 0 0 0 0 1 0
30.49 9.0 33.0 8.9 0 0 0 0 0 0 0 0 0 0 0 0 0
29.79 14.6 29.0 15.6 0 0 0 0 0 1 0 0 0 0 0 1 0
29.62 12.7 26.0 13.3 0 0 0 0 0 0 0 0 0 0 0 0 0
GS GR FZDZ VCFG PRFG FG+ TSRA FZFG BLDU MIFG SQ ...
0 0 0 0 0 0 0 0 0 0 0 ...
0 0 0 0 0 0 0 0 0 0 0 ...
0 0 0 0 0 0 0 0 0 0 0 ...
0 0 0 0 0 0 0 0 0 0 0 ...
0 0 0 0 0 1 0 0 0 0 0 ...
0 0 0 0 0 0 0 0 0 0 0 ...
0 0 0 0 0 1 0 1 0 0 0 ...
0 0 0 0 0 0 0 0 0 0 0 ...
0 0 0 0 0 0 0 0 0 0 0 ...
0 0 0 0 0 0 0 0 0 0 0 ...
[10 rows x 66 columns]

In [40]:
# Put back the date column, which was dropped when weather_temp was built.
# NOTE(review): the dates are attached by row position; if fill_missed_all
# reorders or drops rows they will no longer belong to the right rows —
# confirm that the row order of `weather` is preserved in `weather_result`.
weather_result["date"] = weather["date"]

In [46]:
# Convert the filled result to a pandas DataFrame for the range checks that follow.
# The name is rebound to the converted object, so re-running this cell would call
# to_dataframe() on the already-converted frame.
weather_result = weather_result.to_dataframe()

Constraints:

  • 23 >= sunset["hour"] >= sunrise["hour"] >= 0
  • 0 <= sunset["minutes"] <= 59
  • 0 <= sunrise["minutes"] <= 59
  • 24* 60 >= day_length >= 0
  • stnpressure >= 0
  • preciptotal >= 0
  • dewpoint >= 0
  • sealevel >= 0
  • resultspeed >= 0
  • 359 >= resultdir >=0
  • avgspeed >= 0

In [45]:
# Constraint check: rows with a negative sunset hour (none expected).
# NOTE(review): "sunset_hour" is not created in any cell visible here — confirm where it comes from.
weather_result[weather_result["sunset_hour"] < 0 ]


Out[45]:
station_nbr HZ FU UP VCTS DZ BR FG BCFG DU ... tavg sealevel snowfall day_length sunrise_hour sunrise_minute sunset_hour sunset_minute depart date

0 rows × 51 columns


In [46]:
# Constraint check: rows with a sunset hour above 23 (none expected).
weather_result[weather_result["sunset_hour"] > 23]


Out[46]:
station_nbr HZ FU UP VCTS DZ BR FG BCFG DU ... tavg sealevel snowfall day_length sunrise_hour sunrise_minute sunset_hour sunset_minute depart date

0 rows × 51 columns


In [47]:
# Constraint check: rows where the sunset hour is earlier than the sunrise hour (none expected).
weather_result[weather_result["sunset_hour"] < weather_result["sunrise_hour"]]


Out[47]:
station_nbr HZ FU UP VCTS DZ BR FG BCFG DU ... tavg sealevel snowfall day_length sunrise_hour sunrise_minute sunset_hour sunset_minute depart date

0 rows × 51 columns


In [49]:
# List the sunset-minute values below 0, i.e. outside the 0..59 range stated in the constraints.
weather_result["sunset_minute"][weather_result["sunset_minute"] < 0 ]


Out[49]:
1553    -0.683154
1558    -0.351302
7788    -0.790381
15055   -0.070507
16017   -0.050679
16037   -0.501580
16052   -0.766174
Name: sunset_minute, dtype: float64

In [50]:
# Clamp negative sunset minutes to 0.
# A single .loc assignment is used instead of chained indexing
# (frame[col][mask] = value), which can write to a temporary copy and leave
# the frame unchanged (pandas SettingWithCopyWarning).
weather_result.loc[weather_result["sunset_minute"] < 0, "sunset_minute"] = 0

In [52]:
# Constraint check: rows whose sunset minute exceeds 59 (none expected).
# The bound follows the stated constraint 0 <= sunset minutes <= 59; the previous
# `> 60` comparison let values in (59, 60] slip through.
weather_result[weather_result["sunset_minute"] > 59]


Out[52]:
station_nbr HZ FU UP VCTS DZ BR FG BCFG DU ... tavg sealevel snowfall day_length sunrise_hour sunrise_minute sunset_hour sunset_minute depart date

0 rows × 51 columns


In [47]:
# Constraint check: rows with a negative day length (none expected).
weather_result[weather_result["day_length"] < 0]


Out[47]:
station_nbr tmax tmin tavg depart dewpoint wetbulb heat cool snowfall ... stnpressureNAN wetbulbNAN coolNAN heatNAN tavgNAN sealevelNAN snowfallNAN day_lengthNAN departNAN date

0 rows × 67 columns


In [48]:
# Constraint check: rows whose day length exceeds 24 * 60, the upper bound in the constraints list (none expected).
weather_result[weather_result["day_length"] > 24 * 60]


Out[48]:
station_nbr tmax tmin tavg depart dewpoint wetbulb heat cool snowfall ... stnpressureNAN wetbulbNAN coolNAN heatNAN tavgNAN sealevelNAN snowfallNAN day_lengthNAN departNAN date

0 rows × 67 columns


In [43]:
# Constraint check: rows with negative station pressure (none expected).
weather_result[weather_result["stnpressure"] < 0 ]


Out[43]:
station_nbr tmax tmin tavg depart dewpoint wetbulb heat cool snowfall preciptotal
stnpressure sealevel resultspeed resultdir avgspeed HZ FU UP VCTS DZ BR
FG BCFG DU FZRA TS RA PL GS GR FZDZ VCFG PRFG FG+ TSRA FZFG BLDU
MIFG SQ ...
[? rows x 67 columns]
Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.
You can use len(sf) to force materialization.

In [49]:
# Show the rows whose snowfall value is negative.
weather_result[weather_result["snowfall"] < 0 ]


Out[49]:
station_nbr tmax tmin tavg depart dewpoint wetbulb heat cool snowfall ... stnpressureNAN wetbulbNAN coolNAN heatNAN tavgNAN sealevelNAN snowfallNAN day_lengthNAN departNAN date
16711 9 10 -6 2 -13.994639 -5 2 63 0 -0.038727 ... 0 0 0 0 0 0 1 1 1 2014-04-24
16787 9 11 2 7 -14.082035 -3 5 58 0 -0.038727 ... 0 0 0 0 0 0 0 1 1 2014-04-28
18184 9 8 -7 1 -14.764138 -7 0 64 0 -0.038727 ... 0 0 0 0 0 0 0 0 1 2014-07-07
18322 13 51 19 35 -1.531664 23 28 30 0 -0.028028 ... 0 0 0 0 0 1 0 0 1 2014-07-14
19637 8 74 64 69 8.498392 64 66 0 4 -0.024997 ... 0 0 0 0 0 0 1 1 1 2014-09-18

5 rows × 67 columns


In [50]:
# Clamp negative snowfall values to 0.
# A single .loc assignment is used instead of chained indexing
# (frame[col][mask] = value), which can write to a temporary copy and leave
# the frame unchanged (pandas SettingWithCopyWarning).
weather_result.loc[weather_result["snowfall"] < 0, "snowfall"] = 0

In [51]:
# Constraint check: rows with negative total precipitation (none expected).
weather_result[weather_result["preciptotal"] < 0 ]


Out[51]:
station_nbr tmax tmin tavg depart dewpoint wetbulb heat cool snowfall ... stnpressureNAN wetbulbNAN coolNAN heatNAN tavgNAN sealevelNAN snowfallNAN day_lengthNAN departNAN date

0 rows × 67 columns


In [59]:
# Clamp negative snowfall values to 0, written as one .loc assignment because
# chained indexing (frame[col][mask] = value) can write to a temporary copy.
# This repeats the snowfall clamp from an earlier cell, so it changes nothing
# once that cell has run.
# NOTE(review): the cell directly follows the preciptotal check — confirm whether
# "preciptotal" was the intended column here.
weather_result.loc[weather_result["snowfall"] < 0, "snowfall"] = 0

In [52]:
# Constraint check: wind-direction values below 0 (none expected).
weather_result["resultdir"][weather_result["resultdir"] < 0]


Out[52]:
Series([], name: resultdir, dtype: float64)

In [54]:
# Constraint check: wind-direction values above the allowed maximum (none expected).
# The bound matches the stated constraint 359 >= resultdir >= 0; the previous
# `> 355` comparison flagged values that the constraint itself permits.
weather_result["resultdir"][weather_result["resultdir"] > 359]


Out[54]:
Series([], name: resultdir, dtype: float64)

In [55]:
# Constraint check: negative resultspeed values (none expected).
weather_result["resultspeed"][weather_result["resultspeed"] < 0]


Out[55]:
Series([], name: resultspeed, dtype: float64)

In [56]:
# Constraint check: negative sealevel values (none expected).
weather_result["sealevel"][weather_result["sealevel"] < 0]


Out[56]:
Series([], name: sealevel, dtype: float64)

In [57]:
# Constraint check: negative avgspeed values (none expected).
weather_result["avgspeed"][weather_result["avgspeed"] < 0]


Out[57]:
Series([], name: avgspeed, dtype: float64)

In [21]:
# Check for negative values in the "log_resultspeed" column.
# NOTE(review): no visible cell creates "log_resultspeed"; this looks like a leftover
# from a different run and may raise KeyError on a fresh kernel — confirm or remove.
weather_result[weather_result["log_resultspeed"] < 0]


Out[21]:
station_nbr HZ FU UP VCTS DZ BR FG BCFG DU ... stnpressure wetbulb cool heat tavg sealevel log_snowfall day_length depart date

0 rows × 47 columns


In [58]:
# Write the filled table to the data directory, without the index column.
filled_csv_path = os.path.join("data", "weather_filled_gl_default_3.csv")
weather_result.to_csv(filled_csv_path, index=False)

In [ ]: