The data has many NaN values. Normally people fill them with the most common value for a categorical variable, or with the mean of some subclass for a continuous variable, but I will try to fill the missing values using machine learning, treating each column with missing values as a prediction target.
In [1]:
#import fillna
import sys
sys.path += ["src"]
import graphlab as gl
In [2]:
import fill_na_graphlab
In [3]:
#import dataframe to fill
In [4]:
import os
import pandas as pd
import numpy as np
# from sklearn.ensemble import RandomForestRegressor
In [5]:
#clf = RandomForestRegressor(n_estimators=100, n_jobs=3)
In [6]:
# Load the weather table from the local "data" directory.
weather_csv_path = os.path.join("data", "weather_modified_3.csv")
weather = pd.read_csv(weather_csv_path)
In [13]:
# Columns removed before imputation: the date plus the columns listed below.
columns_to_drop = [
    "date",
    "TSSN",
    'SG',
    'PRFG',
    'GR',
    'VCFG',
    'GS',
    'SQ',
    'BLDU',
    'PL',
    'DU',
    'FU',
    'FZDZ',
    'BLSN',
    'MIFG',
    'BCFG',
    'FZRA',
]
# `axis` is passed by keyword: the positional form `drop(labels, 1)` is
# rejected by current pandas, while `axis=1` works on old and new versions.
weather_temp = weather.drop(columns_to_drop, axis=1)
In [14]:
# Preview the first five rows of the reduced frame.
weather_temp.head(5)
Out[14]:
In [15]:
# Per-column variance, sorted ascending so the least-varying columns come first.
# The in-place `Series.sort()` no longer exists in pandas; `sort_values()`
# returns the sorted Series instead.
column_variances = weather_temp.var().sort_values()
column_variances
Out[15]:
In [21]:
# Print "<missing count> <column name>" for every column that has missing values.
for column in weather_temp.columns:
    # Vectorised count of nulls (replaces the builtin sum over the Series).
    missing_count = weather_temp[column].isnull().sum()
    if missing_count > 0:
        # A formatted string prints the same text under Python 2 and Python 3,
        # unlike the Python-2-only `print a, column` statement.
        print("%s %s" % (missing_count, column))
In [23]:
# Show the remaining column names as a plain list.
weather_temp.columns.tolist()
Out[23]:
In [6]:
# Column names handed to the imputation helper as `features`.
# 'station_nbr' is intentionally left out (kept here as a disabled entry).
features = [
    # 'station_nbr',
    'tmax', 'tmin', 'tavg', 'depart',
    'dewpoint', 'wetbulb',
    'heat', 'cool',
    'sunrise', 'sunset',
    'snowfall', 'preciptotal',
    'stnpressure', 'sealevel',
    'resultspeed', 'resultdir', 'avgspeed',
    'HZ', 'UP', 'VCTS', 'DZ', 'BR', 'FG',
    'TS', 'RA', 'FG+', 'TSRA', 'FZFG', 'SN',
    'days',
]
In [ ]:
#Let's find columns that have only positive values
In [33]:
# Build a GraphLab SFrame from the pandas frame; `a` is what the imputation
# call below receives.
a = gl.SFrame(weather_temp)
# Run fillna(np.nan) over every column, one column at a time
# (presumably to normalise missing markers to NaN -- confirm against GraphLab docs).
for column_name in a.column_names():
    nan_filled = a[column_name].fillna(np.nan)
    a[column_name] = nan_filled
In [37]:
# Re-import the helper module so edits made to it since the first import are
# picked up (`reload` is the Python 2 builtin).
reload(fill_na_graphlab)
# Run the project's imputation helper on the SFrame `a` using the `features`
# list; the result is stored in `weather_result`.
# NOTE(review): the helper's exact behaviour and return type are defined in
# fill_na_graphlab, which is not shown here.
weather_result = fill_na_graphlab.fill_missed_all(a, features, verbose=True)
In [39]:
# Preview the first rows of the imputed result.
weather_result.head()
Out[39]:
In [40]:
# Re-attach the "date" column taken from the original `weather` frame.
# NOTE(review): this assumes the rows of `weather_result` are still in the same
# order as `weather` -- confirm that the imputation helper preserves row order.
weather_result["date"] = weather["date"]
In [46]:
# Convert the result to a pandas DataFrame for the checks below.
# The guard makes this cell safe to re-run: once converted, a DataFrame has no
# `to_dataframe` method and the unguarded call would raise AttributeError.
if not isinstance(weather_result, pd.DataFrame):
    weather_result = weather_result.to_dataframe()
In [45]:
# Sanity check: rows whose "sunset_hour" is below zero.
sunset_hour_negative = weather_result["sunset_hour"] < 0
weather_result[sunset_hour_negative]
Out[45]:
In [46]:
# Sanity check: rows whose "sunset_hour" is above 23.
sunset_hour_too_large = weather_result["sunset_hour"] > 23
weather_result[sunset_hour_too_large]
Out[46]:
In [47]:
# Sanity check: rows where "sunset_hour" is smaller than "sunrise_hour".
sunset_before_sunrise = weather_result["sunset_hour"] < weather_result["sunrise_hour"]
weather_result[sunset_before_sunrise]
Out[47]:
In [49]:
# Sanity check: "sunset_minute" values below zero.
sunset_minute_values = weather_result["sunset_minute"]
sunset_minute_values[sunset_minute_values < 0]
Out[49]:
In [50]:
# Clamp negative "sunset_minute" values to 0.
# A single `.loc` assignment writes into `weather_result` itself; the chained
# form `df[col][mask] = 0` can end up assigning to a temporary copy and raises
# SettingWithCopyWarning.
weather_result.loc[weather_result["sunset_minute"] < 0, "sunset_minute"] = 0
In [52]:
# Sanity check: rows whose "sunset_minute" is outside the valid 0-59 range on
# the upper side. The bound is 59 (not 60) so that a value of exactly 60 is
# also reported, mirroring the `> 23` bound used for the hour check.
sunset_minute_too_large = weather_result["sunset_minute"] > 59
weather_result[sunset_minute_too_large]
Out[52]:
In [47]:
# Sanity check: rows whose "day_length" is below zero.
day_length_negative = weather_result["day_length"] < 0
weather_result[day_length_negative]
Out[47]:
In [48]:
# Sanity check: rows whose "day_length" exceeds 24 * 60.
day_length_upper_bound = 24 * 60
day_length_too_large = weather_result["day_length"] > day_length_upper_bound
weather_result[day_length_too_large]
Out[48]:
In [43]:
# Sanity check: rows whose "stnpressure" is below zero.
stnpressure_negative = weather_result["stnpressure"] < 0
weather_result[stnpressure_negative]
Out[43]:
In [49]:
# Sanity check: rows whose "snowfall" is below zero.
snowfall_negative = weather_result["snowfall"] < 0
weather_result[snowfall_negative]
Out[49]:
In [50]:
# Clamp negative "snowfall" values to 0.
# `.loc` performs the write on `weather_result` directly; the chained form
# `df[col][mask] = 0` can assign to a temporary copy and raises
# SettingWithCopyWarning.
weather_result.loc[weather_result["snowfall"] < 0, "snowfall"] = 0
In [51]:
# Sanity check: rows whose "preciptotal" is below zero.
preciptotal_negative = weather_result["preciptotal"] < 0
weather_result[preciptotal_negative]
Out[51]:
In [59]:
# Clamp negative "snowfall" values to 0, written through `.loc` so the
# assignment lands on `weather_result` itself rather than on a possible copy
# produced by chained indexing.
# NOTE(review): this cell sits right after the "preciptotal" check and repeats
# an earlier snowfall clamp verbatim -- confirm whether "preciptotal" was the
# intended column, or whether this cell can be removed.
weather_result.loc[weather_result["snowfall"] < 0, "snowfall"] = 0
In [52]:
# Sanity check: "resultdir" values below zero.
resultdir_values = weather_result["resultdir"]
resultdir_values[resultdir_values < 0]
Out[52]:
In [54]:
# Sanity check: "resultdir" values above 355.
# NOTE(review): the 355 bound presumably reflects the column's units -- confirm.
resultdir_column = weather_result["resultdir"]
resultdir_column[resultdir_column > 355]
Out[54]:
In [55]:
# Sanity check: "resultspeed" values below zero.
resultspeed_values = weather_result["resultspeed"]
resultspeed_values[resultspeed_values < 0]
Out[55]:
In [56]:
# Sanity check: "sealevel" values below zero.
sealevel_values = weather_result["sealevel"]
sealevel_values[sealevel_values < 0]
Out[56]:
In [57]:
# Sanity check: "avgspeed" values below zero.
avgspeed_values = weather_result["avgspeed"]
avgspeed_values[avgspeed_values < 0]
Out[57]:
In [21]:
# Sanity check: rows whose "log_resultspeed" is below zero.
log_resultspeed_negative = weather_result["log_resultspeed"] < 0
weather_result[log_resultspeed_negative]
Out[21]:
In [58]:
# Write the imputed table to CSV in the "data" directory, without the index column.
output_csv_path = os.path.join("data", "weather_filled_gl_default_3.csv")
weather_result.to_csv(output_csv_path, index=False)
In [ ]: