Data Imputation with Last Seen Value
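
This notebook demonstrates gap filling in a time series with the last seen value, using the LastFill imputer from Analytics Zoo's zouwu package. We load a traffic-rate dataset (AvgRate and total, indexed by StartTime), reindex it onto a regular 2-hour grid to expose missing timestamps, impute the gaps with LastFill, and measure imputation quality at different missing ratios.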


In [1]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics

from zoo.zouwu.preprocessing.impute.LastFill import LastFill



In [2]:
raw_df = pd.read_csv("data/data.csv")

In [3]:
# Parse StartTime into datetimes; this frame is what we build on
df = pd.DataFrame(pd.to_datetime(raw_df.StartTime))

In [4]:
# Inspect the 4-character unit suffixes present in AvgRate (e.g. "Mbps", "Gbps")
raw_df.AvgRate.str[-4:].unique()
# Unify AvgRate to Mbps: strip the unit suffix and scale non-Mbps (Gbps) values by 1000
df['AvgRate'] = raw_df.AvgRate.apply(lambda x: float(x[:-4]) if x.endswith("Mbps") else float(x[:-4]) * 1000)
df["total"] = raw_df["total"]
df.set_index("StartTime", inplace=True)
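
AvgRate arrives as strings carrying a unit suffix; after stripping it and scaling the Gbps readings by 1000, the whole column is numeric in a single unit (Mbps).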

In [5]:
df.head()


Out[5]:
AvgRate total
StartTime
2018-01-01 00:00:00 306.23 275605455598
2018-01-01 02:00:00 285.03 256527692256
2018-01-01 04:00:00 247.39 222652190823
2018-01-01 06:00:00 211.55 190396029658
2018-01-01 08:00:00 234.82 211340468977

In [6]:
# Take the first half of 2019 as an evaluation slice
sampled_df = df["2019-01-01 00:00:00":"2019-06-30 23:00:00"]

In [7]:
# Reindex onto a complete 2-hour grid and count missing timestamps
full_idx = pd.date_range(start=sampled_df.index.min(), end=sampled_df.index.max(), freq='2H')
sampled_df = sampled_df.reindex(full_idx)
print("no. of n/a values:")
print(sampled_df.isna().sum())


no. of n/a values:
AvgRate    0
total      0
dtype: int64
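
The sampled half-year already lies on a complete 2-hour grid, so reindexing introduces no NaNs. A fully observed slice like this is useful as ground truth when evaluating the imputer below.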

In [8]:
sampled_df


Out[8]:
AvgRate total
2019-01-01 00:00:00 271.53 244373500495
2019-01-01 02:00:00 208.49 187638154842
2019-01-01 04:00:00 175.88 158293382443
2019-01-01 06:00:00 229.20 206284366389
2019-01-01 08:00:00 264.95 238459494810
... ... ...
2019-06-30 14:00:00 618.90 557013671978
2019-06-30 16:00:00 533.31 479975450009
2019-06-30 18:00:00 475.86 428273046817
2019-06-30 20:00:00 448.12 403312442571
2019-06-30 22:00:00 512.81 461531722122

2172 rows × 2 columns
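
As a sanity check on completeness: January through June 2019 is 181 days, and at one sample every 2 hours that is 181 × 12 = 2172 rows, exactly what the reindexed frame contains.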


In [9]:
last_fill = LastFill()
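
Conceptually, last-seen-value imputation is a forward fill: each gap inherits the value at the most recent observed timestamp. A minimal sketch of the idea in plain pandas, assuming LastFill amounts to a forward fill (the trailing bfill is an assumption covering leading NaNs, which have no "last seen" value to inherit):

import pandas as pd

def last_fill_sketch(df: pd.DataFrame) -> pd.DataFrame:
    # Propagate the most recent observed value forward into each gap;
    # the bfill only covers NaNs at the very start of the series.
    return df.ffill().bfill()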

In [10]:
# Reindex the full dataset onto the same 2-hour grid; gaps show up as NaN
full_idx = pd.date_range(start=df.index.min(), end=df.index.max(), freq='2H')
df1 = df.reindex(full_idx)
print("no. of n/a values:")
print(df1.isna().sum())


no. of n/a values:
AvgRate    327
total      327
dtype: int64
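
On the full date range the story is different: reindexing exposes 327 missing 2-hour timestamps in each column. These are the gaps LastFill is asked to fill next.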

In [11]:
filled_df = last_fill.impute(df1)

In [12]:
# Verify that imputation removed all missing values
print("no. of n/a values:")
print(filled_df.isna().sum())


no. of n/a values:
AvgRate    0
total      0
dtype: int64
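
All 327 gaps are gone after imputation; every previously missing timestamp now carries the last value seen before it.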

In [13]:
mse_10 = last_fill.evaluate(sampled_df, 0.1)  # MSE with a 10% missing ratio

In [14]:
mse_10


Out[14]:
[2004.2239946593002, 1.6857280722708464e+21]
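
evaluate reports one error per column, in column order (AvgRate first, then total). The exact protocol is internal to the library, but a plausible reading, which would also explain the sklearn.metrics import at the top, is: randomly hide the given fraction of entries, impute them, and score the result against the held-out truth. A hypothetical sketch of that procedure (evaluate_sketch and its fixed seed are illustrative, not zouwu's API):

import numpy as np
import sklearn.metrics as metrics

def evaluate_sketch(df, drop_ratio, imputer, seed=0):
    rng = np.random.default_rng(seed)
    mask = rng.random(df.shape) < drop_ratio      # boolean: entries to hide
    corrupted = df.mask(mask)                     # hidden entries become NaN
    imputed = imputer(corrupted)
    # Score only the hidden entries, one MSE per column (AvgRate, total)
    return [metrics.mean_squared_error(df[col].values[mask[:, i]],
                                       imputed[col].values[mask[:, i]])
            for i, col in enumerate(df.columns)]

# e.g. evaluate_sketch(sampled_df, 0.1, last_fill_sketch)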

In [15]:
mse_50 = last_fill.evaluate(sampled_df, 0.5)  # MSE with a 50% missing ratio

In [16]:
mse_50


Out[16]:
[28298.18132730203, 2.4879356854775073e+22]
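
The error grows sharply with the missing ratio: for AvgRate the MSE climbs from roughly 2.0e3 at a 10% drop rate to 2.8e4 at 50%. That is expected for last-value imputation, since denser gaps force the imputer to carry a value across longer stretches, where it grows stale.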
