Data Imputation with Last Seen Value
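
This notebook demonstrates gap filling in a time series with the last seen value, using the LastFill imputer from Analytics Zoo's zouwu package. We load a traffic-rate dataset (AvgRate and total, indexed by StartTime), reindex it onto a regular 2-hour grid to expose missing timestamps, impute the gaps with LastFill, and measure imputation quality at different missing ratios.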


In [1]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics

from zoo.zouwu.preprocessing.impute.LastFill import LastFill



In [2]:
raw_df = pd.read_csv("data/data.csv")

In [3]:
# Parse StartTime into datetimes; this frame is what we build on
df = pd.DataFrame(pd.to_datetime(raw_df.StartTime))

In [4]:
# Inspect the 4-character unit suffixes present in AvgRate (e.g. "Mbps", "Gbps")
raw_df.AvgRate.str[-4:].unique()
# Unify AvgRate to Mbps: strip the unit suffix and scale non-Mbps (Gbps) values by 1000
df['AvgRate'] = raw_df.AvgRate.apply(lambda x: float(x[:-4]) if x.endswith("Mbps") else float(x[:-4]) * 1000)
df["total"] = raw_df["total"]
df.set_index("StartTime", inplace=True)
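
AvgRate arrives as strings carrying a unit suffix; after stripping it and scaling the Gbps readings by 1000, the whole column is numeric in a single unit (Mbps).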

In [5]:
df.head()


Out[5]:
AvgRate total
StartTime
2018-01-01 00:00:00 306.23 275605455598
2018-01-01 02:00:00 285.03 256527692256
2018-01-01 04:00:00 247.39 222652190823
2018-01-01 06:00:00 211.55 190396029658
2018-01-01 08:00:00 234.82 211340468977

In [6]:
# Take the first half of 2019 as an evaluation slice
sampled_df = df["2019-01-01 00:00:00":"2019-06-30 23:00:00"]

In [7]:
# Reindex onto a complete 2-hour grid and count missing timestamps
full_idx = pd.date_range(start=sampled_df.index.min(), end=sampled_df.index.max(), freq='2H')
sampled_df = sampled_df.reindex(full_idx)
print("no. of n/a values:")
print(sampled_df.isna().sum())


no. of n/a values:
AvgRate    0
total      0
dtype: int64
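
The sampled half-year already lies on a complete 2-hour grid, so reindexing introduces no NaNs. A fully observed slice like this is useful as ground truth when evaluating the imputer below.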

In [8]:
sampled_df


Out[8]:
AvgRate total
2019-01-01 00:00:00 271.53 244373500495
2019-01-01 02:00:00 208.49 187638154842
2019-01-01 04:00:00 175.88 158293382443
2019-01-01 06:00:00 229.20 206284366389
2019-01-01 08:00:00 264.95 238459494810
... ... ...
2019-06-30 14:00:00 618.90 557013671978
2019-06-30 16:00:00 533.31 479975450009
2019-06-30 18:00:00 475.86 428273046817
2019-06-30 20:00:00 448.12 403312442571
2019-06-30 22:00:00 512.81 461531722122

2172 rows × 2 columns
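
As a sanity check on completeness: January through June 2019 is 181 days, and at one sample every 2 hours that is 181 × 12 = 2172 rows, exactly what the reindexed frame contains.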


In [9]:
last_fill = LastFill()
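
Conceptually, last-seen-value imputation is a forward fill: each gap inherits the value at the most recent observed timestamp. A minimal sketch of the idea in plain pandas, assuming LastFill amounts to a forward fill (the trailing bfill is an assumption covering leading NaNs, which have no "last seen" value to inherit):

import pandas as pd

def last_fill_sketch(df: pd.DataFrame) -> pd.DataFrame:
    # Propagate the most recent observed value forward into each gap;
    # the bfill only covers NaNs at the very start of the series.
    return df.ffill().bfill()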

In [10]:
# Reindex the full dataset onto the same 2-hour grid; gaps show up as NaN
full_idx = pd.date_range(start=df.index.min(), end=df.index.max(), freq='2H')
df1 = df.reindex(full_idx)
print("no. of n/a values:")
print(df1.isna().sum())


no. of n/a values:
AvgRate    327
total      327
dtype: int64
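
On the full date range the story is different: reindexing exposes 327 missing 2-hour timestamps in each column. These are the gaps LastFill is asked to fill next.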

In [11]:
filled_df = last_fill.impute(df1)

In [12]:
# Verify that imputation removed all missing values
print("no. of n/a values:")
print(filled_df.isna().sum())


no. of n/a values:
AvgRate    0
total      0
dtype: int64
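
All 327 gaps are gone after imputation; every previously missing timestamp now carries the last value seen before it.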

In [13]:
mse_10 = last_fill.evaluate(sampled_df, 0.1)  # MSE with a 10% missing ratio

In [14]:
mse_10


Out[14]:
[2004.2239946593002, 1.6857280722708464e+21]
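
evaluate reports one error per column, in column order (AvgRate first, then total). The exact protocol is internal to the library, but a plausible reading, which would also explain the sklearn.metrics import at the top, is: randomly hide the given fraction of entries, impute them, and score the result against the held-out truth. A hypothetical sketch of that procedure (evaluate_sketch and its fixed seed are illustrative, not zouwu's API):

import numpy as np
import sklearn.metrics as metrics

def evaluate_sketch(df, drop_ratio, imputer, seed=0):
    rng = np.random.default_rng(seed)
    mask = rng.random(df.shape) < drop_ratio      # boolean: entries to hide
    corrupted = df.mask(mask)                     # hidden entries become NaN
    imputed = imputer(corrupted)
    # Score only the hidden entries, one MSE per column (AvgRate, total)
    return [metrics.mean_squared_error(df[col].values[mask[:, i]],
                                       imputed[col].values[mask[:, i]])
            for i, col in enumerate(df.columns)]

# e.g. evaluate_sketch(sampled_df, 0.1, last_fill_sketch)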

In [15]:
mse_50 = last_fill.evaluate(sampled_df, 0.5)  # MSE with a 50% missing ratio

In [16]:
mse_50


Out[16]:
[28298.18132730203, 2.4879356854775073e+22]
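
The error grows sharply with the missing ratio: for AvgRate the MSE climbs from roughly 2.0e3 at a 10% drop rate to 2.8e4 at 50%. That is expected for last-value imputation, since denser gaps force the imputer to carry a value across longer stretches, where it grows stale.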
