Test TU_Util()


In [17]:
import matplotlib 
from matplotlib import style
%matplotlib inline
style.use('ggplot')

# This is to import custom-made modules
# This can be removed after making these modules a real library
import os, sys
lib_path = os.path.abspath(os.path.join('..', 'building-analytics')) # relative path of the source code in Box Folder 
sys.path.append(lib_path)

from TS_Util_Clean_Data import *

# inputs
fileName = "test_marco.csv" # replace with other files used
folder = "../test1"

## call script

# instantiate class
TSU = TS_Util()

# load data
data= TSU.load_TS(fileName, folder)

data.head()


Out[17]:
Jan data1 col1 Jan data2 col2
Timestamp
2017-04-03 11:30:00 NaN NaN
2017-04-03 11:40:00 NaN -2.00
2017-04-03 11:50:00 10000.00 304.60
2017-04-03 12:00:00 313.60 313.60
2017-04-03 12:10:00 314.21 314.21

In [2]:
data= TSU.remove_start_NaN(data)
data.head()


Out[2]:
Jan data1 col1 Jan data2 col2
Timestamp
2017-04-03 11:40:00 NaN -2.00
2017-04-03 11:50:00 10000.00 304.60
2017-04-03 12:00:00 313.60 313.60
2017-04-03 12:10:00 314.21 314.21
2017-04-03 12:20:00 308.37 308.37

In [3]:
# clean start-end
data= TSU.remove_end_NaN(data)
data.tail()


Out[3]:
Jan data1 col1 Jan data2 col2
Timestamp
2017-04-03 19:30:00 282.98 282.98
2017-04-03 19:40:00 NaN NaN
2017-04-03 19:50:00 286.86 286.86
2017-04-04 20:00:00 1.00 NaN
2017-04-04 20:10:00 3.00 1000000.00

In [4]:
TSU._find_missing(data).head()


Out[4]:
Jan data1 col1 Jan data2 col2
Timestamp
2017-04-03 11:40:00 True False
2017-04-03 11:50:00 False False
2017-04-03 12:00:00 False False
2017-04-03 12:10:00 False False
2017-04-03 12:20:00 False False

In [5]:
TSU.display_missing(data, how="all")


Out[5]:
Jan data1 col1 Jan data2 col2
Timestamp
2017-04-03 19:40:00 NaN NaN

In [6]:
TSU.count_missing(data, how="number")


Out[6]:
Jan data1 col1    2
Jan data2 col2    2
dtype: int64

In [7]:
TSU.remove_missing(data,how="any").head()


Out[7]:
Jan data1 col1 Jan data2 col2
Timestamp
2017-04-03 11:50:00 10000.00 304.60
2017-04-03 12:00:00 313.60 313.60
2017-04-03 12:10:00 314.21 314.21
2017-04-03 12:20:00 308.37 308.37
2017-04-03 12:30:00 331.07 400000.00

In [8]:
TSU._find_outOfBound(data, 10, 300).head()


Out[8]:
Jan data1 col1 Jan data2 col2
Timestamp
2017-04-03 11:40:00 False True
2017-04-03 11:50:00 True True
2017-04-03 12:00:00 True True
2017-04-03 12:10:00 True True
2017-04-03 12:20:00 True True

In [9]:
TSU.display_outOfBound(data, 10, 300)


Out[9]:
Jan data1 col1 Jan data2 col2
Timestamp
2017-04-03 11:40:00 NaN -2.00
2017-04-03 11:50:00 10000.00 304.60
2017-04-03 12:00:00 313.60 313.60
2017-04-03 12:10:00 314.21 314.21
2017-04-03 12:20:00 308.37 308.37
2017-04-03 12:30:00 331.07 400000.00
2017-04-03 12:40:00 362.73 362.73
2017-04-03 12:50:00 518.56 518.56
2017-04-03 13:00:00 513.71 513.71
2017-04-03 13:10:00 518.80 518.80
2017-04-03 13:20:00 520.09 520.09
2017-04-03 13:30:00 517.24 517.24
2017-04-03 13:40:00 521.54 521.54
2017-04-03 13:50:00 513.75 513.75
2017-04-03 14:00:00 514.94 514.94
2017-04-03 14:10:00 516.11 516.11
2017-04-03 14:20:00 509.50 509.50
2017-04-03 14:30:00 518.54 518.54
2017-04-03 14:40:00 517.08 517.08
2017-04-03 14:50:00 517.93 517.93
2017-04-03 15:00:00 517.60 517.60
2017-04-03 15:10:00 519.91 519.91
2017-04-03 15:20:00 511.32 511.32
2017-04-03 15:30:00 519.92 519.92
2017-04-03 15:40:00 509.63 509.63
2017-04-03 15:50:00 515.43 515.43
2017-04-03 16:00:00 511.91 511.91
2017-04-03 16:10:00 510.92 510.92
2017-04-03 16:20:00 513.82 513.82
2017-04-03 16:30:00 509.91 509.91
2017-04-03 16:40:00 510.78 510.78
2017-04-03 16:50:00 511.18 511.18
2017-04-03 17:00:00 433.45 433.45
2017-04-03 17:10:00 324.82 324.82
2017-04-03 17:20:00 306.29 306.29
2017-04-04 20:00:00 1.00 NaN
2017-04-04 20:10:00 3.00 1000000.00

In [10]:
TSU.count_outOfBound(data, 10, 300)


Out[10]:
Jan data1 col1    36
Jan data2 col2    36
dtype: int64

In [11]:
TSU.remove_outOfBound(data, 10, 350)


Out[11]:
Jan data1 col1 Jan data2 col2
Timestamp
2017-04-03 12:00:00 313.60 313.60
2017-04-03 12:10:00 314.21 314.21
2017-04-03 12:20:00 308.37 308.37
2017-04-03 17:10:00 324.82 324.82
2017-04-03 17:20:00 306.29 306.29
2017-04-03 17:30:00 293.60 293.60
2017-04-03 17:40:00 290.13 290.13
2017-04-03 17:50:00 286.32 286.32
2017-04-03 18:00:00 290.10 290.10
2017-04-03 18:10:00 286.68 286.68
2017-04-03 18:20:00 287.98 287.98
2017-04-03 18:30:00 287.05 287.05
2017-04-03 18:40:00 283.28 283.28
2017-04-03 18:50:00 289.58 289.58
2017-04-03 19:00:00 290.72 290.72
2017-04-03 19:10:00 286.48 286.48
2017-04-03 19:20:00 288.66 288.66
2017-04-03 19:30:00 282.98 282.98
2017-04-03 19:40:00 NaN NaN
2017-04-03 19:50:00 286.86 286.86

In [12]:
TSU.display_outliers(data,method="std",coeff=2, window=10)


Out[12]:
Jan data1 col1 Jan data2 col2
Timestamp
2017-04-03 11:50:00 10000.00 304.6
2017-04-03 12:30:00 331.07 400000.0
2017-04-04 20:10:00 3.00 1000000.0

In [13]:
TSU.display_outliers(data,method="rstd",coeff=1, window=10)


Out[13]:
Jan data1 col1 Jan data2 col2
Timestamp
2017-04-03 11:50:00 10000.00 304.60
2017-04-03 12:30:00 331.07 400000.00
2017-04-03 14:00:00 514.94 514.94
2017-04-03 14:10:00 516.11 516.11
2017-04-03 14:20:00 509.50 509.50
2017-04-03 14:30:00 518.54 518.54
2017-04-03 14:40:00 517.08 517.08
2017-04-03 14:50:00 517.93 517.93
2017-04-03 15:00:00 517.60 517.60
2017-04-03 15:10:00 519.91 519.91
2017-04-03 15:20:00 511.32 511.32
2017-04-03 15:30:00 519.92 519.92
2017-04-03 15:40:00 509.63 509.63
2017-04-03 15:50:00 515.43 515.43
2017-04-03 16:00:00 511.91 511.91
2017-04-03 16:10:00 510.92 510.92
2017-04-03 16:20:00 513.82 513.82
2017-04-03 16:30:00 509.91 509.91
2017-04-03 16:40:00 510.78 510.78
2017-04-03 16:50:00 511.18 511.18
2017-04-03 17:00:00 433.45 433.45
2017-04-03 17:10:00 324.82 324.82
2017-04-03 17:20:00 306.29 306.29
2017-04-03 17:30:00 293.60 293.60
2017-04-03 17:40:00 290.13 290.13
2017-04-03 17:50:00 286.32 286.32
2017-04-03 18:00:00 290.10 290.10
2017-04-03 18:10:00 286.68 286.68
2017-04-03 18:20:00 287.98 287.98
2017-04-03 18:30:00 287.05 287.05
2017-04-03 18:40:00 283.28 283.28
2017-04-03 18:50:00 289.58 289.58
2017-04-03 19:00:00 290.72 290.72
2017-04-03 19:10:00 286.48 286.48
2017-04-03 19:20:00 288.66 288.66
2017-04-03 19:30:00 282.98 282.98
2017-04-03 19:50:00 286.86 286.86
2017-04-04 20:00:00 1.00 NaN
2017-04-04 20:10:00 3.00 1000000.00

In [14]:
TSU.display_outliers(data,method="rmedian",coeff=1, window=10)


Out[14]:
Jan data1 col1 Jan data2 col2
Timestamp
2017-04-03 11:40:00 NaN -2.00
2017-04-03 11:50:00 10000.00 304.60
2017-04-03 12:00:00 313.60 313.60
2017-04-03 12:10:00 314.21 314.21
2017-04-03 12:20:00 308.37 308.37
2017-04-03 12:30:00 331.07 400000.00
2017-04-03 12:40:00 362.73 362.73
2017-04-03 12:50:00 518.56 518.56
2017-04-03 13:00:00 513.71 513.71
2017-04-03 13:10:00 518.80 518.80
2017-04-03 13:20:00 520.09 520.09
2017-04-03 13:40:00 521.54 521.54
2017-04-03 13:50:00 513.75 513.75
2017-04-03 14:00:00 514.94 514.94
2017-04-03 14:10:00 516.11 516.11
2017-04-03 14:20:00 509.50 509.50
2017-04-03 14:30:00 518.54 518.54
2017-04-03 15:10:00 519.91 519.91
2017-04-03 15:20:00 511.32 511.32
2017-04-03 15:30:00 519.92 519.92
2017-04-03 15:40:00 509.63 509.63
2017-04-03 15:50:00 515.43 515.43
2017-04-03 16:20:00 513.82 513.82
2017-04-03 17:00:00 433.45 433.45
2017-04-03 17:10:00 324.82 324.82
2017-04-03 17:20:00 306.29 306.29
2017-04-03 17:30:00 293.60 293.60
2017-04-03 17:40:00 290.13 290.13
2017-04-03 17:50:00 286.32 286.32
2017-04-03 18:00:00 290.10 290.10
2017-04-03 18:10:00 286.68 286.68
2017-04-03 18:40:00 283.28 283.28
2017-04-03 18:50:00 289.58 289.58
2017-04-03 19:00:00 290.72 290.72
2017-04-03 19:10:00 286.48 286.48
2017-04-03 19:20:00 288.66 288.66
2017-04-03 19:30:00 282.98 282.98
2017-04-04 20:00:00 1.00 NaN
2017-04-04 20:10:00 3.00 1000000.00

In [15]:
TSU.display_outliers(data,method="iqr",coeff=1, window=10)


Out[15]:
Jan data1 col1 Jan data2 col2
Timestamp
2017-04-03 11:40:00 NaN -2.0
2017-04-03 11:50:00 10000.00 304.6
2017-04-03 12:30:00 331.07 400000.0
2017-04-04 20:00:00 1.00 NaN
2017-04-04 20:10:00 3.00 1000000.0

In [16]:
TSU.display_outliers(data,method="qtl",coeff=1, window=10)


Out[16]:
Jan data1 col1 Jan data2 col2
Timestamp
2017-04-03 11:40:00 NaN -2.0
2017-04-03 11:50:00 10000.0 304.6
2017-04-04 20:00:00 1.0 NaN
2017-04-04 20:10:00 3.0 1000000.0

In [ ]: