In [1]:
import pandas as pd # 處理表格
import numpy as np # 科學運算
import matplotlib.pyplot as plt # 畫圖
import matplotlib as mpl # 畫圖
from datetime import timedelta, datetime # 時間處理
from sklearn import model_selection, svm # 機器學習套件
from sklearn.linear_model import LinearRegression # 機器學習套件
from sklearn.ensemble import RandomForestRegressor # 機器學習套件
In [2]:
df1 = pd.read_csv('files/C_waterlevel.csv', encoding='big5', na_values='缺測'.encode('big5')) # C測站水位資料
df2 = pd.read_csv('files/W_waterlevel.csv', encoding='big5', na_values='缺測'.encode('big5')) # W測站水位資料
df3 = pd.read_csv('files/CT_rainfall.csv', encoding='big5') # CT測站雨量資料
df4 = pd.read_csv('files/DS_rainfall.csv') # DS測站雨量資料
In [3]:
df1.head()
Out[3]:
In [4]:
df3.head()
Out[4]:
In [5]:
for i in [df1, df2, df3, df4]:
i.columns.values[0] = 'time'
In [6]:
df1.head()
Out[6]:
In [7]:
df1.fillna(method='ffill', inplace=True)
df2.fillna(method='ffill', inplace=True)
df3.fillna(method='ffill', inplace=True)
df4.fillna(method='ffill', inplace=True)
In [8]:
starttime = datetime(2014, 1, 1, 0, 0, 0) #設定資料起始時間
interval = timedelta(hours=1) #設定縱軸間隔
print(starttime)
In [9]:
time = [starttime + i * interval for i in range(len(df1) * 24)]
print(time[:3])
In [10]:
# 取用時間欄位後面的欄位,1~24小時的部分,轉成 numpy array 後弄成一維 (ravel())
def ravel_table(x):
return x.iloc[:,1:].as_matrix().ravel()
In [11]:
# 將四個一維 array 放入新的表格 df 中
# cl: C站水位, wl: W站水位, cr: CT站雨量, dr: DS站雨量
df = pd.DataFrame({'time': time, 'cl': ravel_table(df1), 'wl': ravel_table(df2),\
'cr': ravel_table(df3), 'dr': ravel_table(df4)})
df = df[['time', 'cl', 'wl', 'cr', 'dr']]
df.set_index('time', inplace=True)
In [12]:
df[df < 0] = 0 # 將小於零的空值(e.g., -999996.0)改成 0
df.fillna(0, inplace=True) # 將 nan 空值改成 0
In [13]:
df.head()
Out[13]:
In [14]:
# 設定圖片大小
mpl.rcParams['figure.figsize'] = (10,10)
# 設定繪圖風格
plt.style.use('ggplot')
# 分割成 3X1
ax1 = plt.subplot2grid((3,1),(0,0))
ax2 = plt.subplot2grid((3,1),(1,0), sharex=ax1)
ax3 = plt.subplot2grid((3,1),(2,0), sharex=ax1)
df[['cl','wl']].plot(ax= ax1, linewidth = 1, color=['r','b'])
df['cr'].plot(ax= ax2, label="rf1", linewidth = 1, color='g')
df['dr'].plot(ax= ax3, label="rf2", linewidth = 1)
ax1.set_ylabel('Waterlevel')
ax2.set_ylabel('CT rainfall')
ax3.set_ylabel('DS rainfall')
plt.show()
In [15]:
df['cl'].plot()
plt.xlim(datetime(2014, 1, 1, 0, 0, 0), datetime(2014, 3, 10, 0, 0, 0))
plt.ylim(-0.1, 1.1)
plt.show()
In [16]:
# 設定從 1 小時前到 10 小時前
for i in range(1,11):
df['cr'+str(i)] = df['cr'].shift(i) # CT rainfall
df['dr'+str(i)] = df['dr'].shift(i) # DAS rainfall
df['wl'+str(i)] = df['wl'].shift(i)
df.fillna(0, inplace=True)
In [17]:
df.head(2)
Out[17]:
In [18]:
df.corr()[['cl', 'wl', 'cr', 'dr']]
Out[18]:
In [19]:
plt.scatter(df['wl'], df['cr4'])
plt.xlabel('water level')
plt.ylabel('rainfall')
plt.show()
In [20]:
df_feature = df[['wl1', 'wl2', 'cr2', 'cr3', 'cr4', 'cr5', 'cr6']]
X = np.array(df_feature)
y = np.array(df['wl'])
In [21]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=0)
In [22]:
clf = LinearRegression(n_jobs=-1)
# clf = svm.SVR()
# clf = RandomForestRegressor(n_estimators = 10, random_state = 0, n_jobs=-1)
clf.fit(X_train, y_train)
Out[22]:
In [23]:
accuracy = clf.score(X_test, y_test)
print ('two hours in advance')
print ('accuracy,', round(accuracy, 3))
In [24]:
def read_predict_file(file_name, start, stop_condition):
df = pd.read_csv(file_name, encoding='big5', usecols=range(3))
df.columns = ['time2','waterlevel','rainfall']
df.drop(['time2'], 1, inplace=True)
df[df < 0] = 0
df['daysvalue'] = df['rainfall'].rolling(window = stop_condition).mean()
df.fillna(0.00000001, inplace=True)
df = df[df['daysvalue']>0]
#print (df)
time_index = []
startpoint = datetime(start[0],start[1],start[2],start[3],0,0)
for i in range(len(df.index)):
time_index.append(startpoint)
startpoint += interval
df['time'] = pd.Series(time_index, index = df.index)
df.set_index('time', inplace=True)
df[df < 0] = 0
for i in range(2,8):
df['rainfall'+str(i)] = df['rainfall'].shift(i)
df['waterlevel1'] = df['waterlevel'].shift(1)
df['waterlevel2'] = df['waterlevel'].shift(2)
#df['waterlevel3'] = df['waterlevel'].shift(3)
df.dropna(inplace=True)
return df
In [25]:
df_0610 = read_predict_file('files/0610rain.csv', [2016, 6, 10, 0], 48)
df_0706 = read_predict_file('files/0706rain.csv', [2016, 7, 6, 0], 48)
In [26]:
df_0610.head()
Out[26]:
In [27]:
def predict_df(df, clf):
# ------------------預測------------------
# 輸入特徵值
df_feature_predict_me = df[['waterlevel1', 'waterlevel2','rainfall2','rainfall3','rainfall4','rainfall5','rainfall6']]
# 轉成 numpy array
X_predict_me = np.array(df_feature_predict_me)
# 用上面完成的模型來預測 輸入 X 得到 y
forecast_set = clf.predict(X_predict_me)
# 將預測出來的值在原有表格新增一個欄位放進去
df['predict_wl'] = pd.Series(forecast_set, index = df.index)
# ----------------------------------------
# ------------------算誤差-----------------
error_sum = 0
error_com_sum = 0
error_square_sum = 0
com_error_square_sum = 0
for i in range(len(df.index)):
error_square = ((df['predict_wl'][i] - df['waterlevel'][i]) ** 2)
error_square_sum += error_square
com_error_square = ((df['waterlevel2'][i] - df['waterlevel'][i]) ** 2)
com_error_square_sum += com_error_square
rmse = np.sqrt(error_square_sum / len(df.index))
brmse = np.sqrt(com_error_square_sum / len(df.index))
print ('RMS error:', round(rmse*100, 2), 'cm')
# ----------------------------------------
In [28]:
# 設定圖片大小
mpl.rcParams['figure.figsize'] = (15,12)
# 分割成 2X2 四格
ax1 = plt.subplot2grid((2,2),(0,0))
ax2 = plt.subplot2grid((2,2),(1,0), sharex=ax1)
ax3 = plt.subplot2grid((2,2),(0,1))
ax4 = plt.subplot2grid((2,2),(1,1))
df[['cl','wl']].plot(ax= ax1, linewidth = 1, color=['r','b'])
df['cr'].plot(ax= ax2, label="rf1", linewidth = 1, color='g')
ax1.set_ylabel('Waterlevel')
ax2.set_ylabel('CT rainfall')
print ('6/10')
predict_df(df_0610, clf)
df_0610[['waterlevel','predict_wl']].plot(ax= ax3, linewidth = 1)
ax3.set_ylabel('0610_Waterlevel')
print ('7/6')
predict_df(df_0706, clf)
df_0706[['waterlevel','predict_wl']].plot(ax= ax4, linewidth = 1)
ax4.set_ylabel('0706_Waterlevel')
plt.show()
In [29]:
from sklearn.externals import joblib # 可用來儲存模型的套件
# 儲存模型 (第一個參數放模型,第二個放要存成的檔名)
joblib.dump(clf, 'files/clf.pkl')
Out[29]:
In [30]:
clf_new = joblib.load('files/clf.pkl')
In [31]:
# 設定圖片大小
mpl.rcParams['figure.figsize'] = (15,12)
# 分割成 2X2 四格
ax1 = plt.subplot2grid((2,2),(0,0))
ax2 = plt.subplot2grid((2,2),(1,0), sharex=ax1)
ax3 = plt.subplot2grid((2,2),(0,1))
ax4 = plt.subplot2grid((2,2),(1,1))
df[['cl','wl']].plot(ax= ax1, linewidth = 1, color=['r','b'])
df['cr'].plot(ax= ax2, label="rf1", linewidth = 1, color='g')
ax1.set_ylabel('Waterlevel')
ax2.set_ylabel('CT rainfall')
print ('6/10')
predict_df(df_0610, clf_new)
df_0610[['waterlevel','predict_wl']].plot(ax= ax3, linewidth = 1)
ax3.set_ylabel('0610_Waterlevel')
print ('7/6')
predict_df(df_0706, clf_new)
df_0706[['waterlevel','predict_wl']].plot(ax= ax4, linewidth = 1)
ax4.set_ylabel('0706_Waterlevel')
plt.show()