In [1]:
# Load CSV
import pandas as pd
import numpy as np
filename = 'total_cases.csv'
# Loading with Pandas
data = pd.read_csv(filename)
# Build the numeric day index from the parsed calendar dates instead of from
# category codes. Category codes are only the rank of each distinct date
# *string* in sorted order, so they mis-order non-ISO date formats and collapse
# any gap between reporting dates. Elapsed days match the old codes whenever
# the dates are ISO-formatted and contiguous (the dtype becomes int64).
parsed_dates = pd.to_datetime(data['date'])
data['Day'] = (parsed_dates - parsed_dates.min()).dt.days
# The 'date' column itself is still stored as categorical, as before.
data['date'] = data['date'].astype('category')
print(data.shape)
data
# Uncomment to show more rows of the frame above:
#pd.options.display.max_rows=100
Out[1]:
In [2]:
# 1-D vector of day indices: the single regression feature
# (it is turned into a column matrix later, when the train/test split is made).
X_single = data['Day'].to_numpy()
# 1-D vector taken from the 'World' column: the regression target.
y = data['World'].to_numpy()
In [8]:
import matplotlib.pyplot as plt
import matplotlib._color_data as mcd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
# Hold out the last `test_record_number` records as the test set;
# every earlier record is used for training.
test_record_number = 5
split_at = X_single.shape[0] - test_record_number
# np.newaxis appends an axis so the 1-D feature vector becomes an (n, 1) matrix.
X_train, X_test = X_single[:split_at, np.newaxis], X_single[split_at:, np.newaxis]
y_train, y_test = y[:split_at], y[split_at:]
X_test
Out[8]:
In [12]:
# Plot the observed series and the training points, then fit several models on
# the training split and overlay the polynomial forecast for the held-out days.
lw = 2
plt.rcParams["figure.figsize"] = (12, 7)
plt.plot(X_single, y, color='cornflowerblue', linewidth=lw, label="Total cases COVID19")
plt.scatter(X_train, y_train, color='navy', s=30, marker='o', label="training points")
# Polynomial regression with different degree
# (range(5, 6) currently tries degree 5 only; widen the range to compare degrees).
for degree in range(5, 6):
    # Pipeline: expand the day index into polynomial terms, then fit ordinary
    # least squares on them. PolynomialFeatures emits the constant column by
    # default, so the linear step is fitted without its own intercept.
    model = Pipeline([('poly', PolynomialFeatures(degree=degree)),
                      ('linear', LinearRegression(fit_intercept=False))
                      ])
    model = model.fit(X_train, y_train)
    # astype(int) truncates the fractional part of each predicted count.
    y_poly_forecast = model.predict(X_test).astype(int)
    color = 'C' + str(degree)
    plt.plot(X_test, y_poly_forecast, color=color, linewidth=lw,
             label="Polynomial regression (degree=%d)" % degree)
    # Extrapolate past the observed data and print the forecast as a table.
    # NOTE(review): day indices 81..99 are hard-coded -- presumably the days
    # right after the last record in the CSV; confirm against data['Day'].max().
    # NOTE(review): the original indentation was lost; this forecast is kept
    # inside the loop, which is equivalent while only one degree is tried.
    X_test2 = np.arange(81,100).reshape(-1,1)
    y_poly_forecast2 = model.predict(X_test2).astype(int)
    df1 = pd.DataFrame({'x':X_test2[:,0], 'y':y_poly_forecast2})
    print (df1)
# Logistic regression
# NOTE(review): LogisticRegression is a classifier -- it treats every distinct
# case count in y_train as a class label and can only predict values it has
# already seen, so it is not a growth-curve regressor. Its forecast is not
# plotted; kept only so the names lr_model / y_lr_forecast stay defined.
lr_model = LogisticRegression(solver='liblinear', tol=1e-6, max_iter=int(1e6))
lr_model.fit(X_train, y_train)
y_lr_forecast = lr_model.predict(X_test).astype(int)
#plt.plot(X_test, y_lr_forecast, color='gold', linewidth=lw+1, label="Logistic regression")
# Random Forest
# A fixed random_state makes the forest (and its forecast) reproducible across runs.
rf_model = RandomForestRegressor(random_state=0)
rf_model.fit(X_train, y_train)
y_rf_forecast = rf_model.predict(X_test).astype(int)
#plt.plot(X_test, y_rf_forecast, color='darkgreen', linewidth=lw+1, label="Random Forest")
# Label the figure so it can be read on its own.
plt.xlabel("Day index")
plt.ylabel("Total cases")
plt.title("COVID-19 total cases: observed data and forecast")
plt.legend(loc='best')
plt.show()