In [1]:
import pandas as pd
pd.__version__


Out[1]:
'1.0.3'

In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [3]:
# Load the CSV into a pandas DataFrame and preview the first 15 rows.

df = pd.read_csv("data/PewDiePie.csv")
df.head(15)


Out[3]:
    Date  Subscribers
0      1        71915
1      2        48270
2      3        47746
3      4        42276
4      5        36867
5      6        28722
6      7        29794
7      8        33125
8      9        27877
9     10        30675
10    11        32450
11    12        45714
12    13        59377
13    14        42289
14    15        36002
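
Before fitting anything, it can help to confirm the size of the frame and check for missing values. A minimal sketch, assuming the same data/PewDiePie.csv has been loaded into df as above:

# Quick sanity checks on the loaded frame.
print(df.shape)           # expected: (20, 2) -- 20 daily rows, 2 columns
print(df.dtypes)          # Date and Subscribers should both be integer columns
print(df.isnull().sum())  # missing values per column (expected: 0 for both)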

In [4]:
# Feature: the Date column, wrapped in a one-column DataFrame because
# scikit-learn expects a 2-D feature matrix.
x = df.iloc[:, 0]
x_df = pd.DataFrame(x)

# Target: the Subscribers column as a Series.
y = df.iloc[:, 1]

print(x_df)
print(y)


    Date
0      1
1      2
2      3
3      4
4      5
5      6
6      7
7      8
8      9
9     10
10    11
11    12
12    13
13    14
14    15
15    16
16    17
17    18
18    19
19    20
0     71915
1     48270
2     47746
3     42276
4     36867
5     28722
6     29794
7     33125
8     27877
9     30675
10    32450
11    45714
12    59377
13    42289
14    36002
15    25872
16    31623
17    46504
18    49421
19    27844
Name: Subscribers, dtype: int64
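
The Date column is wrapped in a one-column DataFrame because scikit-learn expects a 2-D feature matrix; passing the bare Series would need an explicit reshape. A minimal sketch of two equivalent ways to get that shape, assuming the df loaded above:

# Option 1: select with a list of column names to keep a (20, 1) DataFrame.
x_2d = df[['Date']]

# Option 2: reshape the Series values into a (20, 1) NumPy column vector.
x_col = df['Date'].values.reshape(-1, 1)

print(x_2d.shape, x_col.shape)  # expected: (20, 1) (20, 1)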

In [5]:
# Hold out 30% of the rows for testing; random_state=0 makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(x_df, y, test_size=0.3, random_state=0)
print(x_test)
print(y_test)
print(x_train)
print(y_train)
df.head(10)


    Date
18    19
1      2
19    20
8      9
10    11
17    18
18    49421
1     48270
19    27844
8     27877
10    32450
17    46504
Name: Subscribers, dtype: int64
    Date
6      7
13    14
4      5
2      3
5      6
14    15
9     10
7      8
16    17
11    12
3      4
0      1
15    16
12    13
6     29794
13    42289
4     36867
2     47746
5     28722
14    36002
9     30675
7     33125
16    31623
11    45714
3     42276
0     71915
15    25872
12    59377
Name: Subscribers, dtype: int64
Out[5]:
   Date  Subscribers
0     1        71915
1     2        48270
2     3        47746
3     4        42276
4     5        36867
5     6        28722
6     7        29794
7     8        33125
8     9        27877
9    10        30675
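
With test_size=0.3 on 20 rows, the split keeps 14 rows for training and holds out 6 for testing, which matches the frames printed above. A quick check, reusing the split variables:

# Confirm the 70/30 split sizes.
print(len(x_train), len(x_test))  # expected: 14 6
print(len(y_train), len(y_test))  # expected: 14 6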

In [6]:
# Train a simple linear regression model on the training split.
simpleLinearRegression = LinearRegression()
simpleLinearRegression.fit(x_train, y_train)


Out[6]:
LinearRegression()
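
The fitted model is just a straight line, y_hat = intercept + slope * date, so its predictions can be reproduced by hand from the coef_ and intercept_ attributes. A minimal sketch, assuming the simpleLinearRegression fitted above:

# Rebuild the predictions from the fitted slope and intercept.
manual = simpleLinearRegression.intercept_ + simpleLinearRegression.coef_[0] * x_test['Date']
print(np.allclose(manual, simpleLinearRegression.predict(x_test)))  # expected: True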

In [7]:
# Predict on the held-out test set and compare predictions with the real values.
y_predict = simpleLinearRegression.predict(x_test)
x_test_series = x_test.iloc[:,0]
predict = pd.DataFrame({'X Test': x_test_series, 'Y Predict': y_predict, 'Y Real': y_test})
predict.head(10)


Out[7]:
    X Test     Y Predict  Y Real
18      19  31041.644085   49421
1        2  47086.367846   48270
19      20  30097.836805   27844
8        9  40479.716886   27877
10      11  38592.102326   32450
17      18  31985.451365   46504
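
A residual column makes it easy to see how far each prediction misses. A small sketch that extends the predict frame built above:

# Residual = real value minus predicted value for each test day.
predict['Residual'] = predict['Y Real'] - predict['Y Predict']
print(predict[['X Test', 'Residual']])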

In [8]:
# Model evaluation on the test set.
mse = mean_squared_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)

# Fitted coefficients and evaluation metrics.
print('Slope:', simpleLinearRegression.coef_)
print('Intercept:', simpleLinearRegression.intercept_)
print('Variance score: {}'.format(simpleLinearRegression.score(x_test, y_test)))
print('Mean squared error: ', mse)
print('R2 score: ', r2)


Slope: [-943.80728008]
Intercept: 48973.98240647118
Variance score: -0.3881037131710081
Mean squared error:  125270606.26955862
R2 score:  -0.3881037131710081
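
Note that mean_squared_error returns the MSE; a root-mean-squared error, which is in the same units as the subscriber counts, needs an explicit square root. A minimal sketch, using the mse computed above:

# RMSE is the square root of the MSE, in the same units as y.
rmse = np.sqrt(mse)
print('Root mean squared error: ', rmse)  # roughly 11,192 new subscribers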

In [9]:
# Plot predicted vs. real subscriber counts for the test set.

# Overlapping semi-transparent bars: red for the predictions, green for the real counts.
plt.bar(x_test_series, y_predict, label="Y Predicted", color=[1, 0, 0, 0.5])
plt.bar(x_test_series, y_test, label="Y Real", color=[0, 1, 0, 0.5])

# Regression line over the test points.
plt.plot(x_test_series, y_predict, color='r')

plt.xlabel("Date (day)")
plt.ylabel("New subscribers")
plt.title("Predicted vs. Real New Subscribers")
plt.legend()
plt.show()
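
Because the test x values come out of train_test_split unsorted, the line above zigzags; a scatter of the real values plus a line drawn over sorted x values can read more clearly than overlapping bars. A minimal sketch using the same test-set variables:

# Sort the test points by date so the fitted line is drawn left to right.
line_df = pd.DataFrame({'x': x_test_series.values, 'y_hat': y_predict}).sort_values('x')

plt.scatter(x_test_series, y_test, color='g', label='Y Real')
plt.plot(line_df['x'], line_df['y_hat'], color='r', label='Fitted line')
plt.xlabel("Date (day)")
plt.ylabel("New subscribers")
plt.legend()
plt.show()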


