Import required libraries


In [306]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

In [307]:
# Show matplotlib plots inline in the Jupyter notebook
%matplotlib inline

Read Data


In [308]:
# Getting the training dataset ( Variant #1 )
# Importing data as a DataFrame
df = pd.read_csv('../Dataset/Dataset/Training/Features_Variant_1.csv',sep=',', header=None)

# Importing data as numpy array 
X = np.genfromtxt('../Dataset/Dataset/Training/Features_Variant_1.csv', delimiter=",")

In [309]:
# Target variable: the last column (index 53) of the raw dataset
y = X[:,53]
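
A quick sanity check (a minimal sketch using the objects loaded above): both loaders read the same file, so the DataFrame and the numpy array should agree in shape, and y should equal the last column of the DataFrame.

In [ ]:
# The CSV has 54 columns; column 53 (the last one) is the target
print(df.shape, X.shape)
print(np.allclose(y, df[53].values))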

Functions Section


In [310]:
def matrixToNumber(df):
    
    # Check whether df is already a DataFrame
    if isinstance(df, pd.DataFrame):
        df_temp = np.ones((df.shape[0],1), dtype='int')

    # Convert df to a DataFrame if it is a numpy array or a Series
    elif isinstance(df, (np.ndarray, pd.Series)):
        df = pd.DataFrame(df)
        df_temp = np.ones((df.shape[0],1), dtype='int')
        
    else:
        return "Function expects a DataFrame"
    
    # Relabel the columns so they start from 0
    df.columns = df.columns - df.columns[0]
        
    # Find the weekday the post was published: rows where one-hot column a is 1
    # get the value (number of columns - a - 1)
    for a in df.columns:
        df_temp[df[a] == 1] = int(df.shape[1]) - int(a) - 1
    
    # Return a flat numpy array for easier later use
    return np.array(df_temp).T[0]

# Drop column with integer label
def drop_int(df, val):
    
    # Check whether df is a DataFrame
    if isinstance(df, pd.DataFrame):
        df = df.T.drop(val,axis=0).T
        df.rename(columns=(lambda x: ( x - 1 if x > val else x)),inplace=True)
        
        return df
    else:
        return "Function expects a DataFrame"

def plotLearningCurves(X,y,step):
    m,n = X.shape
    maxVal = (int)(m / 10) * 10
    N_size_arr = np.arange(10, maxVal + 10, step)
    error_arr = np.zeros(( len(np.arange(10, maxVal + 10, step)) ,2 ))
    index = 0
    
    # Increasing train dataset size, "step" times in each iteration
    for i in N_size_arr:
        
        # Splitting Training dataset with size i into train and cross validation sets
        X_train, X_test, y_train, y_test = train_test_split(X[:i,:], y[:i], test_size=0.33, random_state=42)
        
        # Fitting Model
        lm.fit(X_train, y_train)
        
        # Computing the mean squared error of the predictions on both the training and cross-validation sets
        error_arr[index,0] = mean_squared_error(y_train , lm.predict(X_train))
        error_arr[index,1] = mean_squared_error(y_test, lm.predict(X_test))
        
        # Increasing index with 1
        index += 1
    
    # Initializing the figure
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_axes([0,0,1,1])
    ax.set_yscale('log')
    
    # Plotting "Training set size" vs. "Mean Squared Error" for both of the train and cross validation dataset's errors
    line1, = ax.plot(N_size_arr,error_arr[:,0], c='red')
    line2, = ax.plot(N_size_arr,error_arr[:,1], c='blue')
    
    # Adding labels && legends to our plot
    ax.set_xlabel("N (Training set size)")
    ax.set_ylabel("Mean Squared Error")
    
    ax.legend((line1,line2),("Train Error","Test Error"))
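
As a side note, the one-hot weekday columns (39-45, the same block passed to matrixToNumber later in the notebook) can also be decoded with a vectorized argmax. This is only an equivalent sketch for comparison, assuming each row has exactly one weekday flag set; it is not the function used above.

In [ ]:
# Vectorized alternative to matrixToNumber for the weekday block:
# argmax finds the position of the 1 in each row, and (6 - argmax)
# reproduces the reversed numbering used by the loop above
one_hot = df.iloc[:, 39:46].values
weekday_fast = 6 - np.argmax(one_hot, axis=1)
print(np.array_equal(weekday_fast, matrixToNumber(df.T[39:46].T)))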

In [311]:
a = np.ones((12))
type(a)


Out[311]:
numpy.ndarray

Dataset Analysis


In [312]:
# Show the first 10 rows of the dataset
df.head(10)


Out[312]:
0 1 2 3 4 5 6 7 8 9 ... 44 45 46 47 48 49 50 51 52 53
0 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 0 1 0
1 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 1 0 0
2 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 1 0 0 0 0 0 0 0 1 0
3 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 1 0 0 1 0 0 0 0 0 0
4 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 1 0 0 0 0
5 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 1 0 0
6 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 0 1 0
7 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 0 1 0
8 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 1 0 0 0 0 0 0 0 1 0
9 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 1 0 1 0 0 0 0 0 0 0

10 rows × 54 columns


In [313]:
# Example row of the numpy array X
X[0]


Out[313]:
array([  6.34995000e+05,   0.00000000e+00,   4.63000000e+02,
         1.00000000e+00,   0.00000000e+00,   8.06000000e+02,
         1.12910448e+01,   1.00000000e+00,   7.04951385e+01,
         0.00000000e+00,   8.06000000e+02,   7.57462687e+00,
         0.00000000e+00,   6.94358264e+01,   0.00000000e+00,
         7.60000000e+01,   2.60447761e+00,   0.00000000e+00,
         8.50550187e+00,   0.00000000e+00,   8.06000000e+02,
         1.06492537e+01,   1.00000000e+00,   7.02547876e+01,
        -6.90000000e+01,   8.06000000e+02,   4.97014925e+00,
         0.00000000e+00,   6.98505804e+01,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   6.50000000e+01,   1.66000000e+02,
         2.00000000e+00,   0.00000000e+00,   2.40000000e+01,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   1.00000000e+00,   0.00000000e+00])

In [314]:
# Discovering more information about the dataset
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40949 entries, 0 to 40948
Data columns (total 54 columns):
0     40949 non-null int64
1     40949 non-null int64
2     40949 non-null int64
3     40949 non-null int64
4     40949 non-null float64
5     40949 non-null float64
6     40949 non-null float64
7     40949 non-null float64
8     40949 non-null float64
9     40949 non-null float64
10    40949 non-null float64
11    40949 non-null float64
12    40949 non-null float64
13    40949 non-null float64
14    40949 non-null float64
15    40949 non-null float64
16    40949 non-null float64
17    40949 non-null float64
18    40949 non-null float64
19    40949 non-null float64
20    40949 non-null float64
21    40949 non-null float64
22    40949 non-null float64
23    40949 non-null float64
24    40949 non-null float64
25    40949 non-null float64
26    40949 non-null float64
27    40949 non-null float64
28    40949 non-null float64
29    40949 non-null int64
30    40949 non-null int64
31    40949 non-null int64
32    40949 non-null int64
33    40949 non-null int64
34    40949 non-null int64
35    40949 non-null int64
36    40949 non-null int64
37    40949 non-null int64
38    40949 non-null int64
39    40949 non-null int64
40    40949 non-null int64
41    40949 non-null int64
42    40949 non-null int64
43    40949 non-null int64
44    40949 non-null int64
45    40949 non-null int64
46    40949 non-null int64
47    40949 non-null int64
48    40949 non-null int64
49    40949 non-null int64
50    40949 non-null int64
51    40949 non-null int64
52    40949 non-null int64
53    40949 non-null int64
dtypes: float64(25), int64(29)
memory usage: 16.9 MB

In [315]:
df.describe()


Out[315]:
0 1 2 3 4 5 6 7 8 9 ... 44 45 46 47 48 49 50 51 52 53
count 4.094900e+04 40949.000000 4.094900e+04 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 ... 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000
mean 1.313814e+06 4676.133752 4.480025e+04 24.254780 1.586241 443.333854 55.720384 35.645535 67.464151 0.219468 ... 0.146157 0.136926 0.141640 0.132506 0.137635 0.148599 0.150846 0.143886 0.144888 7.322889
std 6.785752e+06 20593.184863 1.109338e+05 19.950583 20.753174 496.695198 86.933548 69.960232 81.568249 10.055146 ... 0.353268 0.343774 0.348684 0.339045 0.344520 0.355698 0.357903 0.350979 0.351992 35.494550
min 3.600000e+01 0.000000 0.000000e+00 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 3.673400e+04 0.000000 6.980000e+02 9.000000 0.000000 45.000000 5.527273 2.000000 8.278756 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 2.929110e+05 0.000000 7.045000e+03 18.000000 0.000000 241.000000 23.374101 12.000000 35.069140 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 1.204214e+06 99.000000 5.026400e+04 32.000000 0.000000 717.000000 71.828829 42.000000 102.554954 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.000000
max 4.869723e+08 186370.000000 6.089942e+06 106.000000 2341.000000 2341.000000 2341.000000 2341.000000 731.394558 1923.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1305.000000

8 rows × 54 columns


In [316]:
# Getting unique values of "H Local" Feature
np.unique(X[:,38])


Out[316]:
array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,
        23.,  24.])

Feature Normalization


In [317]:
# Import preprocessing functions in sklearn
from sklearn import preprocessing

# Normalize the dataset features: each sample (row) is scaled to unit L2 norm
X_norm =  preprocessing.normalize(X[:,:53],axis= 1, copy= True)

# Re-attach the (unchanged) target column
X_temp = X.copy()
X_temp[:,0:53] = X_norm
X_temp[:,53] = X[:,53]
X_norm = X_temp

# Converting X_norm to Dataframe
df_norm = pd.DataFrame(X_norm)
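
As a quick check (a minimal sketch): preprocessing.normalize with axis=1 scales every sample of the feature block to unit L2 norm, while the target column is carried over unchanged.

In [ ]:
# Every row of the normalized feature block should have L2 norm 1
# (no row is all zeros, since column 0 is always positive),
# and the target column must be identical to the original
print(np.allclose(np.linalg.norm(X_norm[:, :53], axis=1), 1.0))
print(np.allclose(X_norm[:, 53], X[:, 53]))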

In [318]:
# Summary statistics after normalization
df_norm.describe()


Out[318]:
0 1 2 3 4 5 6 7 8 9 ... 44 45 46 47 48 49 50 51 52 53
count 40949.000000 40949.000000 40949.000000 4.094900e+04 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 4.094900e+04 ... 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000
mean 0.983812 0.025267 0.058903 2.217905e-03 0.000008 0.002787 0.000311 0.000179 0.000434 6.565721e-07 ... 0.000010 0.000010 0.000010 0.000009 0.000012 0.000013 0.000011 0.000011 0.000010 7.322889
std 0.067383 0.116617 0.081969 1.217793e-02 0.000256 0.008209 0.000954 0.000683 0.001181 7.319091e-05 ... 0.000111 0.000118 0.000121 0.000103 0.000192 0.000186 0.000148 0.000144 0.000121 35.494550
min 0.047232 0.000000 0.000000 6.148958e-09 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.994518 0.000000 0.007092 8.974377e-06 0.000000 0.000291 0.000036 0.000009 0.000050 0.000000e+00 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.999174 0.000000 0.030617 7.406778e-05 0.000000 0.000776 0.000085 0.000038 0.000111 0.000000e+00 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 0.999874 0.000198 0.072375 6.367665e-04 0.000000 0.002274 0.000250 0.000126 0.000338 0.000000e+00 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.000000
max 1.000000 0.988438 0.566954 7.053838e-01 0.026099 0.103350 0.026099 0.026099 0.019514 1.304934e-02 ... 0.005158 0.006187 0.006187 0.004974 0.021262 0.021482 0.009161 0.011600 0.008462 1305.000000

8 rows × 54 columns

Adding Average Of CC2, CC3, CC4 for later use


In [319]:
# We use this column when we want to plot "Average of Total Comments" vs. "Page Likes", etc.
temp = np.mean(X_norm[:,31:34],axis=1)

# Inserting new column before target column
X_norm = np.insert(arr=X_norm,obj=53,values=temp, axis=1)

# Converting the newly created matrix to a DataFrame
df_norm = pd.DataFrame(X_norm)
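
A short sanity check (a sketch): np.insert with obj=53 places the averaged column at index 53, so the original target moves to index 54.

In [ ]:
# X_norm now has 55 columns: the new average sits at index 53
# and the original target column has shifted to index 54
print(X_norm.shape)
print(np.allclose(X_norm[:, 53], temp))
print(np.allclose(X_norm[:, 54], y))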

Convert all of the columns related to "Post Published Weekday" into a single Series


In [320]:
# Decode the one-hot "Post Published Weekday" columns (39-45) into a single column
weekday = matrixToNumber(df.T[39:46].T)

# Insert the decoded weekday before the target column
X = np.insert(arr=X,obj=53,values=weekday, axis=1)
df = pd.DataFrame(X)

Plot the data


In [321]:
fig = plt.figure()

axes1 = fig.add_axes([0, 2, 0.8, 0.8])
axes2 = fig.add_axes([1,2,0.8,0.8])
axes3 = fig.add_axes([0,1,0.8,0.8])
axes4 = fig.add_axes([1,1,0.8,0.8])
axes5 = fig.add_axes([0,0,0.8,0.8])

# "Pages Likes/Popularity" Vs. "Pages Category"
axes1.plot(X_norm[:,35], X_norm[:,0], marker='o', markersize=5, lw= 0)
axes1.set_xlabel("Post Length")
axes1.set_ylabel("Post Likes/Popularity")

# =====> We can conclude when posts become very long, popularity of the posts decrease !!!!

# "Page popularity/Likes" vs. "Share Counts"
axes2.plot(X_norm[:,0], X_norm[:,36], marker='o', markersize=5, lw= 0)
axes2.set_xlabel("Page Popularity/Likes")
axes2.set_ylabel("Post Share Count ")

# ======> We can conclude that as popularity increases, the number of shares increases too

# "Page talking about" vs. "Share Counts"
axes3.plot(X_norm[:,2], X_norm[:,36], marker='o', markersize=5, lw= 0)
axes3.set_xlabel("Page talking about")
axes3.set_ylabel("Post Share Count")

# ======> There is no clear relationship here

# "Average of Comments ( Average of CC2, CC3, CC4 )" vs. "Page popularity/likes"
axes4.plot(X_norm[:,0],X_norm[:,53], marker='o', markersize=5, lw= 0)
axes4.set_xlabel("Page popularity/likes")
axes4.set_ylabel("Average of Comments ( Average of CC2, CC3, CC4 )")

# ======> We can conclude that as page popularity increases, the number of comments increases too

# "Average of Comments ( Average of CC2, CC3, CC4 )" vs. "Page popularity/likes"
axes5.plot(X_norm[:,35],X_norm[:,53], marker='o', markersize=5, lw= 0)
axes5.set_xlabel("Post Length")
axes5.set_ylabel("Average of Comments ( Average of CC2, CC3, CC4 )")

# ======> We can conclude that when posts become very long, the average number of comments decreases


Out[321]:
<matplotlib.text.Text at 0xa41beac390>

In [322]:
fig = plt.figure()

axes1 = fig.add_axes([0, 2, 0.8, 0.8])
axes2 = fig.add_axes([1,2,0.8,0.8])
axes3 = fig.add_axes([0,1,0.8,0.8])
axes4 = fig.add_axes([1,1,0.8,0.8])
axes5 = fig.add_axes([0,0,1.8,0.8])

# Before plotting the data, we convert all of the columns related to "Post Published Weekday" into a single Series
weekday = matrixToNumber(df.T[39:46].T)

# plot "Post published weekday" Countplot
sns.countplot(weekday ,palette='viridis', ax= axes1)
axes1.set_xlabel("Published Weekday")
axes1.set_xticklabels(["Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"])

# plot "H local" Countplot
axes2.set_yscale("log")
sns.countplot(df[38].astype('int') ,palette= 'viridis', ax= axes2)
axes2.set_xlabel("H Local")

# =====> We can see that the majority of posts were published at H Local = 24 and on Wednesday

# Plot "Promotion" Countplot
axes3.set_yscale("log")
sns.countplot(df[37] ,palette= 'viridis', ax= axes3)
axes3.set_xlabel("Promotion")

# =====> We can see that the promotion status of every post is 0, so this column has no effect on our predictions
# =====> and can be ignored

# Plot "Target Variable" Countplot
sns.countplot(df[54].astype('int'), palette= 'viridis', ax= axes4)
axes4.set_yscale("log")
axes4.set_xlabel("Target Variable")

# Plot "Category" Countplot
sns.countplot(df[3].astype('int'),palette= 'viridis', ax= axes5)
axes5.set_xlabel("Page Category")

# ======> We can see that categories 9, 18 and 36 account for the majority of posts


Out[322]:
<matplotlib.text.Text at 0xa40da08cc0>

In [323]:
# "Post published weekday" vs. "H local"
plt.plot(weekday,df[38], marker='o', markersize=5, lw= 0)

# =====> We can see that except for Wednesday and Thursday, most of the posts were published at H Local = 24


Out[323]:
[<matplotlib.lines.Line2D at 0xa412a17470>]

In [324]:
fig = plt.figure(figsize=(12,10))

axes1 = fig.add_axes([0, 2, 2, 1])
axes2 = fig.add_axes([0,1,2,1])

# Plot "Category" vs. "Popularity" barplot
axes1.set_yscale('log')
sns.barplot(x="3",y="0",data=df.rename(columns=(lambda x: str(x))).astype('int'), palette='viridis',ax=axes1)
axes1.set_xlabel("Page Category")
axes1.set_ylabel("Page Popularity/Likes")

# =====> We can conclude that categories "47", "61" and "33" have the highest popularity

# Plot "Category" vs. "Share amount" barplot
sns.barplot(x="3",y="36",data=df.rename(columns=(lambda x: str(x))).astype('int'), palette='viridis',ax=axes2)
axes2.set_xlabel("Page Category")
axes2.set_ylabel("Page Share Amounts")
# =====> We can conclude that categories "47", "61" and "33" have the highest share amounts


Out[324]:
<matplotlib.text.Text at 0xa40dea0f98>

Prediction Section


In [325]:
# Using sklearn for splitting data into train and test sets
# (older scikit-learn releases exposed this as sklearn.cross_validation)
from sklearn.model_selection import train_test_split

# Using sklearn's linear regression for finding the best thetas ( Linear Regression with multiple Variables )
from sklearn.linear_model import LinearRegression

In [326]:
# Before fitting the model, we should get rid of columns that have no effect on our predictions
X_norm = np.delete(X_norm, 37, 1)
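
The column removed here is "Promotion" (index 37 in the original layout), which the countplot above showed to be constant; a one-line check on the raw data (a minimal sketch) confirms it.

In [ ]:
# "Promotion" is zero for every post, so it carries no information
print(np.unique(X[:, 37]))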

In [327]:
# Splitting the normalized dataset into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.33, random_state=42)

In [328]:
# Initializing Model
lm = LinearRegression()

In [329]:
# Fitting data to model
lm.fit(X_train,y_train)


Out[329]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [330]:
# Model thetas
lm.coef_


Out[330]:
array([ -1.97259076e-13,  -1.26010313e-13,  -4.30211422e-16,
         1.42811964e-13,  -2.66246535e-11,  -1.40792399e-12,
         1.75120599e-11,  -4.76140988e-12,   2.13526862e-11,
         4.43723313e-11,   1.63138981e-12,  -1.34139799e-11,
        -9.68417399e-12,   4.75381053e-12,  -6.13361841e-11,
        -1.62491033e-12,  -5.47973212e-12,  -8.26207156e-12,
        -3.42591410e-12,   3.07854164e-11,   1.23604476e-12,
        -1.01151792e-11,   3.36087324e-12,  -2.47833111e-11,
        -1.72085435e-12,  -1.63807976e-12,  -7.93424782e-12,
         1.22382193e-11,   4.61429518e-12,   9.11239338e-13,
        -2.52168403e-13,  -1.41959039e-13,  -2.15403977e-13,
        -1.10209330e-13,   4.45187139e-15,  -8.91751446e-14,
         6.09518176e-13,  -1.11607898e-12,   1.40154956e-11,
         1.95080439e-11,   1.67083511e-11,   1.31772849e-11,
         1.44126001e-11,   7.20496133e-12,   6.32849850e-12,
         1.82138656e-11,   1.48197957e-11,   1.13775765e-11,
         1.31492350e-11,   8.10719909e-12,   1.39769934e-11,
         1.17105702e-11,  -1.55857446e-13,   1.00000000e+00])
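
Note that the coefficient vector is essentially zero everywhere except the last entry, which is exactly 1. A quick check (a sketch) of whether the last column of X_norm is the target itself, which would explain the near-perfect errors reported below:

In [ ]:
# If this prints True, the target column is still present among the
# features, so the regression can reproduce y exactly (target leakage)
print(np.allclose(X_norm[:, -1], y))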

In [331]:
# Simplest way of evaluating the model ( Mean Squared Error )
pred = lm.predict(X_test)
eval_arr = np.sum(np.power( pred - y_test , 2),axis=0) / len(pred)
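
The hand-rolled formula above is exactly the mean squared error; a minimal sketch confirming it against sklearn's implementation (the same import also appears a few cells below):

In [ ]:
# The manual sum((pred - y)^2) / n is the same quantity that
# sklearn.metrics.mean_squared_error computes
from sklearn.metrics import mean_squared_error
print(np.allclose(eval_arr, mean_squared_error(y_test, pred)))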

In [332]:
# Plot predictions vs. y_test for a better understanding of how our model behaves
fig = plt.figure(figsize=(12,8))
ax = fig.add_axes([0,0,1,1])

ax.set_xlabel("Predictions")
ax.set_ylabel("Test Target Variable")
ax.plot(pred, y_test,'bo')

# ====> The predictions match y_test almost exactly; as the check above shows, this is because the target column is still among the features, so this plot alone doesn't demonstrate real predictive power


Out[332]:
[<matplotlib.lines.Line2D at 0xa41c738ef0>]

In [333]:
# Evaluating model with sklearn functions
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [334]:
print('Mean Absolute Error:', mean_absolute_error(y_test, pred))
print('Mean Squared Error:',mean_squared_error(y_test, pred))


Mean Absolute Error: 4.71754761292e-15
Mean Squared Error: 1.31265944701e-28

In [335]:
"Precision Of model: {}%". format( ( 1 - mean_absolute_error(y_test,pred) ) * 100 )


Out[335]:
'Precision Of model: 99.99999999999953%'
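
The "precision" above is just (1 - MAE) * 100, which treats the absolute error as a fraction; a more conventional summary for regression is the coefficient of determination R². A minimal sketch, assuming the lm, pred, X_test and y_test defined above:

In [ ]:
# R^2 (coefficient of determination): 1.0 means the predictions
# explain all of the variance in y_test
from sklearn.metrics import r2_score
print(r2_score(y_test, pred))
print(lm.score(X_test, y_test))   # LinearRegression.score reports the same R^2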

In [336]:
plotLearningCurves(X_norm,y,500)
# Because there is no continuous gap between Train Error and Test Error, our model isn't suffering from high variance
# Because as we get more data, both train and test errors keep decreasing and stay very small, our model isn't suffering from high bias
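
plotLearningCurves above re-fits lm on growing slices of the data by hand; scikit-learn ships a similar utility. A minimal sketch using sklearn.model_selection.learning_curve (the cv, scoring and train_sizes values here are illustrative choices, not the settings used above):

In [ ]:
# Library alternative to the hand-written learning-curve plot:
# learning_curve handles the splitting and repeated fitting itself
from sklearn.model_selection import learning_curve

train_sizes, train_scores, valid_scores = learning_curve(
    LinearRegression(), X_norm, y,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='neg_mean_squared_error', cv=3)

plt.figure(figsize=(12, 8))
plt.yscale('log')
plt.plot(train_sizes, -train_scores.mean(axis=1), c='red', label='Train Error')
plt.plot(train_sizes, -valid_scores.mean(axis=1), c='blue', label='Test Error')
plt.xlabel("N (Training set size)")
plt.ylabel("Mean Squared Error")
plt.legend()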



In [ ]: