Import required libraries


In [306]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

In [307]:
# Show matplotlib plots inline in the Jupyter notebook
%matplotlib inline

Read Data


In [308]:
# Getting the training dataset ( Variant #1 )
# Importing data as a DataFrame
df = pd.read_csv('../Dataset/Dataset/Training/Features_Variant_1.csv',sep=',', header=None)

# Importing data as numpy array 
X = np.genfromtxt('../Dataset/Dataset/Training/Features_Variant_1.csv', delimiter=",")

In [309]:
# Target variable: the last column (index 53) of the raw dataset
y = X[:,53]
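
A quick sanity check (a minimal sketch using the objects loaded above): both loaders read the same file, so the DataFrame and the numpy array should agree in shape, and y should equal the last column of the DataFrame.

In [ ]:
# The CSV has 54 columns; column 53 (the last one) is the target
print(df.shape, X.shape)
print(np.allclose(y, df[53].values))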

Functions Section


In [310]:
def matrixToNumber(df):
    
    # Check whether df is already a DataFrame
    if isinstance(df, pd.DataFrame):
        df_temp = np.ones((df.shape[0],1), dtype='int')

    # Convert df to a DataFrame if it is a numpy array or a Series
    elif isinstance(df, (np.ndarray, pd.Series)):
        df = pd.DataFrame(df)
        df_temp = np.ones((df.shape[0],1), dtype='int')
        
    else:
        return "Function expects a DataFrame"
    
    # Relabel the columns so they start from 0
    df.columns = df.columns - df.columns[0]
        
    # Find the weekday the post was published: rows where one-hot column a is 1
    # get the value (number of columns - a - 1)
    for a in df.columns:
        df_temp[df[a] == 1] = int(df.shape[1]) - int(a) - 1
    
    # Return a flat numpy array for easier later use
    return np.array(df_temp).T[0]

# Drop column with integer label
def drop_int(df, val):
    
    # Check whether df is a DataFrame
    if isinstance(df, pd.DataFrame):
        df = df.T.drop(val,axis=0).T
        df.rename(columns=(lambda x: ( x - 1 if x > val else x)),inplace=True)
        
        return df
    else:
        return "Function expects a DataFrame"

def plotLearningCurves(X,y,step):
    m,n = X.shape
    maxVal = (int)(m / 10) * 10
    N_size_arr = np.arange(10, maxVal + 10, step)
    error_arr = np.zeros(( len(np.arange(10, maxVal + 10, step)) ,2 ))
    index = 0
    
    # Increasing train dataset size, "step" times in each iteration
    for i in N_size_arr:
        
        # Splitting Training dataset with size i into train and cross validation sets
        X_train, X_test, y_train, y_test = train_test_split(X[:i,:], y[:i], test_size=0.33, random_state=42)
        
        # Fitting Model
        lm.fit(X_train, y_train)
        
        # Computing the mean squared error of the predictions on both the training and cross-validation sets
        error_arr[index,0] = mean_squared_error(y_train , lm.predict(X_train))
        error_arr[index,1] = mean_squared_error(y_test, lm.predict(X_test))
        
        # Increasing index with 1
        index += 1
    
    # Initializing the figure
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_axes([0,0,1,1])
    ax.set_yscale('log')
    
    # Plotting "Training set size" vs. "Mean Squared Error" for both of the train and cross validation dataset's errors
    line1, = ax.plot(N_size_arr,error_arr[:,0], c='red')
    line2, = ax.plot(N_size_arr,error_arr[:,1], c='blue')
    
    # Adding labels && legends to our plot
    ax.set_xlabel("N (Training set size)")
    ax.set_ylabel("Mean Squared Error")
    
    ax.legend((line1,line2),("Train Error","Test Error"))
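
As a side note, the one-hot weekday columns (39-45, the same block passed to matrixToNumber later in the notebook) can also be decoded with a vectorized argmax. This is only an equivalent sketch for comparison, assuming each row has exactly one weekday flag set; it is not the function used above.

In [ ]:
# Vectorized alternative to matrixToNumber for the weekday block:
# argmax finds the position of the 1 in each row, and (6 - argmax)
# reproduces the reversed numbering used by the loop above
one_hot = df.iloc[:, 39:46].values
weekday_fast = 6 - np.argmax(one_hot, axis=1)
print(np.array_equal(weekday_fast, matrixToNumber(df.T[39:46].T)))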

In [311]:
a = np.ones((12))
type(a)


Out[311]:
numpy.ndarray

Dataset Analysis


In [312]:
# Show the first 10 rows of the dataset
df.head(10)


Out[312]:
0 1 2 3 4 5 6 7 8 9 ... 44 45 46 47 48 49 50 51 52 53
0 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 0 1 0
1 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 1 0 0
2 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 1 0 0 0 0 0 0 0 1 0
3 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 1 0 0 1 0 0 0 0 0 0
4 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 1 0 0 0 0
5 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 1 0 0
6 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 0 1 0
7 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 0 0 0 0 0 0 0 0 1 0
8 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 1 0 0 0 0 0 0 0 1 0
9 634995 0 463 1 0.0 806.0 11.291045 1.0 70.495138 0.0 ... 1 0 1 0 0 0 0 0 0 0

10 rows × 54 columns


In [313]:
# Example row of the numpy array X
X[0]


Out[313]:
array([  6.34995000e+05,   0.00000000e+00,   4.63000000e+02,
         1.00000000e+00,   0.00000000e+00,   8.06000000e+02,
         1.12910448e+01,   1.00000000e+00,   7.04951385e+01,
         0.00000000e+00,   8.06000000e+02,   7.57462687e+00,
         0.00000000e+00,   6.94358264e+01,   0.00000000e+00,
         7.60000000e+01,   2.60447761e+00,   0.00000000e+00,
         8.50550187e+00,   0.00000000e+00,   8.06000000e+02,
         1.06492537e+01,   1.00000000e+00,   7.02547876e+01,
        -6.90000000e+01,   8.06000000e+02,   4.97014925e+00,
         0.00000000e+00,   6.98505804e+01,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   6.50000000e+01,   1.66000000e+02,
         2.00000000e+00,   0.00000000e+00,   2.40000000e+01,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   1.00000000e+00,   0.00000000e+00])

In [314]:
# Discovering more information about the dataset
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40949 entries, 0 to 40948
Data columns (total 54 columns):
0     40949 non-null int64
1     40949 non-null int64
2     40949 non-null int64
3     40949 non-null int64
4     40949 non-null float64
5     40949 non-null float64
6     40949 non-null float64
7     40949 non-null float64
8     40949 non-null float64
9     40949 non-null float64
10    40949 non-null float64
11    40949 non-null float64
12    40949 non-null float64
13    40949 non-null float64
14    40949 non-null float64
15    40949 non-null float64
16    40949 non-null float64
17    40949 non-null float64
18    40949 non-null float64
19    40949 non-null float64
20    40949 non-null float64
21    40949 non-null float64
22    40949 non-null float64
23    40949 non-null float64
24    40949 non-null float64
25    40949 non-null float64
26    40949 non-null float64
27    40949 non-null float64
28    40949 non-null float64
29    40949 non-null int64
30    40949 non-null int64
31    40949 non-null int64
32    40949 non-null int64
33    40949 non-null int64
34    40949 non-null int64
35    40949 non-null int64
36    40949 non-null int64
37    40949 non-null int64
38    40949 non-null int64
39    40949 non-null int64
40    40949 non-null int64
41    40949 non-null int64
42    40949 non-null int64
43    40949 non-null int64
44    40949 non-null int64
45    40949 non-null int64
46    40949 non-null int64
47    40949 non-null int64
48    40949 non-null int64
49    40949 non-null int64
50    40949 non-null int64
51    40949 non-null int64
52    40949 non-null int64
53    40949 non-null int64
dtypes: float64(25), int64(29)
memory usage: 16.9 MB

In [315]:
df.describe()


Out[315]:
0 1 2 3 4 5 6 7 8 9 ... 44 45 46 47 48 49 50 51 52 53
count 4.094900e+04 40949.000000 4.094900e+04 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 ... 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000
mean 1.313814e+06 4676.133752 4.480025e+04 24.254780 1.586241 443.333854 55.720384 35.645535 67.464151 0.219468 ... 0.146157 0.136926 0.141640 0.132506 0.137635 0.148599 0.150846 0.143886 0.144888 7.322889
std 6.785752e+06 20593.184863 1.109338e+05 19.950583 20.753174 496.695198 86.933548 69.960232 81.568249 10.055146 ... 0.353268 0.343774 0.348684 0.339045 0.344520 0.355698 0.357903 0.350979 0.351992 35.494550
min 3.600000e+01 0.000000 0.000000e+00 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 3.673400e+04 0.000000 6.980000e+02 9.000000 0.000000 45.000000 5.527273 2.000000 8.278756 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 2.929110e+05 0.000000 7.045000e+03 18.000000 0.000000 241.000000 23.374101 12.000000 35.069140 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 1.204214e+06 99.000000 5.026400e+04 32.000000 0.000000 717.000000 71.828829 42.000000 102.554954 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.000000
max 4.869723e+08 186370.000000 6.089942e+06 106.000000 2341.000000 2341.000000 2341.000000 2341.000000 731.394558 1923.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1305.000000

8 rows × 54 columns


In [316]:
# Getting unique values of "H Local" Feature
np.unique(X[:,38])


Out[316]:
array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,
        23.,  24.])

Feature Normalization


In [317]:
# Import preprocessing functions in sklearn
from sklearn import preprocessing

# Normalize the dataset features: each sample (row) is scaled to unit L2 norm
X_norm =  preprocessing.normalize(X[:,:53],axis= 1, copy= True)

# Re-attach the (unchanged) target column
X_temp = X.copy()
X_temp[:,0:53] = X_norm
X_temp[:,53] = X[:,53]
X_norm = X_temp

# Converting X_norm to Dataframe
df_norm = pd.DataFrame(X_norm)
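
As a quick check (a minimal sketch): preprocessing.normalize with axis=1 scales every sample of the feature block to unit L2 norm, while the target column is carried over unchanged.

In [ ]:
# Every row of the normalized feature block should have L2 norm 1
# (no row is all zeros, since column 0 is always positive),
# and the target column must be identical to the original
print(np.allclose(np.linalg.norm(X_norm[:, :53], axis=1), 1.0))
print(np.allclose(X_norm[:, 53], X[:, 53]))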

In [318]:
# Summary statistics after normalization
df_norm.describe()


Out[318]:
0 1 2 3 4 5 6 7 8 9 ... 44 45 46 47 48 49 50 51 52 53
count 40949.000000 40949.000000 40949.000000 4.094900e+04 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 4.094900e+04 ... 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000 40949.000000
mean 0.983812 0.025267 0.058903 2.217905e-03 0.000008 0.002787 0.000311 0.000179 0.000434 6.565721e-07 ... 0.000010 0.000010 0.000010 0.000009 0.000012 0.000013 0.000011 0.000011 0.000010 7.322889
std 0.067383 0.116617 0.081969 1.217793e-02 0.000256 0.008209 0.000954 0.000683 0.001181 7.319091e-05 ... 0.000111 0.000118 0.000121 0.000103 0.000192 0.000186 0.000148 0.000144 0.000121 35.494550
min 0.047232 0.000000 0.000000 6.148958e-09 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.994518 0.000000 0.007092 8.974377e-06 0.000000 0.000291 0.000036 0.000009 0.000050 0.000000e+00 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.999174 0.000000 0.030617 7.406778e-05 0.000000 0.000776 0.000085 0.000038 0.000111 0.000000e+00 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 0.999874 0.000198 0.072375 6.367665e-04 0.000000 0.002274 0.000250 0.000126 0.000338 0.000000e+00 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.000000
max 1.000000 0.988438 0.566954 7.053838e-01 0.026099 0.103350 0.026099 0.026099 0.019514 1.304934e-02 ... 0.005158 0.006187 0.006187 0.004974 0.021262 0.021482 0.009161 0.011600 0.008462 1305.000000

8 rows × 54 columns

Adding Average Of CC2, CC3, CC4 for later use


In [319]:
# We use this column when we want to plot "Average of Total Comments" vs. "Page Likes", etc.
temp = np.mean(X_norm[:,31:34],axis=1)

# Inserting new column before target column
X_norm = np.insert(arr=X_norm,obj=53,values=temp, axis=1)

# Converting the newly created matrix to a DataFrame
df_norm = pd.DataFrame(X_norm)
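
A short sanity check (a sketch): np.insert with obj=53 places the averaged column at index 53, so the original target moves to index 54.

In [ ]:
# X_norm now has 55 columns: the new average sits at index 53
# and the original target column has shifted to index 54
print(X_norm.shape)
print(np.allclose(X_norm[:, 53], temp))
print(np.allclose(X_norm[:, 54], y))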

Convert all of the columns related to "Post Published Weekday" into a single Series


In [320]:
# Decode the one-hot "Post Published Weekday" columns (39-45) into a single column
weekday = matrixToNumber(df.T[39:46].T)

# Insert the decoded weekday before the target column
X = np.insert(arr=X,obj=53,values=weekday, axis=1)
df = pd.DataFrame(X)

Plot the data


In [321]:
fig = plt.figure()

axes1 = fig.add_axes([0, 2, 0.8, 0.8])
axes2 = fig.add_axes([1,2,0.8,0.8])
axes3 = fig.add_axes([0,1,0.8,0.8])
axes4 = fig.add_axes([1,1,0.8,0.8])
axes5 = fig.add_axes([0,0,0.8,0.8])

# "Pages Likes/Popularity" Vs. "Pages Category"
axes1.plot(X_norm[:,35], X_norm[:,0], marker='o', markersize=5, lw= 0)
axes1.set_xlabel("Post Length")
axes1.set_ylabel("Post Likes/Popularity")

# =====> We can conclude when posts become very long, popularity of the posts decrease !!!!

# "Page popularity/Likes" vs. "Share Counts"
axes2.plot(X_norm[:,0], X_norm[:,36], marker='o', markersize=5, lw= 0)
axes2.set_xlabel("Page Popularity/Likes")
axes2.set_ylabel("Post Share Count ")

# ======> We can conclude that as popularity increases, the number of shares increases too

# "Page talking about" vs. "Share Counts"
axes3.plot(X_norm[:,2], X_norm[:,36], marker='o', markersize=5, lw= 0)
axes3.set_xlabel("Page talking about")
axes3.set_ylabel("Post Share Count")

# ======> There is no clear relationship here

# "Average of Comments ( Average of CC2, CC3, CC4 )" vs. "Page popularity/likes"
axes4.plot(X_norm[:,0],X_norm[:,53], marker='o', markersize=5, lw= 0)
axes4.set_xlabel("Page popularity/likes")
axes4.set_ylabel("Average of Comments ( Average of CC2, CC3, CC4 )")

# ======> We can conclude that as page popularity increases, the number of comments increases too

# "Average of Comments ( Average of CC2, CC3, CC4 )" vs. "Page popularity/likes"
axes5.plot(X_norm[:,35],X_norm[:,53], marker='o', markersize=5, lw= 0)
axes5.set_xlabel("Post Length")
axes5.set_ylabel("Average of Comments ( Average of CC2, CC3, CC4 )")

# ======> We can conclude that when posts become very long, the average number of comments decreases


Out[321]:
<matplotlib.text.Text at 0xa41beac390>

In [322]:
fig = plt.figure()

axes1 = fig.add_axes([0, 2, 0.8, 0.8])
axes2 = fig.add_axes([1,2,0.8,0.8])
axes3 = fig.add_axes([0,1,0.8,0.8])
axes4 = fig.add_axes([1,1,0.8,0.8])
axes5 = fig.add_axes([0,0,1.8,0.8])

# Before plotting the data, we convert all of the columns related to "Post Published Weekday" into a single Series
weekday = matrixToNumber(df.T[39:46].T)

# plot "Post published weekday" Countplot
sns.countplot(weekday ,palette='viridis', ax= axes1)
axes1.set_xlabel("Published Weekday")
axes1.set_xticklabels(["Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"])

# plot "H local" Countplot
axes2.set_yscale("log")
sns.countplot(df[38].astype('int') ,palette= 'viridis', ax= axes2)
axes2.set_xlabel("H Local")

# =====> We can see that the majority of posts were published at H Local = 24 and on Wednesday

# Plot "Promotion" Countplot
axes3.set_yscale("log")
sns.countplot(df[37] ,palette= 'viridis', ax= axes3)
axes3.set_xlabel("Promotion")

# =====> We can see that the promotion status of every post is 0, so this column has no effect on our predictions
# =====> and can be ignored

# Plot "Target Variable" Countplot
sns.countplot(df[54].astype('int'), palette= 'viridis', ax= axes4)
axes4.set_yscale("log")
axes4.set_xlabel("Target Variable")

# Plot "Category" Countplot
sns.countplot(df[3].astype('int'),palette= 'viridis', ax= axes5)
axes5.set_xlabel("Page Category")

# ======> We can see that categories 9, 18 and 36 account for the majority of posts


Out[322]:
<matplotlib.text.Text at 0xa40da08cc0>

In [323]:
# "Post published weekday" vs. "H local"
plt.plot(weekday,df[38], marker='o', markersize=5, lw= 0)

# =====> We can see that except for Wednesday and Thursday, most of the posts were published at H Local = 24


Out[323]:
[<matplotlib.lines.Line2D at 0xa412a17470>]

In [324]:
fig = plt.figure(figsize=(12,10))

axes1 = fig.add_axes([0, 2, 2, 1])
axes2 = fig.add_axes([0,1,2,1])

# Plot "Category" vs. "Popularity" barplot
axes1.set_yscale('log')
sns.barplot(x="3",y="0",data=df.rename(columns=(lambda x: str(x))).astype('int'), palette='viridis',ax=axes1)
axes1.set_xlabel("Page Category")
axes1.set_ylabel("Page Popularity/Likes")

# =====> We can conclude that categories "47", "61" and "33" have the highest popularity

# Plot "Category" vs. "Share amount" barplot
sns.barplot(x="3",y="36",data=df.rename(columns=(lambda x: str(x))).astype('int'), palette='viridis',ax=axes2)
axes2.set_xlabel("Page Category")
axes2.set_ylabel("Page Share Amounts")
# =====> We can conclude that categories "47", "61" and "33" have the highest share amounts


Out[324]:
<matplotlib.text.Text at 0xa40dea0f98>

Prediction Section


In [325]:
# Using sklearn for splitting data into train and test sets
# (older scikit-learn releases exposed this as sklearn.cross_validation)
from sklearn.model_selection import train_test_split

# Using sklearn's linear regression for finding the best thetas ( Linear Regression with multiple Variables )
from sklearn.linear_model import LinearRegression

In [326]:
# Before fitting the model, we should get rid of columns that have no effect on our predictions
X_norm = np.delete(X_norm, 37, 1)
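
The column removed here is "Promotion" (index 37 in the original layout), which the countplot above showed to be constant; a one-line check on the raw data (a minimal sketch) confirms it.

In [ ]:
# "Promotion" is zero for every post, so it carries no information
print(np.unique(X[:, 37]))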

In [327]:
# Splitting the normalized dataset into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.33, random_state=42)

In [328]:
# Initializing Model
lm = LinearRegression()

In [329]:
# Fitting data to model
lm.fit(X_train,y_train)


Out[329]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [330]:
# Model thetas
lm.coef_


Out[330]:
array([ -1.97259076e-13,  -1.26010313e-13,  -4.30211422e-16,
         1.42811964e-13,  -2.66246535e-11,  -1.40792399e-12,
         1.75120599e-11,  -4.76140988e-12,   2.13526862e-11,
         4.43723313e-11,   1.63138981e-12,  -1.34139799e-11,
        -9.68417399e-12,   4.75381053e-12,  -6.13361841e-11,
        -1.62491033e-12,  -5.47973212e-12,  -8.26207156e-12,
        -3.42591410e-12,   3.07854164e-11,   1.23604476e-12,
        -1.01151792e-11,   3.36087324e-12,  -2.47833111e-11,
        -1.72085435e-12,  -1.63807976e-12,  -7.93424782e-12,
         1.22382193e-11,   4.61429518e-12,   9.11239338e-13,
        -2.52168403e-13,  -1.41959039e-13,  -2.15403977e-13,
        -1.10209330e-13,   4.45187139e-15,  -8.91751446e-14,
         6.09518176e-13,  -1.11607898e-12,   1.40154956e-11,
         1.95080439e-11,   1.67083511e-11,   1.31772849e-11,
         1.44126001e-11,   7.20496133e-12,   6.32849850e-12,
         1.82138656e-11,   1.48197957e-11,   1.13775765e-11,
         1.31492350e-11,   8.10719909e-12,   1.39769934e-11,
         1.17105702e-11,  -1.55857446e-13,   1.00000000e+00])
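
Note that the coefficient vector is essentially zero everywhere except the last entry, which is exactly 1. A quick check (a sketch) of whether the last column of X_norm is the target itself, which would explain the near-perfect errors reported below:

In [ ]:
# If this prints True, the target column is still present among the
# features, so the regression can reproduce y exactly (target leakage)
print(np.allclose(X_norm[:, -1], y))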

In [331]:
# Simplest way of evaluating the model ( Mean Squared Error )
pred = lm.predict(X_test)
eval_arr = np.sum(np.power( pred - y_test , 2),axis=0) / len(pred)
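
The hand-rolled formula above is exactly the mean squared error; a minimal sketch confirming it against sklearn's implementation (the same import also appears a few cells below):

In [ ]:
# The manual sum((pred - y)^2) / n is the same quantity that
# sklearn.metrics.mean_squared_error computes
from sklearn.metrics import mean_squared_error
print(np.allclose(eval_arr, mean_squared_error(y_test, pred)))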

In [332]:
# Plot predictions vs. y_test for a better understanding of how our model behaves
fig = plt.figure(figsize=(12,8))
ax = fig.add_axes([0,0,1,1])

ax.set_xlabel("Predictions")
ax.set_ylabel("Test Target Variable")
ax.plot(pred, y_test,'bo')

# ====> The predictions match y_test almost exactly; as the check above shows, this is because the target column is still among the features, so this plot alone doesn't demonstrate real predictive power


Out[332]:
[<matplotlib.lines.Line2D at 0xa41c738ef0>]

In [333]:
# Evaluating model with sklearn functions
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [334]:
print('Mean Absolute Error:', mean_absolute_error(y_test, pred))
print('Mean Squared Error:',mean_squared_error(y_test, pred))


Mean Absolute Error: 4.71754761292e-15
Mean Squared Error: 1.31265944701e-28

In [335]:
"Precision Of model: {}%". format( ( 1 - mean_absolute_error(y_test,pred) ) * 100 )


Out[335]:
'Precision Of model: 99.99999999999953%'
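
The "precision" above is just (1 - MAE) * 100, which treats the absolute error as a fraction; a more conventional summary for regression is the coefficient of determination R². A minimal sketch, assuming the lm, pred, X_test and y_test defined above:

In [ ]:
# R^2 (coefficient of determination): 1.0 means the predictions
# explain all of the variance in y_test
from sklearn.metrics import r2_score
print(r2_score(y_test, pred))
print(lm.score(X_test, y_test))   # LinearRegression.score reports the same R^2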

In [336]:
plotLearningCurves(X_norm,y,500)
# Because there is no continuous gap between Train Error and Test Error, our model isn't suffering from high variance
# Because as we get more data, both train and test errors keep decreasing and stay very small, our model isn't suffering from high bias
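
plotLearningCurves above re-fits lm on growing slices of the data by hand; scikit-learn ships a similar utility. A minimal sketch using sklearn.model_selection.learning_curve (the cv, scoring and train_sizes values here are illustrative choices, not the settings used above):

In [ ]:
# Library alternative to the hand-written learning-curve plot:
# learning_curve handles the splitting and repeated fitting itself
from sklearn.model_selection import learning_curve

train_sizes, train_scores, valid_scores = learning_curve(
    LinearRegression(), X_norm, y,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='neg_mean_squared_error', cv=3)

plt.figure(figsize=(12, 8))
plt.yscale('log')
plt.plot(train_sizes, -train_scores.mean(axis=1), c='red', label='Train Error')
plt.plot(train_sizes, -valid_scores.mean(axis=1), c='blue', label='Test Error')
plt.xlabel("N (Training set size)")
plt.ylabel("Mean Squared Error")
plt.legend()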



In [ ]: