In [1]:
import pandas as pd
In [2]:
ads_csv = pd.read_csv("ad_table.csv.csv")
In [3]:
ads_csv.head()
Out[3]:
In [4]:
ads_csv.shape
Out[4]:
In [5]:
#Explore ad groups
ads_csv.ad.unique()
Out[5]:
In [6]:
# Convert the date column to a datetime object and store it in a new column, dateDT
ads_csv["dateDT"]=pd.to_datetime(ads_csv["date"])
In [7]:
print "unique days" , ads_csv["dateDT"].dt.day.unique()
print "month", ads_csv["dateDT"].dt.month.unique()
In [8]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
In [9]:
# factorplot was renamed catplot in newer seaborn (size -> height); kind="point" matches the old default
g = sns.catplot(x="dateDT", y="shown", hue="ad", data=ads_csv, kind="point", height=10)
g.set(xticklabels=[])
Out[9]:
In [10]:
min(ads_csv["shown"])
Out[10]:
In [11]:
# Look for outliers; the table below summarizes the distribution of the variables in our data frame.
ads_csv.describe()
Out[11]:
In [12]:
# A minimum of -200.15 under total_revenue doesn't make sense; explore further.
ads_csv[ads_csv["total_revenue"]<0]
Out[12]:
In [13]:
# Drop rows where total_revenue is negative; there are only 4 such rows, so we exclude them.
ads_csv=ads_csv[ads_csv["total_revenue"] >=0]
In [14]:
# Explore rows where "shown" (the number of times an ad group was displayed) equals 0.
ads_csv[ads_csv["shown"]==0]
Out[14]:
In [15]:
ads_csv=ads_csv.drop(ads_csv[ads_csv["shown"]==0].index)
In [16]:
## Check that the desired rows have been dropped; per the criterion above, row 1273 should be gone.
1273 in ads_csv.index
Out[16]:
In [17]:
ad_groups_csv=ads_csv.groupby(['ad']).mean()
ad_groups_csv
Out[17]:
For ad_group_25, both the average total_revenue and the average converted counts are 0. Some ad groups, e.g. ad_group_9 and ad_group_39, have very low average revenue (14.81 and 19.53 dollars respectively), but that is acceptable.
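A quick filter makes it straightforward to flag such weak ad groups programmatically; a minimal sketch using the grouped frame ad_groups_csv from the cell above (the 20-dollar revenue threshold is an illustrative choice, not derived from the analysis):
In [ ]:
# Flag ad groups with zero average conversions or very low average revenue.
# The 20-dollar cutoff is illustrative, not taken from the data.
weak_groups = ad_groups_csv[(ad_groups_csv["converted"] == 0) |
                            (ad_groups_csv["total_revenue"] < 20)]
weak_groups.index.tolist()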
In [39]:
### Plot all variables, sorting ad groups by descending average revenue. Ad groups 25 and 9 perform worst.
## Drop the 'shown' and 'clicked' columns (for plotting only) to better visualize revenue.
ads_group_csv_drop_cols = ad_groups_csv.drop(['shown', 'clicked'], axis=1)
## Plot only the bottom 10 ad groups based on revenue.
ads_groups_csv_plot = ads_group_csv_drop_cols.sort_values(by='total_revenue', ascending=False).iloc[30:, :].\
plot(kind='bar', title='Plot of bottom 10 worst-performing ad_groups by revenue', figsize=(10, 9))
ads_group_csv_drop_cols_plot = ads_group_csv_drop_cols.sort_values(by='total_revenue', ascending=False).\
plot(kind='bar', title='Plot of all ad_groups by revenue', figsize=(10, 9))
To measure campaign effectiveness, it is important to consider company XYZ's ultimate goals. If the goal is to increase brand awareness, consider impressions, as well as the number of unique visitors to the XYZ website. If the goal is to increase sales, then conversion rate, revenue, and average order value (revenue/conversions) are a few metrics to consider.
Assuming that XYZ's goals are visibility, conversions, and building a good user base, the metric considered here is CPI, conversions per impression. It takes into account both CTR, the click-through rate (clicks/impressions), and CR, the conversion rate (conversions/clicks): CPI = conversions/impressions.
This metric does not take revenue into account.
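Since CPI = conversions/impressions = (clicks/impressions) × (conversions/clicks), it factors exactly as CTR × CR. A quick sanity check with made-up numbers (not from the data):
In [ ]:
# Illustrative check that CPI == CTR * CR; the counts below are invented.
shown, clicked, converted = 10000, 250, 20
ctr = clicked / shown        # click-through rate: 0.025
cr = converted / clicked     # conversion rate: 0.08
cpi = converted / shown      # conversions per impression: 0.002
assert abs(cpi - ctr * cr) < 1e-12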
In [19]:
ad_groups_csv["ClickThroRate"]=ad_groups_csv.clicked/ad_groups_csv.shown
ad_groups_csv["ConvPerImpression"]=ad_groups_csv.converted/ad_groups_csv.shown
ad_groups_csv["ConvRate"]=ad_groups_csv.converted/ad_groups_csv.clicked
In [20]:
ads_group_csv_metrics_plot= ad_groups_csv[['ClickThroRate','ConvPerImpression','ConvRate']].\
sort_values(by='ConvPerImpression',ascending= False).\
plot(kind='bar',title = 'Plot of metrics CTR, CPI, CR all ad_groups, sorted by decreasing CPI',figsize=(10,9))
In [21]:
#### Plot ad_group_1
df_ad_group_1 = ads_csv.loc[ads_csv["ad"] == "ad_group_1"].copy()  # .copy() avoids SettingWithCopyWarning below
df_ad_group_1.set_index(df_ad_group_1["dateDT"], inplace=True)
df_ad_group_1.loc[:, "shown"] = df_ad_group_1["shown"].astype('float')
In [22]:
# Time series model: rolling statistics over a 5-day window
rolmean1 = df_ad_group_1["shown"].rolling(window=5).mean()
rolstd1 = df_ad_group_1["shown"].rolling(window=5).std()
#Plot rolling statistics:
orig = plt.plot(df_ad_group_1["shown"], color='blue',label='Original')
mean = plt.plot(rolmean1, color='red', label='Rolling Mean')
std = plt.plot(rolstd1, color='black', label = 'Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)
In [23]:
from statsmodels.tsa.stattools import adfuller
#Perform Dickey-Fuller test:
print('Results of Dickey-Fuller Test:')
dftest = adfuller(df_ad_group_1["shown"], autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Value (%s)'%key] = value
print(dfoutput)
#### The test statistic is smaller than the critical values, so we reject the null hypothesis of
#### non-stationarity: this series is stationary.
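Wrapping the test in a small helper makes this decision reusable for other ad groups; a minimal sketch (the 5% significance level is an assumption, not stated above):
In [ ]:
from statsmodels.tsa.stattools import adfuller

def is_stationary(series, alpha=0.05):
    """True if the ADF test rejects the non-stationarity null at level alpha."""
    p_value = adfuller(series.dropna(), autolag='AIC')[1]
    return p_value < alpha

is_stationary(df_ad_group_1["shown"])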
In [24]:
# Autocorrelation and partial autocorrelation plots to determine ARIMA model parameters
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
y = df_ad_group_1["shown"]
plot_acf(y, lags=20, alpha=0.5)
plot_pacf(y, lags=20, alpha=0.5)
Out[24]:
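Reading p and q off the ACF/PACF plots is subjective; an alternative is a small AIC grid search over candidate orders. A sketch under the modern statsmodels API (the grid bounds are illustrative; the analysis below uses order=(6,0,3)):
In [ ]:
import itertools
from statsmodels.tsa.arima.model import ARIMA

# Try small (p, q) grids for a non-differenced ARIMA and keep the lowest AIC.
best_aic, best_order = float("inf"), None
for p, q in itertools.product(range(7), range(4)):
    try:
        aic = ARIMA(y, order=(p, 0, q)).fit().aic
        if aic < best_aic:
            best_aic, best_order = aic, (p, 0, q)
    except Exception:
        continue  # skip orders that fail to converge
print(best_order, best_aic)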
In [25]:
# Prepare training and test set
# We need to forecast 3 weeks ahead (from Nov. 23 to Dec. 15), so ensure the test set is at least 2 weeks long.
series = df_ad_group_1["shown"]
size_train=int(len(series)*0.80)
training_data=series[0:size_train] #Dates from Oct. 1 to Nov. 6
testing_data=series[size_train:len(series)]#Dates from Nov.7 to Nov.22
In [26]:
# Prepare the model: fit an ARIMA(6,0,3) on the training data
from statsmodels.tsa.arima.model import ARIMA
modelA = ARIMA(training_data, order=(6,0,3))
model_fit = modelA.fit()
print(model_fit.summary())
In [27]:
# plot residual errors
residual = pd.DataFrame(model_fit.resid)
residual.plot()
plt.show()
residual.plot(kind='kde')
print(residual.describe())
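Beyond eyeballing the residual plots, a Ljung-Box test gives a formal check that the residuals are uncorrelated; a minimal sketch (lag 10 is an arbitrary illustrative choice):
In [ ]:
from statsmodels.stats.diagnostic import acorr_ljungbox

# Null hypothesis: residuals are independently distributed (no autocorrelation).
# A large p-value suggests the ARIMA model has captured the autocorrelation structure.
acorr_ljungbox(model_fit.resid, lags=[10])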
In [28]:
### Helper: determine the dates for the test set and the future out-of-sample dates.
import datetime
start_date = testing_data.index[0]
end_date_test=testing_data.index[-1]
end_date=testing_data.index[-1]+datetime.timedelta(days=23)
start_date=start_date.date()
end_date=end_date.date()
end_date_test=end_date_test.date()
start_date_str=str(start_date)
end_date_str=str(end_date)
In [29]:
# Multi-step out-of-sample forecast using the predict function
df_pred = model_fit.predict(start=start_date_str, end=end_date_str)
# (If differencing were used, the legacy API needed typ='levels' to return the original scale.)
df_pred_test_data = df_pred[start_date_str:end_date_test]  # used to compute error metrics
df_pred_future = df_pred[end_date_test + datetime.timedelta(days=1):end_date]
In [30]:
# Plot the original time series (blue) with ARIMA fitted (red), predicted (green), and forecasted (brown) values
plt.plot(df_ad_group_1["shown"],color='blue')
plt.plot(model_fit.fittedvalues, color='red')
plt.plot(df_pred_test_data,color='green')
plt.plot(df_pred_future,color='brown')
plt.title('Plot of underlying time series, fitted (red), predicted(green) and future forecasted values(brown)')
Out[30]:
In [31]:
import sklearn.metrics, math
predicted = df_pred_test_data
expected= testing_data
mae = sklearn.metrics.mean_absolute_error(expected, predicted)
mse=sklearn.metrics.mean_squared_error(expected,predicted)
rmse=math.sqrt(mse)
In [32]:
pd.DataFrame({'series':'ad_group_1','mae':[mae],'mse':[mse],'rmse':[rmse], 'predicted_value':df_pred_future[end_date]})
Out[32]:
Fit a trend line and compute its slope. The slope could be computed from the beginning of the data, over the last week, or over the previous 10 days; here it is computed over the entire duration of the data to determine the trend.
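For reference, a minimal sketch of the trailing-window variant mentioned above (the 10-day window is illustrative; the cells below use the full history):
In [ ]:
import numpy as np
from scipy import stats

def trailing_slope(series, window=10):
    """Slope of a least-squares line over the last `window` observations."""
    tail = series.iloc[-window:]
    return stats.linregress(np.arange(len(tail)), tail).slope

trailing_slope(df_ad_group_1["avg_cost_per_click"])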
In [33]:
ad_strings = ads_csv["ad"].unique()
import numpy as np
from scipy import stats

slope_df = dict()
p_value_df = dict()
fig_num = 0
for i in ad_strings:
    fig_num = fig_num + 1
    df_ad_group = ads_csv.loc[ads_csv["ad"] == i].copy()
    df_ad_group.set_index(df_ad_group["dateDT"], inplace=True)
    x = np.arange(len(df_ad_group))
    # Least-squares trend line for avg_cost_per_click over time
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, df_ad_group["avg_cost_per_click"])
    y_line = x * slope + intercept
    slope_df.update({i: slope})
    p_value_df.update({i: p_value})
    plt.figure(fig_num)
    plt.plot(x, df_ad_group["avg_cost_per_click"])
    plt.plot(y_line)
    plt.title('Trend line for avg_cost_per_click for ' + i)
In [34]:
#Print the slope values for each ad group.
slope_df
Out[34]:
From the values above, the ad groups whose trend lines appear flat are those whose slope is on the order of ten-thousandths, e.g. ad_group_1: 0.000926. The trend appears increasing when the slope is on the order of thousandths, e.g. ad_group_3: 0.002444, and decreasing when the slope is negative.
In [35]:
p_value_df
# Looking at the p-values, none of the trends is statistically significant except for ad_group_8 and ad_group_32.
Out[35]:
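Combining the two dictionaries makes that conclusion explicit; a minimal sketch (the 0.05 cutoff is an assumption, not stated in the analysis):
In [ ]:
# Classify each ad group's trend: 'up'/'down' if significant, otherwise 'flat'.
trend = {ad: ('up' if slope_df[ad] > 0 else 'down') if p_value_df[ad] < 0.05 else 'flat'
         for ad in slope_df}
trend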
In [40]:
# Plot the sorted slope values to see the overall spread of trends across ad groups
plt.plot(sorted(slope_df.values()))
Out[40]: