In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
data_list = {}
data_list['data'] = pd.read_csv('data.csv')
data_list['banner'] = pd.read_csv('banner.csv')
data_list['publisher'] = pd.read_csv('publisher.csv')
data_list['data']['day'] = pd.to_datetime(data_list['data']['day'])
data_list['data']['start_time'] = pd.to_datetime(data_list['data']['start_time'])
data_list['data']['end_time'] = pd.to_datetime(data_list['data']['end_time'])
for k, v in data_list.items() :
print(k,'\n\n')
print(k,'Head\n\n',v.head(),'\n','-'*10)
print(k,'Shape\n\n',v.shape,'\n','-'*10)
print(k,'Describe\n\n',v.describe(),'\n','-'*10)
print(k,'info\n\n',v.info(),'\n','-'*10)
print('*'*50)
In [2]:
#(Appendix)
#Dowod na to, ze kazdy banner id jest unikalny:
#Bierzemy ilosc elementow i porownujemy z ilosciom unikalnych elementow:
print('Ilosc wszystkich banner_code: ',len(data_list['banner']['Banner_code']))
print('Ilosc unikalnych banner_code: ',len(data_list['banner']['Banner_code'].unique()))
#I przypisany tylko do jednego produktu:
#Zbior id przypisanych do AAA
AAA_banner_id = data_list['banner'][data_list['banner']['Product'] == 'AAA']['Banner_code'].values
BBB_banner_id = data_list['banner'][data_list['banner']['Product'] == 'BBB']['Banner_code'].values
#Przekroj:
print('Przekroj banner_code dla produktu AAA i BBB: ', np.intersect1d(AAA_banner_id, BBB_banner_id))
#Tak samo z kampaniami (przypisane tylko do produktu):
In [3]:
#(Appendix)
#Dowod na to, ze jedna kampania NIE przypada na jeden product (Appendix):
#Zliczamy ile produktow wystepuje dla danej kampanii. Filtrujemy dane po campaign_code, robimy inner joina z lista bannerow i patrzymy ile jest unikalnych produktow.
#Jasno widac, ze kampanie moga miec zarowno jeden jak i dwa produkty.
Campaign_names = data_list['data']['Campaign_name'].unique()
Campaign_names_bool = data_list['data']['Campaign_name'] == Campaign_names[0]
for campaign in Campaign_names :
Campaign_names_bool = data_list['data']['Campaign_name'] == campaign
temp_data = data_list['data'][Campaign_names_bool]
temp_data = temp_data.merge(data_list['banner'][['Banner_code','Product']],how = 'inner',left_on = 'Banner_code',right_on='Banner_code')
print(campaign,len(temp_data['Product'].unique()) )
In [4]:
# (Appendix)[operacja do zrobienia na poczatku]
# Dowod na to, ze jeden Banner moze miec wiele formatow oraz
# stworzenie Horizontal columny ktora mowi kiedy banner jest horyzontalny:
data_list['banner'][['Banner_code','Banner','Format']]
#this
data_list['banner']['Horizontal'] = data_list['banner']['Format'].str.split('x').map(lambda x : int(x[0]) - int(x[1]) > 0 )
print(data_list['banner'][['Format','Horizontal']])
In [ ]:
In [5]:
#Tworzenie dodatkowej tabeli w celu idenyfikacji "unikalnych" ciaglych pojawien sie danego bannera w ramach kampanii
#Wyszczegolniamy unikalne pojawienie sie bannera, tj w kampanii, w ramach pojawienia sie w ciaglej serii dni (na podstwie ktorej ma zliczane kliki i wyswietlnia)
#tutaj tworze zmienna ktora przetrzymuje ten podzial:
data_list['data']['Unique_appearance']= data_list['data']['Campaign_name'] +data_list['data']['start_time'].apply(str)+data_list['data']['end_time'].apply(str)+data_list['data']['Banner_code']
#tu tworze liste z unikalnymi wartosciami
temp_list = np.sort(data_list['data']['Unique_appearance'].unique())
counter = 0
#teraz przechodze caly dataset i pod kazdy unikalny temp podstawiam liczbe (w celu przejrzystosci)
for temp_val in temp_list : #can be written faster
data_list['data']['Unique_appearance'] = data_list['data']['Unique_appearance'].map(lambda x : counter if x == temp_val else x)
counter = counter + 1
#print(data_list['data']['Unique_appearance'])
#print(data_list['data'])
for ii in range(len(temp_list)) :
temp_data = data_list['data'][data_list['data']['Unique_appearance'] == ii]
if temp_data['day_clk'].sum() != temp_data['clk'].iloc[0] :
print(ii, 'Issue, verify')
# Na podstawie tej petli, mozemy stwierdzic, ze dla 5ciu bannerow mamy sytuacje w ktorej odpalono je dwukrotnie. Z tego powodu ilosc klikow i wyswietlen sie nei zgadza.
ii = 247
temp_data = data_list['data'][data_list['data']['Unique_appearance'] == ii]
print(temp_data)
# Dlatego stworzymy jeszcze raz ta kolumne, ale tym razem dodamy do niej rowniez ilosc klikow (clk) w celu unikalnego rozroznienia tych wypadkow
In [14]:
#tutaj tworze zmienna ktora przetrzymuje ten podzial:
data_list['data']['True_Unique_appearance']= data_list['data']['Campaign_name'] +data_list['data']['start_time'].apply(str)+data_list['data']['end_time'].apply(str)+data_list['data']['Banner_code'] + data_list['data']['imps'].apply(str)
#tu tworze liste z unikalnymi wartosciami
true_temp_list = np.sort(data_list['data']['True_Unique_appearance'].unique())
counter = 0
#teraz przechodze caly dataset i pod kazdy unikalny temp podstawiam liczbe (w celu przejrzystosci)
for temp_val in true_temp_list : #can be written faster
data_list['data']['True_Unique_appearance'] = data_list['data']['True_Unique_appearance'].map(lambda x : counter if x == temp_val else x)
counter = counter + 1
#Puscmy jeszcze raz weryfikacje:
print(true_temp_list.shape)
for ii in range(len(true_temp_list)) :
temp_data = data_list['data'][data_list['data']['True_Unique_appearance'] == ii]
if temp_data['day_clk'].sum() != temp_data['clk'].iloc[0] :
print(ii, 'Issue, verify')
# tutaj dowod na to, ze nie ma scisle okreslonych dlugosci wyswietlania danego bannera:
#print(temp_data.shape[0])
#Tada, zadnych powtorzen
In [7]:
data1 = data_list['data'].merge(data_list['banner'][['Banner_code','Product']],left_on = "Banner_code",right_on = 'Banner_code', how = 'left')
groupby_clicks = data1[['day','day_con','day_clk','Product']].groupby(['Product','day']).sum().reset_index()
groupby_banner_count = data1[['Banner_code','day','Product']].groupby(['Product','day']).count().reset_index()
grouby_clk_banner = groupby_clicks.merge(groupby_banner_count, on = ['Product','day'], how='outer')
grouby_clk_banner.rename(columns = {'Banner_code' : 'Banner_count'},inplace = True)
#print(grouby_clk_banner.head())
for prod in ['AAA', 'BBB'] :
aa_data = grouby_clk_banner[grouby_clk_banner['Product'] == prod]
print(aa_data)
print(aa_data.shape)
#----------------------------------------------------------------------------------------------
plt.plot(aa_data['day'], aa_data['day_clk'],'.-',color ='blue')
plt.xticks(rotation = 'vertical')
plt.plot(aa_data['day'], aa_data['day_con']*100,'.-',label='day_con x 100',color ='green')
plt.xticks(rotation = 'vertical')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('Product: '+prod +', number of day clk vs time \n(with day_con multiplied by 100)')
plt.show()
#----------------------------------------------------------------------------------------------
plt.plot(aa_data['day'], aa_data['day_clk'],'.-',color ='blue')
plt.xticks(rotation = 'vertical')
plt.plot(aa_data['day'], aa_data['Banner_count']*100,'.-',label = 'banner_count x 100',color='red')
plt.xticks(rotation = 'vertical')
plt.title('Product: '+prod +', number of day clk vs time \n(with banner_count multiplied by 100)')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
#----------------------------------------------------------------------------------------------
plt.plot(aa_data['day'], aa_data['day_con'],'.-',color ='green')
plt.xticks(rotation = 'vertical')
plt.plot(aa_data['day'], aa_data['Banner_count'],'.-',color='red')
plt.xticks(rotation = 'vertical')
plt.title(prod)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('Product: '+prod +', number of day clk AND banner_count vs time')
plt.show()
In [13]:
#Stworzenie dodatkowych column posiadajacych ich magiczne cyferki
data_list['data']['CPM_cost'] = data_list['data']['price']*data_list['data']['imps']/ 1000
data_list['data']['CPC_cost'] = data_list['data']['price']*data_list['data']['clk']
data_list['data']['CPL_CPM'] = data_list['data']['CPM_cost'] / data_list['data']['con1']
data_list['data']['CPL_CPC'] = data_list['data']['CPC_cost'] / data_list['data']['con1']
data_list['data']['CTR'] = data_list['data']['imps'] / data_list['data']['clk']
data_list['data']['CR'] = data_list['data']['clk'] / data_list['data']['con1']
data_list['data']['no days'] = (data_list['data']['end_time'] - data_list['data']['start_time']).apply(lambda x : str(x).split()[0])
# stworzenie dodatkowej df, ktora bedzie rozszczerzeniem banner
filter_col = ['Campaign_name','owner','Banner_code','True_Unique_appearance','imps','clk','con1','CPM_cost','CPC_cost','CPL_CPM','CPL_CPC','CTR','CR','no days']
#print(data_list['data'][filter_col])
data_list['data1'] = data_list['data'][filter_col].merge(data_list['banner'],on=['Banner_code'], how='left')
data_list['data1'].drop_duplicates(inplace=True)
data_list['data1'].reset_index(inplace=True, drop = True)
print(data_list['data1'])