In [1]:
import os
import os.path
from urllib.parse import urlparse
from urllib.parse import parse_qs
from datetime import datetime
%matplotlib inline
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import boto3
import numpy as np
import pandas as pd
In [2]:
s3 = boto3.resource('s3')
bucket_name = "postie-testing-assets"
In [3]:
test = s3.Bucket(bucket_name)
In [4]:
s3.meta.client.head_bucket(Bucket=bucket_name)  # raises a ClientError if the bucket is missing or inaccessible
Out[4]:
In [5]:
for key in test.objects.all():
    print(key.key)
An alternative is to download the data using
aws s3 cp --recursive s3://my_bucket_name local_folder
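If the CLI is not at hand, the same download can be scripted with boto3. A minimal sketch, assuming flat keys in the bucket and the data folder used by the loading loop below:
os.makedirs("data", exist_ok=True)
for obj in test.objects.all():
    # Bucket.download_file(Key, Filename) streams each object to disk
    test.download_file(obj.key, os.path.join("data", obj.key))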
In [2]:
def convert_list(query_string):
    """Parse the query string of the url into a dictionary.
    Handle special cases:
    - There is a single query "error=True" which is rewritten to 1 if True, else 0.
    - Parsing the query returns a dictionary of key-value pairs. The value is a list.
      We must get the list value as an int.
    Note: This function may be a bottleneck when processing larger data files.
    """
    def handle_error(z, col):
        """Called in the dictionary comprehension below to handle the "error" key."""
        if "error" in col:
            return 1 if "True" in z else 0
        return z
    dd = parse_qs(query_string)
    return {k: int(handle_error(dd[k][0], k)) for k in dd}
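A minimal illustration on a hypothetical query string:
convert_list("Ume=1&error=True")  # -> {'Ume': 1, 'error': 1}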
In [23]:
fr = pd.DataFrame()
col_names = ["timestamp", "website_id", "customer_id", "app_version", "placeholder", "checkout_amount", "url"]
data_report = []
item_lists = []
for fname in os.listdir("data"):
    ffr = pd.read_csv(os.path.join("data", fname),
                      header=0, names=col_names,
                      infer_datetime_format=True, parse_dates=[0])
    file_date = fname.split(".")[0]
    ffr["file_date"] = file_date
    transaction_date = ffr.timestamp.apply(lambda x: x.strftime('%Y-%m-%d'))  # reformat transaction timestamp
    ffr["transaction_date"] = transaction_date
    url_items = ffr.url.apply(urlparse)
    domain_name = url_items.apply(lambda x: x[1])  # netloc
    item_query = url_items.apply(lambda x: x[4])   # query string
    # handle store.example.com and www.example.com as the same website
    ffr["domain_name"] = domain_name.apply(lambda x: x if "example.com" not in x else ".".join(x.split(".")[1:]))
    qq = item_query.apply(convert_list).apply(pd.Series).fillna(value=0)
    item_lists += qq.columns.tolist()
    final_fr = ffr.join(qq)
    print("date {} has {} sales for rows {} and unique dates {}".format(fname, ffr.checkout_amount.sum(),
                                                                        ffr.shape[0],
                                                                        transaction_date.unique().shape[0]))
    data_report.append({"file_date": file_date, "sales": ffr.checkout_amount.sum(),
                        "n_placeholder_nan": sum(ffr.placeholder.isnull()),
                        "n_rows": ffr.shape[0],
                        "n_websites": ffr.website_id.unique().shape[0],
                        "n_customers": ffr.customer_id.unique().shape[0],
                        "n_app_versions": ffr.app_version.unique().shape[0],
                        "n_dates": transaction_date.unique().shape[0]})
    fr = fr.append(final_fr)
fr.reset_index(drop=True, inplace=True)
item_lists = list(set([item for item in item_lists if "error" not in item]))
fr.shape
Out[23]:
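Growing fr with append inside the loop copies the accumulated frame on every iteration. A sketch of the usual alternative, collecting frames in a list and concatenating once (shown on the raw files only, without the per-file enrichment above):
frames = [pd.read_csv(os.path.join("data", fname),
                      header=0, names=col_names,
                      infer_datetime_format=True, parse_dates=[0])
          for fname in os.listdir("data")]
fr_alt = pd.concat(frames, ignore_index=True)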
In [8]:
fr.head()
Out[8]:
In [12]:
sns.boxplot(x="transaction_date", y="checkout_amount", data=fr)
Out[12]:
In [317]:
pt = pd.pivot_table(fr, values="checkout_amount", index="transaction_date", columns="domain_name",
aggfunc=[np.sum], margins=False)
In [9]:
from scipy.stats import linregress
pt = pd.pivot_table(fr, values="checkout_amount", index="transaction_date", columns="domain_name",
aggfunc=[np.sum], margins=False)
pt.columns = ['example.com', 'xyz.com']
pt.index = pd.DatetimeIndex(pt.index)
idx = pd.date_range(pt.index.min(), pt.index.max())
pt = pt.reindex(index=idx)
# ordinal day index (1..n) over the reindexed date range, used as the regression x variable
pt.insert(pt.shape[1],
          'row_count',
          pt.index.value_counts().sort_index().cumsum())
slope, intercept, r_value, p_value, std_err = linregress(x=pt["row_count"].values,
y=pt["example.com"].values)
pt["example.com regression line"] = intercept + slope * pt["row_count"]
pt[["example.com", "example.com regression line"]].plot()
# ax = sns.regplot(data=pt, x="row_count", y="example.com")
# xticks = ax.get_xticks()
# labels = ['' for item in ax.get_xticklabels()]
# labels[1] = pt.index[0].date()
# labels[5] = pt.index[1].date()
# labels[9] = pt.index[2].date()
# ax.set_xticklabels(labels)
# plt.xticks(rotation=40)
# plt.xlabel("Transaction Date")
# plt.show()
# pt
# correlation heatmap of the pivoted daily sales (sns is already imported above; corr is computed on pt)
corr = pt.corr()
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values)
Out[9]:
In [344]:
slope, intercept, r_value, p_value, std_err = linregress(x=pt["row_count"].values,
                                                         y=pt["example.com"].values)
print("Predicted sales 7/4/2014 is {}".format(intercept + slope * 4))
In [375]:
# ax.get_xticks()  # only valid when the sns.regplot block above is uncommented
In [376]:
pt.row_count
Out[376]:
In [18]:
def convert_list(x):
    def handle_error(z, col):
        if "error" in col:
            return 1 if "True" in z else 0
        return z
    dd = parse_qs(x)
    return {k: int(handle_error(dd[k][0], k)) for k in dd}
In [35]:
url_results = fr.url.apply(lambda x: urlparse(x))
domain_names = url_results.apply(lambda x: x[1])
item_query = url_results.apply(lambda x: x[4])
qq = item_query.apply(lambda x: convert_list(x))
qq = qq.apply(pd.Series).fillna(value=0)
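As the docstring on convert_list warns, per-row parse_qs can become a bottleneck on larger files. If only the error flag were needed, a vectorized check would be far cheaper; a sketch (not a replacement for the full item parse):
error_flag = item_query.str.contains("error=True").astype(int)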
In [33]:
dd = parse_qs(item_query[20])
ddd = {k: int(dd[k][0]) for k in dd}
ddd
# for k in dd:
# dd[k] = int(dd[k][0])
Out[33]:
In [36]:
qq.head()
Out[36]:
In [95]:
for it in item_query:
    if "True" in it:
        print(it)
In [11]:
url_results.shape
Out[11]:
In [116]:
url_results = pd.concat([fr.timestamp, fr.customer_id, qq], axis=1)
In [117]:
url_results.head(10)
Out[117]:
In [27]:
int(url_results.url[0]["Ume"][0])
Out[27]:
In [30]:
url_results["url"].apply(pd.Series) #.apply(lambda x: int(0 if np.isnan(x[0]) else x[0])) #.fillna(value=0) #.apply(lambda x: int(x[0]))
Out[30]:
In [31]:
new_cols = url_results["url"].apply(pd.Series).fillna(value=0)
In [49]:
new_cols.shape
Out[49]:
In [57]:
for col in new_cols.columns:
    print(new_cols[col].unique())
In [44]:
int(new_cols["Bignay"][4][0])
Out[44]:
In [53]:
def convert_to_num(x):
    """Unwrap single-element lists produced by parse_qs into ints."""
    if isinstance(x, list):
        try:
            return int(x[0])
        except Exception:
            print(x[0])  # log the value that failed to convert, then re-raise
            raise
    return x
In [54]:
frames = [new_cols[col].apply(convert_to_num) for col in new_cols.columns]
In [124]:
pr = urlparse("http://xyz.com/checkout?Ume=1")
In [127]:
pr
Out[127]:
In [37]:
fr.head()
Out[37]:
In [30]:
item_cols = new_cols.columns.tolist()
In [13]:
fr[item_lists + ['checkout_amount', 'timestamp', 'domain_name', 'transaction_date', 'customer_id']]
Out[13]:
In [64]:
# data structure
# - transaction_timestamp
# - domain name
# - item
# - price
#ffr = fr[item_lists + ['checkout_amount', 'timestamp', 'domain_name', 'transaction_date', 'customer_id']]
fr[fr[item_lists].astype(bool).sum(axis=1) == 1].drop(["error"], axis=1).head()
Out[64]:
In [183]:
cols = [item for item in item_lists if not "error" in item]
pricing_temp = fr[fr[cols].astype(bool).sum(axis=1) == 1].drop(["error"], axis=1)
pricing_temp.drop_duplicates(subset=cols + ["domain_name", "transaction_date"], inplace=True)
pricing_temp
Out[183]:
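The trick in the cell above: for transactions containing exactly one item type, the unit price is simply checkout_amount divided by the quantity. A toy illustration using two item names that appear in the data (Ume, Bignay):
toy = pd.DataFrame({"Ume": [2, 0], "Bignay": [0, 3], "checkout_amount": [10.0, 7.5]})
single = toy[toy[["Ume", "Bignay"]].astype(bool).sum(axis=1) == 1]
single["checkout_amount"] / single[["Ume", "Bignay"]].max(axis=1)  # unit prices: 5.0 and 2.5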
In [180]:
pricing_temp.transpose()
Out[180]:
In [191]:
# data structure
# - transaction_timestamp
# - domain name
# - item
# - price
price_cols = []
for col in cols:
    price_cols.append(pricing_temp["checkout_amount"] / pricing_temp[col])
pricing = pd.concat(price_cols, axis=1)
pricing.columns = cols
price_cols = [col + "_price" for col in cols]
dfr = pricing_temp.join(pricing, rsuffix="_price")[price_cols + ["transaction_date", "domain_name"]]
dfr = dfr.replace([np.inf, -np.inf], np.nan).fillna(value=0)
pd.pivot_table(dfr, values=price_cols, index="transaction_date", columns="domain_name", aggfunc=np.max).transpose()
# pd.pivot_table(dfr, values=price_cols, index=["transaction_date"], columns=["domain_name"])
# dfr = dfr.replace([np.inf, -np.inf], np.nan).fillna(value=0)
# dfrt = dfr.transpose()
# t_new_cols = ["col_" + str(col) for col in dfrt.columns.tolist()]
# dfrt.columns = t_new_cols
# dfrt = dfrt.reset_index(drop=False).rename(columns={"index": "temp"})
# dfrt.transpose()
# pd.pivot_table(dfrt, columns=["transaction_date", "domain_name"], values=t_new_cols, index="temp")
# dfr["0"]
# px_cols = [col for col in dfr.columns if not "temp" in col]
# px_cols
# pd.pivot_table(dfr, columns=index, index=["transaction_date", "domain_nam"], values=px_cols, aggfunc=np.max)
# pricing["checkout_amount"].div(pricing[cols], axis="index")
# pricing["checkout_amount"] #.div(pricing[cols], axis=0)
# there are now values > 1 for the items, so we replace them with 1
# pricing[cols] > 1
# pricing[pricing[cols] > 1]
# pricing.transpose()
Out[191]:
In [74]:
dd = {}
for col in item_lists:
    if "error" not in col:
        dd[col + "_price"] = np.max(pricing[col] * pricing["checkout_amount"])
dd
Out[74]:
In [30]:
# per-domain daily sales of the single-item transactions (index assumed to be domain_name)
pd.pivot_table(pricing_temp, values="checkout_amount", index="domain_name", columns="transaction_date")
Out[30]:
In [38]:
pricing.transpose()
Out[38]:
In [18]:
pd.pivot_table(fr, values="checkout_amount", index="transaction_date", columns="domain_name",
aggfunc=[np.sum], margins=True)
In [20]:
# print(df.loc[df['B'].isin(['one','three'])])
for col in dfr.columns.tolist():
    dfr[col]
In [21]:
two = pd.DataFrame({'one': pd.Series([10, 0, 10, 9], index=['a', 'b', 'c', 'd']),
                    'two': pd.Series([0, 0, 10., 4.6], index=['a', 'b', 'c', 'd']),
                    'three': pd.Series([5, -1, 7, -1], index=['a', 'b', 'c', 'd'])})
In [22]:
two
Out[22]:
In [26]:
dfr[dfr.astype(bool).sum(axis=1) == 1]
Out[26]:
In [25]:
gr = fr[item_lists + ["transaction_date", "domain_name", "checkout_amount"]]
In [26]:
frame = fr[item_lists + ["transaction_date", "domain_name", "checkout_amount"]]
gr = frame.groupby(["transaction_date", "domain_name"])
In [33]:
gb_frames = []
for name, group in gr:
    gb_frames.append({"name": name, "frame": group})
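Materializing every group into a list works for iteration, but a single (transaction_date, domain_name) slice can also be fetched directly; a sketch using the first group's key:
first_key = gb_frames[0]["name"]  # a (transaction_date, domain_name) tuple
gr.get_group(first_key).head()    # same frame as gb_frames[0]["frame"]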
In [35]:
gb_frames[0]["name"][0]
Out[35]:
In [37]:
corr = gb_frames[0]["frame"][item_lists].corr()
ax = sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values, linewidths=.5,
cmap="YlGnBu")
ax = plt.title("Test")
plt.tight_layout()