In [1]:
import os
import os.path

from urllib.parse import urlparse
from urllib.parse import parse_qs
from datetime import datetime

%matplotlib inline
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

import boto3
import numpy as np
import pandas as pd

Data Challenge

Files are stored in an S3 bucket. The purpose here is to work out methods to process the data properly. There is a lot of scratch work that is not explained, but there are some observations about the data below.

Bucket Contents and File Download


In [2]:
s3 = boto3.resource('s3')
bucket_name = "postie-testing-assets"

In [3]:
test = s3.Bucket(bucket_name)

In [4]:
s3.meta.client.head_bucket(Bucket=bucket_name)


Out[4]:
{'ResponseMetadata': {'HTTPHeaders': {'content-type': 'application/xml',
   'date': 'Wed, 18 Oct 2017 14:16:20 GMT',
   'server': 'AmazonS3',
   'transfer-encoding': 'chunked',
   'x-amz-bucket-region': 'us-east-1',
   'x-amz-id-2': 'r0MvQsrnNPv2cYn5roGk88QwMSU4hFswFDE28rPwcAYH73jx3Mb1uR7rNVrDQ1OOgRGN8AeIQWQ=',
   'x-amz-request-id': '6A90346B37412C6E'},
  'HTTPStatusCode': 200,
  'HostId': 'r0MvQsrnNPv2cYn5roGk88QwMSU4hFswFDE28rPwcAYH73jx3Mb1uR7rNVrDQ1OOgRGN8AeIQWQ=',
  'RequestId': '6A90346B37412C6E',
  'RetryAttempts': 0}}

In [5]:
for key in test.objects.all():
    print(key.key)


2017-07-01.csv
2017-07-02.csv
2017-07-03.csv

An alternative is to download the data using

aws s3 cp --recursive s3://my_bucket_name local_folder

-

Observations

  1. Analyst is correct - July 3rd file has sales of $164,065. There are fewer sales and customers on the 3rd than the other two days, so the sales amount is lower. However, there are problems with the data files, so there are sales recorded on the 3rd found in files labeled for the 2nd and 3rd. There are sales recorded on the 2nd found in the files labeled for the 1st and 2nd. To get accurate sales data we will have to group by transaction date - the sales date parsed from the timestamp field.
  2. The placeholder field is empty for two files, so it's probably not meaningful. Regardless, it will not add anything to prediction.
  3. app_version may or may not be meaningful. There is only one app_version for the files labeled "2017-07-01" and "2017-07-02". The remaining file has two app_versions - 1.1 and 1.2. This field is probably not meaningful for prediction, but sales on the 3rd should be examined for the different app_versions.

Plans

  • Get the items. But should we make new columns in the original dataset?
  • Items are paired with item counts.
  • We would like to know the sales by domain and by item.
  • We would like to know the corrected sales by date.
  • Average sales is not meaningful - explain why.
  • Sales prediction at the aggregate level is not meaningful - there are only two values per domain.
  • So do we predict sales at the item level? If so, how?
  • What do we do with the app_version?
  • What are the values in placeholder?
  • Explain why average sales for a day is not meaningful.

In [2]:
def convert_list(query_string):
    """Parse the query string of the url into a dictionary.
    
    Handle special cases:
    - There is a single query "error=True" which is rewritten to 1 if True, else 0.
    - Parsing the query returns a dictionary of key-value pairs. The value is a list.
      We must get the list value as an int.
      
    Note: This function may be a bottleneck when processing larger data files.
    """
    def handle_error(z, col):
        """
        Called in the dictionary comprehension below to handle the "error" key.
        """
        if "error" in col:
            return 1 if "True" in z else 0
        return z
    dd = parse_qs(query_string)
    return {k: int(handle_error(dd[k][0], k)) for k in dd}

In [23]:
fr = pd.DataFrame()
col_names = ["timestamp", "website_id", "customer_id", "app_version", "placeholder", "checkout_amount", "url"]
data_report = []
item_lists = []
for fname in os.listdir("data"):
    ffr = pd.read_csv(os.path.join("data", fname),
                      header=0, names=col_names,
                      infer_datetime_format=True, parse_dates=[0])
    file_date = fname.split(".")[0]
    ffr["file_date"] = file_date
    transaction_date = ffr.timestamp.apply(lambda x: x.strftime('%Y-%m-%d'))  # reformat transaction timestamp
    ffr["transaction_date"] = transaction_date
    url_items = ffr.url.apply(lambda x: urlparse(x))
    domain_name = url_items.apply(lambda x: x[1])
    item_query = url_items.apply(lambda x: x[4])
    qq = item_query.apply(lambda x: convert_list)
    # handle store.example.com and www.example.com as the same website
    ffr["domain_name"] = domain_name.apply(lambda x: x if not "example.com" in x else ".".join(x.split(".")[1:]))
    item_query = url_items.apply(lambda x: x[4])
    qq = item_query.apply(lambda x: convert_list(x)).apply(pd.Series).fillna(value=0)
    item_lists += qq.columns.tolist()
    final_fr = ffr.join(qq)
    print("date {} has {} sales for rows {} and unique dates {}".format(fname, ffr.checkout_amount.sum(),
                                                                        ffr.shape[0],
                                                                        transaction_date.unique().shape[0]))
    data_report.append({"file_date": file_date, "sales": ffr.checkout_amount.sum(),
                        "n_placeholder_nan": sum(ffr.placeholder.isnull()),
                        "n_rows": ffr.shape[0],
                        "n_websites": ffr.website_id.unique().shape[0],
                        "n_customers": ffr.customer_id.unique().shape[0],
                        "n_app_versions": ffr.app_version.unique().shape[0],
                        "n_dates": transaction_date.unique().shape[0]})
    fr = fr.append(final_fr)
fr.reset_index(drop=True, inplace=True)
item_lists = list(set([item for item in item_lists if not "error" in item]))
fr.shape


date 2017-07-02.csv has 183294.0 sales for rows 11573 and unique dates 2
date 2017-07-01.csv has 241491.0 sales for rows 11634 and unique dates 2
date 2017-07-03.csv has 164065.0 sales for rows 9981 and unique dates 1
Out[23]:
(33188, 21)

In [8]:
fr.head()


Out[8]:
Bignay Black/White Pepper European Grape Hazelnut Mabolo Natal Orange Prairie Potato Round Kumquat Ume Ylang-ylang ... checkout_amount customer_id domain_name error file_date placeholder timestamp transaction_date url website_id
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 6.0 9418 example.com NaN 2017-07-02 NaN 2017-07-02 07:00:35 2017-07-02 http://www.example.com/store/?Ume=1 123
1 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 3.0 3872 xyz.com NaN 2017-07-02 NaN 2017-07-02 07:00:37 2017-07-02 http://xyz.com/checkout?Prairie+Potato=1 124
2 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 4.0 3090 example.com NaN 2017-07-02 NaN 2017-07-02 07:00:47 2017-07-02 http://www.example.com/store/?Hazelnut=1 123
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 6.0 9556 xyz.com NaN 2017-07-02 NaN 2017-07-02 00:00:51 2017-07-02 http://xyz.com/checkout?Ume=1 124
4 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 6.0 8845 xyz.com NaN 2017-07-02 NaN 2017-07-02 00:01:02 2017-07-02 http://xyz.com/checkout?Bignay=1 124

5 rows × 21 columns


In [12]:
sns.boxplot(x="transaction_date", y="checkout_amount", data=fr)


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f99bcef21d0>

In [317]:
pt = pd.pivot_table(fr, values="checkout_amount", index="transaction_date", columns="domain_name",
               aggfunc=[np.sum], margins=False)


Out[317]:
sum
domain_name example.com xyz.com
transaction_date
2017-07-01 172874.0 50641.0
2017-07-02 127786.0 55966.0
2017-07-03 64599.0 116984.0

In [9]:
from scipy.stats import linregress
pt = pd.pivot_table(fr, values="checkout_amount", index="transaction_date", columns="domain_name",
               aggfunc=[np.sum], margins=False)
pt.columns = ['example.com', 'xyz.com']
pt.index = pd.DatetimeIndex(pt.index)
idx = pd.date_range(pt.index.min(), pt.index.max())
pt = pt.reindex(index=idx)
pt.insert(pt.shape[1],
          'row_count',
          pt.index.value_counts().sort_index().cumsum())
slope, intercept, r_value, p_value, std_err = linregress(x=pt["row_count"].values,
                                                         y=pt["example.com"].values)
pt["example.com regression line"] = intercept + slope * pt["row_count"]
pt[["example.com", "example.com regression line"]].plot()
# ax = sns.regplot(data=pt, x="row_count", y="example.com")
# xticks = ax.get_xticks()
# labels = ['' for item in ax.get_xticklabels()]
# labels[1] = pt.index[0].date()
# labels[5] = pt.index[1].date()
# labels[9] = pt.index[2].date()
# ax.set_xticklabels(labels)
# plt.xticks(rotation=40)
# plt.xlabel("Transaction Date")
# plt.show()
# pt

import seaborn as sns
corr = dataframe.corr()
sns.heatmap(corr, 
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values)


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7ff944d2e048>

In [344]:
slope, intercept, r_value, p_value, std_err = linregress(x=ax.get_lines()[0].get_xdata(),y=ax.get_lines()[0].get_ydata())
print("Predicted sales 7/4/2014 is {}".format(intercept + slope * 4))


Predicted sales 7/4/2014 is 13478.000000000204

In [375]:
ax.get_xticks()


Out[375]:
array([ 0.75,  1.  ,  1.25,  1.5 ,  1.75,  2.  ,  2.25,  2.5 ,  2.75,
        3.  ,  3.25])

In [376]:
pt.row_count


Out[376]:
2017-07-01    1
2017-07-02    2
2017-07-03    3
Freq: D, Name: row_count, dtype: int64

In [ ]:


In [18]:
def convert_list(x):
    def handle_error(z, col):
        if "error" in col:
            return 1 if "True" in z else 0
        else:
            return z
    dd = parse_qs(x)
    return {k: int(handle_error(dd[k][0], k)) for k in dd}

In [35]:
url_results = fr.url.apply(lambda x: urlparse(x))
domain_names = url_results.apply(lambda x: x[1])
item_query = url_results.apply(lambda x: x[4])
qq = item_query.apply(lambda x: convert_list(x))
qq = qq.apply(pd.Series).fillna(value=0)

In [ ]:


In [ ]:


In [33]:
dd = parse_qs(item_query[20])
ddd = {k: int(dd[k][0]) for k in dd}
ddd
# for k in dd:
#     dd[k] = int(dd[k][0])


Out[33]:
{'Round Kumquat': 1, 'Ume': 1}

In [36]:
qq.head()


Out[36]:
Bignay Black/White Pepper European Grape Hazelnut Mabolo Natal Orange Prairie Potato Round Kumquat Ume Ylang-ylang error
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
4 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

In [95]:
for it in item_query:
    if "True" in it:
        print(it)


Bignay=1&error=True

In [11]:
url_results.shape


Out[11]:
(33188,)

In [116]:
url_results = pd.concat([fr.timestamp, fr.customer_id, qq], axis=1)

In [117]:
url_results.head(10)


Out[117]:
timestamp customer_id Bignay Black/White Pepper European Grape Hazelnut Mabolo Natal Orange Prairie Potato Round Kumquat Ume Ylang-ylang error
0 2017-07-02 07:00:35 9418 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
1 2017-07-02 07:00:37 3872 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
2 2017-07-02 07:00:47 3090 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 2017-07-02 00:00:51 9556 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
4 2017-07-02 00:01:02 8845 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 2017-07-02 00:01:14 6008 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
6 2017-07-02 00:01:18 5312 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
7 2017-07-02 07:01:33 9178 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
8 2017-07-02 07:01:55 1680 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
9 2017-07-02 07:01:55 275 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0

In [27]:
int(url_results.url[0]["Ume"][0])


Out[27]:
1

In [30]:
url_results["url"].apply(pd.Series) #.apply(lambda x: int(0 if np.isnan(x[0]) else x[0])) #.fillna(value=0) #.apply(lambda x: int(x[0]))


Out[30]:
Bignay Black/White Pepper European Grape Hazelnut Mabolo Natal Orange Prairie Potato Round Kumquat Ume Ylang-ylang error
0 NaN NaN NaN NaN NaN NaN NaN NaN [1] NaN NaN
1 NaN NaN NaN NaN NaN NaN [1] NaN NaN NaN NaN
2 NaN NaN NaN [1] NaN NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN NaN NaN [1] NaN NaN
4 [1] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5 NaN [1] NaN NaN NaN NaN NaN NaN NaN NaN NaN
6 NaN NaN NaN NaN NaN [1] NaN NaN NaN NaN NaN
7 [1] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
8 NaN NaN NaN NaN NaN NaN NaN NaN [1] NaN NaN
9 NaN NaN NaN NaN NaN NaN [1] NaN NaN NaN NaN
10 NaN NaN NaN NaN NaN NaN [1] NaN NaN NaN NaN
11 [1] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
12 [1] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
13 [1] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
14 NaN NaN NaN NaN NaN NaN [1] NaN NaN NaN NaN
15 [1] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
16 NaN NaN NaN NaN NaN NaN NaN [1] NaN NaN NaN
17 NaN NaN NaN NaN NaN [1] NaN NaN NaN NaN NaN
18 NaN NaN NaN NaN NaN NaN NaN NaN [1] NaN NaN
19 NaN NaN NaN NaN NaN NaN NaN [1] NaN NaN NaN
20 NaN NaN NaN NaN NaN NaN NaN [1] [1] NaN NaN
21 NaN NaN NaN NaN NaN [1] NaN NaN NaN NaN NaN
22 NaN NaN NaN NaN NaN NaN [1] NaN NaN NaN NaN
23 NaN [1] [1] [3] [2] [1] [1] [1] NaN NaN NaN
24 NaN NaN NaN NaN NaN NaN NaN [1] NaN NaN NaN
25 NaN NaN NaN NaN NaN NaN [1] NaN NaN NaN NaN
26 NaN NaN NaN NaN NaN NaN [1] NaN NaN NaN NaN
27 NaN NaN [1] NaN NaN NaN NaN NaN NaN NaN NaN
28 [1] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
29 [1] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ...
33158 [1] [1] [1] NaN [1] [1] [1] [2] NaN [2] NaN
33159 NaN NaN NaN NaN NaN [1] NaN NaN NaN NaN NaN
33160 NaN NaN NaN NaN NaN NaN NaN [1] NaN NaN NaN
33161 [1] [1] NaN NaN NaN NaN NaN NaN NaN NaN NaN
33162 NaN NaN [1] NaN NaN NaN NaN NaN NaN NaN NaN
33163 NaN NaN NaN [1] NaN NaN NaN NaN NaN NaN NaN
33164 NaN NaN NaN NaN NaN NaN NaN NaN NaN [1] NaN
33165 NaN NaN NaN NaN NaN [1] NaN NaN NaN NaN NaN
33166 NaN NaN [1] NaN NaN NaN NaN NaN NaN NaN NaN
33167 NaN NaN NaN NaN NaN NaN [1] NaN [1] NaN NaN
33168 NaN NaN NaN NaN NaN NaN NaN NaN NaN [1] NaN
33169 NaN NaN NaN [1] NaN NaN NaN NaN [1] NaN NaN
33170 NaN NaN NaN NaN NaN [1] NaN NaN NaN NaN NaN
33171 [1] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
33172 NaN NaN NaN NaN NaN NaN NaN NaN [1] NaN NaN
33173 NaN NaN NaN [1] NaN NaN NaN NaN NaN NaN NaN
33174 NaN NaN NaN NaN NaN NaN NaN NaN [1] NaN NaN
33175 NaN NaN NaN NaN [1] NaN NaN NaN NaN NaN NaN
33176 NaN NaN NaN NaN NaN NaN NaN [1] NaN NaN NaN
33177 NaN NaN NaN NaN [1] NaN NaN NaN NaN NaN NaN
33178 NaN NaN NaN NaN [1] NaN NaN NaN NaN NaN NaN
33179 NaN NaN NaN NaN NaN NaN NaN NaN NaN [1] NaN
33180 NaN NaN NaN NaN NaN NaN NaN NaN NaN [1] NaN
33181 NaN NaN [1] NaN NaN NaN NaN NaN NaN NaN NaN
33182 NaN NaN NaN NaN NaN NaN NaN [1] NaN NaN NaN
33183 NaN NaN NaN NaN NaN NaN NaN NaN [1] NaN NaN
33184 [2] [2] NaN [2] NaN NaN NaN [3] [1] NaN NaN
33185 NaN NaN NaN NaN NaN NaN [1] NaN NaN NaN NaN
33186 NaN NaN NaN NaN NaN NaN NaN NaN [1] NaN NaN
33187 NaN NaN NaN NaN [1] NaN NaN NaN [1] NaN NaN

33188 rows × 11 columns


In [31]:
new_cols = url_results["url"].apply(pd.Series).fillna(value=0)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

TypeError: an integer is required

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-31-88cd69044e19> in <module>()
----> 1 new_cols = url_results["url"].apply(pd.Series).fillna(value=0)

~/anaconda3/lib/python3.6/site-packages/pandas/core/series.py in __getitem__(self, key)
    599         key = com._apply_if_callable(key, self)
    600         try:
--> 601             result = self.index.get_value(self, key)
    602 
    603             if not is_scalar(result):

~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_value(self, series, key)
   2475         try:
   2476             return self._engine.get_value(s, k,
-> 2477                                           tz=getattr(series.dtype, 'tz', None))
   2478         except KeyError as e1:
   2479             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

KeyError: 'url'

In [56]:



Out[56]:
Bignay Black/White Pepper European Grape Hazelnut Mabolo Natal Orange Prairie Potato Round Kumquat Ume Ylang-ylang error
0 0 0 0 0 0 0 0 0 [1] 0 0
1 0 0 0 0 0 0 [1] 0 0 0 0
2 0 0 0 [1] 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 [1] 0 0
4 [1] 0 0 0 0 0 0 0 0 0 0

In [49]:
new_cols.shape


Out[49]:
(33188, 11)

In [57]:
for col in new_cols.columns:
    print(new_cols[col].unique())


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-57-6c31b99a16ce> in <module>()
      1 for col in new_cols.columns:
----> 2     print(new_cols[col].unique())

~/anaconda3/lib/python3.6/site-packages/pandas/core/series.py in unique(self)
   1192     @Appender(base._shared_docs['unique'] % _shared_doc_kwargs)
   1193     def unique(self):
-> 1194         result = super(Series, self).unique()
   1195 
   1196         if is_datetime64tz_dtype(self.dtype):

~/anaconda3/lib/python3.6/site-packages/pandas/core/base.py in unique(self)
    970         else:
    971             from pandas.core.algorithms import unique1d
--> 972             result = unique1d(values)
    973 
    974         return result

~/anaconda3/lib/python3.6/site-packages/pandas/core/algorithms.py in unique(values)
    354 
    355     table = htable(len(values))
--> 356     uniques = table.unique(values)
    357     uniques = _reconstruct_data(uniques, dtype, original)
    358 

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.unique()

TypeError: unhashable type: 'list'

In [44]:
int(new_cols["Bignay"][4][0])


Out[44]:
1

In [53]:
def convert_to_num(x):
    if isinstance(x, list):
        try:
            xx = int(x[0])
        except Exception as ex:
            print(x[0])
        return int(x[0])
    else:
        return x

In [54]:
frames = [new_cols[col].apply(convert_to_num) for col in new_cols.columns]


True
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-54-b6750dee2c60> in <module>()
----> 1 frames = [new_cols[col].apply(convert_to_num) for col in new_cols.columns]

<ipython-input-54-b6750dee2c60> in <listcomp>(.0)
----> 1 frames = [new_cols[col].apply(convert_to_num) for col in new_cols.columns]

~/anaconda3/lib/python3.6/site-packages/pandas/core/series.py in apply(self, func, convert_dtype, args, **kwds)
   2353             else:
   2354                 values = self.asobject
-> 2355                 mapped = lib.map_infer(values, f, convert=convert_dtype)
   2356 
   2357         if len(mapped) and isinstance(mapped[0], Series):

pandas/_libs/src/inference.pyx in pandas._libs.lib.map_infer()

<ipython-input-53-cfa6290d7d71> in convert_to_num(x)
      5         except Exception as ex:
      6             print(x[0])
----> 7         return int(x[0])
      8     else:
      9         return x

ValueError: invalid literal for int() with base 10: 'True'

In [124]:
pr = urlparse("http://xyz.com/checkout?Ume=1")

In [127]:
pr


Out[127]:
ParseResult(scheme='http', netloc='xyz.com', path='/checkout', params='', query='Ume=1', fragment='')

In [37]:
fr.head()


Out[37]:
Bignay Black/White Pepper European Grape Hazelnut Mabolo Natal Orange Prairie Potato Round Kumquat Ume Ylang-ylang ... checkout_amount customer_id domain_name error file_date placeholder timestamp transaction_date url website_id
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 6.0 9418 example.com NaN 2017-07-02 NaN 2017-07-02 07:00:35 2017-07-02 http://www.example.com/store/?Ume=1 123
1 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 3.0 3872 xyz.com NaN 2017-07-02 NaN 2017-07-02 07:00:37 2017-07-02 http://xyz.com/checkout?Prairie+Potato=1 124
2 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 4.0 3090 example.com NaN 2017-07-02 NaN 2017-07-02 07:00:47 2017-07-02 http://www.example.com/store/?Hazelnut=1 123
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 6.0 9556 xyz.com NaN 2017-07-02 NaN 2017-07-02 00:00:51 2017-07-02 http://xyz.com/checkout?Ume=1 124
4 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 6.0 8845 xyz.com NaN 2017-07-02 NaN 2017-07-02 00:01:02 2017-07-02 http://xyz.com/checkout?Bignay=1 124

5 rows × 21 columns


In [30]:
item_cols = new_cols.columns.tolist()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-30-02c2ae1757b7> in <module>()
----> 1 item_cols = new_cols.columns.tolist()

NameError: name 'new_cols' is not defined

In [13]:
fr[item_lists + ['checkout_amount', 'timestamp', 'domain_name', 'transaction_date', 'customer_id']]


Out[13]:
Bignay Hazelnut Ylang-ylang Mabolo Ume Natal Orange Prairie Potato Black/White Pepper Round Kumquat error European Grape checkout_amount timestamp domain_name transaction_date customer_id
0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 NaN 0.0 6.0 2017-07-02 07:00:35 example.com 2017-07-02 9418
1 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 NaN 0.0 3.0 2017-07-02 07:00:37 xyz.com 2017-07-02 3872
2 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 4.0 2017-07-02 07:00:47 example.com 2017-07-02 3090
3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 NaN 0.0 6.0 2017-07-02 00:00:51 xyz.com 2017-07-02 9556
4 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 6.0 2017-07-02 00:01:02 xyz.com 2017-07-02 8845
5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 NaN 0.0 5.0 2017-07-02 00:01:14 example.com 2017-07-02 6008
6 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 NaN 0.0 6.0 2017-07-02 00:01:18 xyz.com 2017-07-02 5312
7 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 6.0 2017-07-02 07:01:33 xyz.com 2017-07-02 9178
8 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 NaN 0.0 6.0 2017-07-02 07:01:55 xyz.com 2017-07-02 1680
9 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 NaN 0.0 3.0 2017-07-02 07:01:55 example.com 2017-07-02 275
10 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 NaN 0.0 3.0 2017-07-02 07:02:02 example.com 2017-07-02 1073
11 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 6.0 2017-07-02 07:02:02 xyz.com 2017-07-02 7222
12 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 6.0 2017-07-02 00:02:08 example.com 2017-07-02 636
13 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 6.0 2017-07-02 00:02:19 example.com 2017-07-02 6854
14 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 NaN 0.0 3.0 2017-07-02 07:02:26 xyz.com 2017-07-02 440
15 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 6.0 2017-07-02 00:02:29 xyz.com 2017-07-02 6542
16 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 NaN 0.0 7.0 2017-07-02 07:02:31 example.com 2017-07-02 1953
17 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 NaN 0.0 6.0 2017-07-02 07:02:34 xyz.com 2017-07-02 1440
18 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 NaN 0.0 6.0 2017-07-02 07:02:41 xyz.com 2017-07-02 9350
19 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 NaN 0.0 7.0 2017-07-02 07:02:53 xyz.com 2017-07-02 3660
20 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 NaN 0.0 13.0 2017-07-02 07:02:56 example.com 2017-07-02 10778
21 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 NaN 0.0 6.0 2017-07-02 00:02:57 example.com 2017-07-02 9246
22 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 NaN 0.0 3.0 2017-07-02 07:03:15 example.com 2017-07-02 3787
23 0.0 3.0 0.0 2.0 0.0 1.0 1.0 1.0 1.0 NaN 1.0 54.0 2017-07-02 00:03:43 xyz.com 2017-07-02 11410
24 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 NaN 0.0 7.0 2017-07-02 07:03:48 example.com 2017-07-02 6977
25 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 NaN 0.0 3.0 2017-07-02 00:03:49 example.com 2017-07-02 5198
26 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 NaN 0.0 3.0 2017-07-02 07:04:10 example.com 2017-07-02 2532
27 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 1.0 5.0 2017-07-02 00:04:16 example.com 2017-07-02 2619
28 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 6.0 2017-07-02 07:04:22 example.com 2017-07-02 5145
29 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 6.0 2017-07-02 07:04:25 example.com 2017-07-02 1342
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
33158 1.0 0.0 2.0 1.0 0.0 1.0 1.0 1.0 2.0 0.0 1.0 57.0 2017-07-03 23:53:08 xyz.com 2017-07-03 11271
33159 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 6.0 2017-07-03 23:53:12 xyz.com 2017-07-03 80
33160 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 7.0 2017-07-03 23:53:29 example.com 2017-07-03 4701
33161 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 11.0 2017-07-03 23:54:06 xyz.com 2017-07-03 10112
33162 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 5.0 2017-07-03 23:54:09 xyz.com 2017-07-03 9739
33163 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0 2017-07-03 23:54:26 example.com 2017-07-03 8573
33164 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.0 2017-07-03 23:55:06 xyz.com 2017-07-03 4842
33165 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 6.0 2017-07-03 23:55:20 xyz.com 2017-07-03 7147
33166 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 5.0 2017-07-03 23:55:42 example.com 2017-07-03 4902
33167 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 9.0 2017-07-03 23:55:52 example.com 2017-07-03 10669
33168 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.0 2017-07-03 23:56:01 xyz.com 2017-07-03 6206
33169 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 2017-07-03 23:56:11 xyz.com 2017-07-03 10419
33170 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 6.0 2017-07-03 23:56:50 xyz.com 2017-07-03 72
33171 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 6.0 2017-07-03 23:57:00 example.com 2017-07-03 900
33172 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 6.0 2017-07-03 23:57:14 xyz.com 2017-07-03 5124
33173 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.0 2017-07-03 23:57:18 xyz.com 2017-07-03 384
33174 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 6.0 2017-07-03 23:57:45 example.com 2017-07-03 4741
33175 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 8.0 2017-07-03 23:57:48 example.com 2017-07-03 2697
33176 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 7.0 2017-07-03 23:57:53 example.com 2017-07-03 6036
33177 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 8.0 2017-07-03 23:58:04 xyz.com 2017-07-03 6448
33178 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 8.0 2017-07-03 23:58:06 xyz.com 2017-07-03 6817
33179 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.0 2017-07-03 23:58:15 example.com 2017-07-03 566
33180 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.0 2017-07-03 23:58:17 xyz.com 2017-07-03 3674
33181 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 5.0 2017-07-03 23:58:30 example.com 2017-07-03 6560
33182 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 7.0 2017-07-03 23:58:38 example.com 2017-07-03 3712
33183 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 6.0 2017-07-03 23:59:10 xyz.com 2017-07-03 3158
33184 2.0 2.0 0.0 0.0 1.0 0.0 0.0 2.0 3.0 0.0 0.0 57.0 2017-07-03 23:59:19 xyz.com 2017-07-03 11189
33185 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 3.0 2017-07-03 23:59:32 xyz.com 2017-07-03 7730
33186 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 6.0 2017-07-03 23:59:34 xyz.com 2017-07-03 8204
33187 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 14.0 2017-07-03 23:59:38 xyz.com 2017-07-03 10245

33188 rows × 16 columns


In [64]:
# data structure
# - transaction_timestamp
# - domain name
# - item
# - price
#ffr = fr[item_lists + ['checkout_amount', 'timestamp', 'domain_name', 'transaction_date', 'customer_id']]
fr[fr[item_lists].astype(bool).sum(axis=1) == 1].drop(["error"], axis=1).head()


Out[64]:
Bignay Black/White Pepper European Grape Hazelnut Mabolo Natal Orange Prairie Potato Round Kumquat Ume Ylang-ylang app_version checkout_amount customer_id domain_name file_date placeholder timestamp transaction_date url website_id
23207 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.1 5.0 4877 xyz.com 2017-07-03 NaN 2017-07-03 07:00:12 2017-07-03 http://xyz.com/checkout?Ylang-ylang=1 124
23208 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.1 3.0 3608 xyz.com 2017-07-03 NaN 2017-07-03 00:00:24 2017-07-03 http://xyz.com/checkout?Prairie+Potato=1 124
23209 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.1 7.0 2932 example.com 2017-07-03 NaN 2017-07-03 00:00:32 2017-07-03 http://www.example.com/store/?Round+Kumquat=1 123
23210 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 6.0 1529 example.com 2017-07-03 NaN 2017-07-03 07:00:32 2017-07-03 http://www.example.com/store/?Bignay=1 123
23211 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.1 7.0 2525 example.com 2017-07-03 NaN 2017-07-03 00:00:37 2017-07-03 http://www.example.com/store/?Round+Kumquat=1 123

In [183]:
cols = [item for item in item_lists if not "error" in item]
pricing_temp = fr[fr[cols].astype(bool).sum(axis=1) == 1].drop(["error"], axis=1)
pricing_temp.drop_duplicates(subset=cols + ["domain_name", "transaction_date"], inplace=True)
pricing_temp


Out[183]:
Bignay Black/White Pepper European Grape Hazelnut Mabolo Natal Orange Prairie Potato Round Kumquat Ume Ylang-ylang app_version checkout_amount customer_id domain_name file_date placeholder timestamp transaction_date url website_id
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.1 6.0 9418 example.com 2017-07-02 NaN 2017-07-02 07:00:35 2017-07-02 http://www.example.com/store/?Ume=1 123
1 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.1 3.0 3872 xyz.com 2017-07-02 NaN 2017-07-02 07:00:37 2017-07-02 http://xyz.com/checkout?Prairie+Potato=1 124
2 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 4.0 3090 example.com 2017-07-02 NaN 2017-07-02 07:00:47 2017-07-02 http://www.example.com/store/?Hazelnut=1 123
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.1 6.0 9556 xyz.com 2017-07-02 NaN 2017-07-02 00:00:51 2017-07-02 http://xyz.com/checkout?Ume=1 124
4 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 6.0 8845 xyz.com 2017-07-02 NaN 2017-07-02 00:01:02 2017-07-02 http://xyz.com/checkout?Bignay=1 124
5 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 5.0 6008 example.com 2017-07-02 NaN 2017-07-02 00:01:14 2017-07-02 http://www.example.com/store/?Black%2FWhite+Pe... 123
6 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.1 6.0 5312 xyz.com 2017-07-02 NaN 2017-07-02 00:01:18 2017-07-02 http://xyz.com/checkout?Natal+Orange=1 124
9 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.1 3.0 275 example.com 2017-07-02 NaN 2017-07-02 07:01:55 2017-07-02 http://www.example.com/store/?Prairie+Potato=1 123
12 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 6.0 636 example.com 2017-07-02 NaN 2017-07-02 00:02:08 2017-07-02 http://www.example.com/store/?Bignay=1 123
16 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.1 7.0 1953 example.com 2017-07-02 NaN 2017-07-02 07:02:31 2017-07-02 http://www.example.com/store/?Round+Kumquat=1 123
19 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.1 7.0 3660 xyz.com 2017-07-02 NaN 2017-07-02 07:02:53 2017-07-02 http://xyz.com/checkout?Round+Kumquat=1 124
21 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.1 6.0 9246 example.com 2017-07-02 NaN 2017-07-02 00:02:57 2017-07-02 http://www.example.com/store/?Natal+Orange=1 123
27 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 5.0 2619 example.com 2017-07-02 NaN 2017-07-02 00:04:16 2017-07-02 http://www.example.com/store/?European+Grape=1 123
30 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 5.0 4429 xyz.com 2017-07-02 NaN 2017-07-02 07:04:28 2017-07-02 http://xyz.com/checkout?European+Grape=1 124
35 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 4.0 8801 xyz.com 2017-07-02 NaN 2017-07-02 00:04:51 2017-07-02 http://xyz.com/checkout?Hazelnut=1 124
37 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.1 8.0 4802 xyz.com 2017-07-02 NaN 2017-07-02 00:05:24 2017-07-02 http://xyz.com/checkout?Mabolo=1 124
38 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.1 8.0 7316 example.com 2017-07-02 NaN 2017-07-02 00:05:24 2017-07-02 http://www.example.com/store/?Mabolo=1 123
50 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.1 5.0 1563 example.com 2017-07-02 NaN 2017-07-02 07:06:56 2017-07-02 http://www.example.com/store/?Ylang-ylang=1 123
68 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.1 5.0 6717 xyz.com 2017-07-02 NaN 2017-07-02 00:09:56 2017-07-02 http://xyz.com/checkout?Ylang-ylang=1 124
70 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 5.0 9004 xyz.com 2017-07-02 NaN 2017-07-02 07:10:03 2017-07-02 http://xyz.com/checkout?Black%2FWhite+Pepper=1 124
90 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 1.1 12.0 10296 example.com 2017-07-02 NaN 2017-07-02 00:12:41 2017-07-02 http://www.example.com/store/?Natal+Orange=2 123
290 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 1.1 14.0 10855 example.com 2017-07-02 NaN 2017-07-02 00:35:50 2017-07-02 http://www.example.com/store/?Round+Kumquat=2 123
301 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 8.0 10267 xyz.com 2017-07-02 NaN 2017-07-02 00:37:11 2017-07-02 http://xyz.com/checkout?Hazelnut=2 124
318 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 1.1 12.0 10908 xyz.com 2017-07-02 NaN 2017-07-02 00:38:38 2017-07-02 http://xyz.com/checkout?Natal+Orange=2 124
475 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 12.0 10706 xyz.com 2017-07-02 NaN 2017-07-02 07:59:06 2017-07-02 http://xyz.com/checkout?Bignay=2 124
628 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 1.1 10.0 10322 example.com 2017-07-02 NaN 2017-07-02 01:17:09 2017-07-02 http://www.example.com/store/?Ylang-ylang=2 123
679 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 10.0 10084 xyz.com 2017-07-02 NaN 2017-07-02 01:23:09 2017-07-02 http://xyz.com/checkout?European+Grape=2 124
699 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 1.1 12.0 10751 xyz.com 2017-07-02 NaN 2017-07-02 08:25:57 2017-07-02 http://xyz.com/checkout?Ume=2 124
753 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 1.1 6.0 10873 example.com 2017-07-02 NaN 2017-07-02 08:32:02 2017-07-02 http://www.example.com/store/?Prairie+Potato=2 123
825 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 1.1 14.0 10498 xyz.com 2017-07-02 NaN 2017-07-02 01:39:59 2017-07-02 http://xyz.com/checkout?Round+Kumquat=2 124
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
11704 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 12.0 10832 xyz.com 2017-07-01 NaN 2017-07-01 00:19:27 2017-07-01 http://xyz.com/checkout?Bignay=2 124
11715 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 10.0 10043 xyz.com 2017-07-01 NaN 2017-07-01 07:21:20 2017-07-01 http://xyz.com/checkout?European+Grape=2 124
11767 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 10.0 10662 example.com 2017-07-01 NaN 2017-07-01 00:26:17 2017-07-01 http://www.example.com/store/?Black%2FWhite+Pe... 123
12132 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 1.1 6.0 10799 example.com 2017-07-01 NaN 2017-07-01 08:11:08 2017-07-01 http://www.example.com/store/?Prairie+Potato=2 123
12530 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 10.0 10357 example.com 2017-07-01 NaN 2017-07-01 02:03:00 2017-07-01 http://www.example.com/store/?European+Grape=2 123
12609 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 10.0 10164 xyz.com 2017-07-01 NaN 2017-07-01 02:13:11 2017-07-01 http://xyz.com/checkout?Black%2FWhite+Pepper=2 124
12673 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 1.1 10.0 10184 example.com 2017-07-01 NaN 2017-07-01 02:21:58 2017-07-01 http://www.example.com/store/?Ylang-ylang=2 123
12913 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 1.1 16.0 10324 example.com 2017-07-01 NaN 2017-07-01 02:52:32 2017-07-01 http://www.example.com/store/?Mabolo=2 123
12975 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 8.0 10317 example.com 2017-07-01 NaN 2017-07-01 10:00:37 2017-07-01 http://www.example.com/store/?Hazelnut=2 123
12978 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 1.1 14.0 10359 example.com 2017-07-01 NaN 2017-07-01 03:00:49 2017-07-01 http://www.example.com/store/?Round+Kumquat=2 123
13014 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 1.1 6.0 10651 xyz.com 2017-07-01 NaN 2017-07-01 03:06:41 2017-07-01 http://xyz.com/checkout?Prairie+Potato=2 124
13428 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 1.1 12.0 10207 example.com 2017-07-01 NaN 2017-07-01 03:56:27 2017-07-01 http://www.example.com/store/?Ume=2 123
13459 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 12.0 10039 example.com 2017-07-01 NaN 2017-07-01 04:00:26 2017-07-01 http://www.example.com/store/?Bignay=2 123
13537 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 1.1 10.0 10920 xyz.com 2017-07-01 NaN 2017-07-01 04:08:41 2017-07-01 http://xyz.com/checkout?Ylang-ylang=2 124
13561 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 1.1 14.0 10489 xyz.com 2017-07-01 NaN 2017-07-01 11:12:38 2017-07-01 http://xyz.com/checkout?Round+Kumquat=2 124
13995 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 1.1 12.0 10904 xyz.com 2017-07-01 NaN 2017-07-01 12:07:08 2017-07-01 http://xyz.com/checkout?Ume=2 124
14316 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 1.1 16.0 10364 xyz.com 2017-07-01 NaN 2017-07-01 05:47:39 2017-07-01 http://xyz.com/checkout?Mabolo=2 124
14607 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 1.1 12.0 10324 example.com 2017-07-01 NaN 2017-07-01 13:24:40 2017-07-01 http://www.example.com/store/?Natal+Orange=2 123
17349 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 8.0 10117 xyz.com 2017-07-01 NaN 2017-07-01 11:55:22 2017-07-01 http://xyz.com/checkout?Hazelnut=2 124
23369 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 10.0 10936 example.com 2017-07-03 NaN 2017-07-03 07:19:15 2017-07-03 http://www.example.com/store/?Black%2FWhite+Pe... 123
23852 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 8.0 10033 xyz.com 2017-07-03 NaN 2017-07-03 08:17:39 2017-07-03 http://xyz.com/checkout?Hazelnut=2 124
23873 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 12.0 10009 example.com 2017-07-03 NaN 2017-07-03 01:19:20 2017-07-03 http://www.example.com/store/?Bignay=2 123
23878 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.1 -12.0 10113 xyz.com 2017-07-03 NaN 2017-07-03 08:20:25 2017-07-03 http://xyz.com/checkout?Bignay=2 124
24209 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 1.1 6.0 10698 xyz.com 2017-07-03 NaN 2017-07-03 08:58:33 2017-07-03 http://xyz.com/checkout?Prairie+Potato=2 124
24279 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 1.1 6.0 10964 example.com 2017-07-03 NaN 2017-07-03 02:05:54 2017-07-03 http://www.example.com/store/?Prairie+Potato=2 123
25534 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 1.2 -12.0 10098 xyz.com 2017-07-03 new_version 2017-07-03 11:44:53 2017-07-03 http://xyz.com/checkout?Natal+Orange=2 124
26111 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 1.2 16.0 10889 xyz.com 2017-07-03 new_version 2017-07-03 05:57:41 2017-07-03 http://xyz.com/checkout?Mabolo=2 124
26435 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 1.2 12.0 10806 example.com 2017-07-03 new_version 2017-07-03 06:37:45 2017-07-03 http://store.example.com/?Ume=2 123
26874 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 1.2 10.0 10278 example.com 2017-07-03 new_version 2017-07-03 14:33:04 2017-07-03 http://store.example.com/?Ylang-ylang=2 123
28560 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 1.2 12.0 10225 example.com 2017-07-03 new_version 2017-07-03 18:02:53 2017-07-03 http://store.example.com/?Natal+Orange=2 123

120 rows × 20 columns


In [180]:
pricing_temp.transpose()


Out[180]:
0 1 2 3 4 5 6 9 12 16 ... 23852 23873 23878 24209 24279 25534 26111 26435 26874 28560
Bignay 0 0 0 0 1 0 0 0 1 0 ... 0 2 2 0 0 0 0 0 0 0
Black/White Pepper 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
European Grape 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
Hazelnut 0 0 1 0 0 0 0 0 0 0 ... 2 0 0 0 0 0 0 0 0 0
Mabolo 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 2 0 0 0
Natal Orange 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 2 0 0 0 2
Prairie Potato 0 1 0 0 0 0 0 1 0 0 ... 0 0 0 2 2 0 0 0 0 0
Round Kumquat 0 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
Ume 1 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 2 0 0
Ylang-ylang 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 2 0
app_version 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ... 1.1 1.1 1.1 1.1 1.1 1.2 1.2 1.2 1.2 1.2
checkout_amount 6 3 4 6 6 5 6 3 6 7 ... 8 12 -12 6 6 -12 16 12 10 12
customer_id 9418 3872 3090 9556 8845 6008 5312 275 636 1953 ... 10033 10009 10113 10698 10964 10098 10889 10806 10278 10225
domain_name example.com xyz.com example.com xyz.com xyz.com example.com xyz.com example.com example.com example.com ... xyz.com example.com xyz.com xyz.com example.com xyz.com xyz.com example.com example.com example.com
file_date 2017-07-02 2017-07-02 2017-07-02 2017-07-02 2017-07-02 2017-07-02 2017-07-02 2017-07-02 2017-07-02 2017-07-02 ... 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03
placeholder NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN new_version new_version new_version new_version new_version
timestamp 2017-07-02 07:00:35 2017-07-02 07:00:37 2017-07-02 07:00:47 2017-07-02 00:00:51 2017-07-02 00:01:02 2017-07-02 00:01:14 2017-07-02 00:01:18 2017-07-02 07:01:55 2017-07-02 00:02:08 2017-07-02 07:02:31 ... 2017-07-03 08:17:39 2017-07-03 01:19:20 2017-07-03 08:20:25 2017-07-03 08:58:33 2017-07-03 02:05:54 2017-07-03 11:44:53 2017-07-03 05:57:41 2017-07-03 06:37:45 2017-07-03 14:33:04 2017-07-03 18:02:53
transaction_date 2017-07-02 2017-07-02 2017-07-02 2017-07-02 2017-07-02 2017-07-02 2017-07-02 2017-07-02 2017-07-02 2017-07-02 ... 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03
url http://www.example.com/store/?Ume=1 http://xyz.com/checkout?Prairie+Potato=1 http://www.example.com/store/?Hazelnut=1 http://xyz.com/checkout?Ume=1 http://xyz.com/checkout?Bignay=1 http://www.example.com/store/?Black%2FWhite+Pe... http://xyz.com/checkout?Natal+Orange=1 http://www.example.com/store/?Prairie+Potato=1 http://www.example.com/store/?Bignay=1 http://www.example.com/store/?Round+Kumquat=1 ... http://xyz.com/checkout?Hazelnut=2 http://www.example.com/store/?Bignay=2 http://xyz.com/checkout?Bignay=2 http://xyz.com/checkout?Prairie+Potato=2 http://www.example.com/store/?Prairie+Potato=2 http://xyz.com/checkout?Natal+Orange=2 http://xyz.com/checkout?Mabolo=2 http://store.example.com/?Ume=2 http://store.example.com/?Ylang-ylang=2 http://store.example.com/?Natal+Orange=2
website_id 123 124 123 124 124 123 124 123 123 123 ... 124 123 124 124 123 124 124 123 123 123

20 rows × 120 columns


In [191]:
# data structure
# - transaction_timestamp
# - domain name
# - item
# - price
price_cols = []
for col in cols:
    price_cols.append(pricing_temp["checkout_amount"]/pricing_temp[col])
pricing = pd.concat(price_cols, axis=1)
pricing.columns = cols
price_cols = [col + "_price" for col in cols]
dfr = pricing_temp.join(pricing, rsuffix="_price")[price_cols + ["transaction_date", "domain_name"]]
dfr = dfr.replace([np.inf, -np.inf], np.nan).fillna(value=0)
pd.pivot_table(dfr, values=price_cols, index="transaction_date", columns="domain_name", aggfunc=np.max).transpose()
# pd.pivot_table(dfr, values=price_cols, index=["transaction_date"], columns=["domain_name"])
# dfr = dfr.replace([np.inf, -np.inf], np.nan).fillna(value=0)
# dfrt = dfr.transpose()
# t_new_cols = ["col_" + str(col) for col in dfrt.columns.tolist()]
# dfrt.columns = t_new_cols
# dfrt = dfrt.reset_index(drop=False).rename(columns={"index": "temp"})
# dfrt.transpose()
# pd.pivot_table(dfrt, columns=["transaction_date", "domain_name"], values=t_new_cols, index="temp")
# dfr["0"]
# px_cols = [col for col in dfr.columns if not "temp" in col]
# px_cols
# pd.pivot_table(dfr, columns=index, index=["transaction_date", "domain_nam"], values=px_cols, aggfunc=np.max)
# pricing["checkout_amount"].div(pricing[cols], axis="index")
# pricing["checkout_amount"]  #.div(pricing[cols], axis=0)
# there are now values > 1 for the items, so we replace them with 1
# pricing[cols] > 1
# pricing[pricing[cols] > 1]
# pricing.transpose()


Out[191]:
transaction_date 2017-07-01 2017-07-02 2017-07-03
domain_name
Bignay_price example.com 6.0 6.0 6.0
xyz.com 6.0 6.0 6.0
Black/White Pepper_price example.com 5.0 5.0 5.0
xyz.com 5.0 5.0 5.0
European Grape_price example.com 5.0 5.0 5.0
xyz.com 5.0 5.0 5.0
Hazelnut_price example.com 4.0 4.0 4.0
xyz.com 4.0 4.0 4.0
Mabolo_price example.com 8.0 8.0 8.0
xyz.com 8.0 8.0 8.0
Natal Orange_price example.com 6.0 6.0 6.0
xyz.com 6.0 6.0 6.0
Prairie Potato_price example.com 3.0 3.0 3.0
xyz.com 3.0 3.0 3.0
Round Kumquat_price example.com 7.0 7.0 7.0
xyz.com 7.0 7.0 7.0
Ume_price example.com 6.0 6.0 6.0
xyz.com 6.0 6.0 6.0
Ylang-ylang_price example.com 5.0 5.0 5.0
xyz.com 5.0 5.0 5.0

In [74]:
dd = {}
for col in item_lists:
    if not "error" in col:
        dd[col + "_price"] = np.max(pricing[col] * pricing["checkout_amount"])
dd


Out[74]:
{'Bignay_price': 24.0,
 'Black/White Pepper_price': 20.0,
 'European Grape_price': 20.0,
 'Hazelnut_price': 16.0,
 'Mabolo_price': 32.0,
 'Natal Orange_price': 24.0,
 'Prairie Potato_price': 12.0,
 'Round Kumquat_price': 28.0,
 'Ume_price': 24.0,
 'Ylang-ylang_price': 20.0}

In [30]:
pd.pivot_table(pricing, values="checkout_amount", index=, columns="transaction_date")


Out[30]:
transaction_date 2017-07-03
domain_name
example.com 5.583508
xyz.com 5.542160

In [38]:
pricing.transpose()


Out[38]:
23207 23208 23209 23210 23211 23212 23213 23214 23215 23217 ... 33176 33177 33178 33179 33180 33181 33182 33183 33185 33186
Bignay 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
Black/White Pepper 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
European Grape 0 0 0 0 0 1 1 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
Hazelnut 0 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
Mabolo 0 0 0 0 0 0 0 0 0 0 ... 0 1 1 0 0 0 0 0 0 0
Natal Orange 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
Prairie Potato 0 1 0 0 0 0 0 1 0 1 ... 0 0 0 0 0 0 0 0 1 0
Round Kumquat 0 0 1 0 1 0 0 0 0 0 ... 1 0 0 0 0 0 1 0 0 0
Ume 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 1
Ylang-ylang 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 1 0 0 0 0 0
app_version 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ... 1.2 1.2 1.2 1.2 1.2 1.2 1.2 1.2 1.2 1.2
checkout_amount 5 3 7 6 7 5 5 3 4 3 ... 7 8 8 5 5 5 7 6 3 6
customer_id 4877 3608 2932 1529 2525 1545 3821 946 3918 7835 ... 6036 6448 6817 566 3674 6560 3712 3158 7730 8204
domain_name xyz.com xyz.com example.com example.com example.com example.com example.com xyz.com example.com example.com ... example.com xyz.com xyz.com example.com xyz.com example.com example.com xyz.com xyz.com xyz.com
error 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
file_date 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 ... 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03
placeholder NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... new_version new_version new_version new_version new_version new_version new_version new_version new_version new_version
timestamp 2017-07-03 07:00:12 2017-07-03 00:00:24 2017-07-03 00:00:32 2017-07-03 07:00:32 2017-07-03 00:00:37 2017-07-03 07:00:41 2017-07-03 00:00:49 2017-07-03 00:00:49 2017-07-03 07:00:59 2017-07-03 07:01:12 ... 2017-07-03 23:57:53 2017-07-03 23:58:04 2017-07-03 23:58:06 2017-07-03 23:58:15 2017-07-03 23:58:17 2017-07-03 23:58:30 2017-07-03 23:58:38 2017-07-03 23:59:10 2017-07-03 23:59:32 2017-07-03 23:59:34
transaction_date 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 ... 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03 2017-07-03
url http://xyz.com/checkout?Ylang-ylang=1 http://xyz.com/checkout?Prairie+Potato=1 http://www.example.com/store/?Round+Kumquat=1 http://www.example.com/store/?Bignay=1 http://www.example.com/store/?Round+Kumquat=1 http://www.example.com/store/?European+Grape=1 http://www.example.com/store/?European+Grape=1 http://xyz.com/checkout?Prairie+Potato=1 http://www.example.com/store/?Hazelnut=1 http://www.example.com/store/?Prairie+Potato=1 ... http://store.example.com/?Round+Kumquat=1 http://xyz.com/checkout?Mabolo=1 http://xyz.com/checkout?Mabolo=1 http://store.example.com/?Ylang-ylang=1 http://xyz.com/checkout?Ylang-ylang=1 http://store.example.com/?European+Grape=1 http://store.example.com/?Round+Kumquat=1 http://xyz.com/checkout?Ume=1 http://xyz.com/checkout?Prairie+Potato=1 http://xyz.com/checkout?Ume=1
website_id 124 124 123 123 123 123 123 124 123 123 ... 123 124 124 123 124 123 123 124 124 124

21 rows × 8693 columns


In [18]:
pd.pivot_table(fr, values="checkout_amount", index="transaction_date", columns="domain_name",
               aggfunc=[np.sum], margins=True)

In [19]:



Out[19]:
a b c d
0 1 0 0 0
1 0 2 0 0
2 0 0 0 0
3 3 0 0 1
4 0 0 6 0

In [20]:
# print(df.loc[df['B'].isin(['one','three'])])
for col in dfr.columns.tolist():
    dfr[col]


a
b
c
d

In [21]:
two = pd.DataFrame({'one' : pd.Series([10, 0, 10, 9], index=['a', 'b', 'c', 'd']), 'two' : pd.Series([0, 0, 10., 4.6], index=['a', 'b', 'c', 'd']), 'three' : pd.Series([5, -1, 7, -1], index=['a', 'b', 'c', 'd'])})

In [22]:
two


Out[22]:
one three two
a 10 5 0.0
b 0 -1 0.0
c 10 7 10.0
d 9 -1 4.6

In [26]:
dfr[dfr.astype(bool).sum(axis=1) == 1]


Out[26]:
a b c d
0 1 0 0 0
1 0 2 0 0
4 0 0 6 0

In [25]:
gr = fr[item_lists + ["transaction_date", "domain_name", "checkout_amount"]]

In [26]:
frame = fr[item_lists + ["transaction_date", "domain_name", "checkout_amount"]]
gr = frame.groupby(["transaction_date", "domain_name"])

In [33]:
gb_frames = []
for name, group in gr:
    gb_frames.append({"name": name, "frame": group})

In [35]:
gb_frames[0]["name"][0]


Out[35]:
'2017-07-01'

In [37]:
corr = gb_frames[0]["frame"][item_lists].corr()
ax = sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values, linewidths=.5,
                 cmap="YlGnBu")
ax = plt.title("Test")
plt.tight_layout()



In [ ]: