In [1]:
import os
import os.path
from urllib.parse import urlparse
from urllib.parse import parse_qs
from datetime import datetime
%matplotlib inline
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import boto3
import numpy as np
import pandas as pd
In [2]:
s3 = boto3.resource('s3')
bucket_name = "postie-testing-assets"
In [3]:
test = s3.Bucket(bucket_name)
In [4]:
s3.meta.client.head_bucket(Bucket=bucket_name)  # raises a ClientError if the bucket is missing or inaccessible
Out[4]:
In [5]:
for key in test.objects.all():
    print(key.key)
An alternative is to download the data using
aws s3 cp --recursive s3://my_bucket_name local_folder
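If the CLI is not at hand, the same download can be scripted with boto3. A minimal sketch, assuming flat keys in the bucket and the data folder used by the loading loop below:
os.makedirs("data", exist_ok=True)
for obj in test.objects.all():
    # Bucket.download_file(Key, Filename) streams each object to disk
    test.download_file(obj.key, os.path.join("data", obj.key))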
In [2]:
def convert_list(query_string):
    """Parse the query string of the url into a dictionary.
    Handle special cases:
    - There is a single query "error=True" which is rewritten to 1 if True, else 0.
    - Parsing the query returns a dictionary of key-value pairs. The value is a list.
      We must get the list value as an int.
    Note: This function may be a bottleneck when processing larger data files.
    """
    def handle_error(z, col):
        """Called in the dictionary comprehension below to handle the "error" key."""
        if "error" in col:
            return 1 if "True" in z else 0
        return z
    dd = parse_qs(query_string)
    return {k: int(handle_error(dd[k][0], k)) for k in dd}
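A minimal illustration on a hypothetical query string:
convert_list("Ume=1&error=True")  # -> {'Ume': 1, 'error': 1}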
In [23]:
fr = pd.DataFrame()
col_names = ["timestamp", "website_id", "customer_id", "app_version", "placeholder", "checkout_amount", "url"]
data_report = []
item_lists = []
for fname in os.listdir("data"):
    ffr = pd.read_csv(os.path.join("data", fname),
                      header=0, names=col_names,
                      infer_datetime_format=True, parse_dates=[0])
    file_date = fname.split(".")[0]
    ffr["file_date"] = file_date
    transaction_date = ffr.timestamp.apply(lambda x: x.strftime('%Y-%m-%d'))  # reformat transaction timestamp
    ffr["transaction_date"] = transaction_date
    url_items = ffr.url.apply(urlparse)
    domain_name = url_items.apply(lambda x: x[1])  # netloc
    item_query = url_items.apply(lambda x: x[4])   # query string
    # handle store.example.com and www.example.com as the same website
    ffr["domain_name"] = domain_name.apply(lambda x: x if "example.com" not in x else ".".join(x.split(".")[1:]))
    qq = item_query.apply(convert_list).apply(pd.Series).fillna(value=0)
    item_lists += qq.columns.tolist()
    final_fr = ffr.join(qq)
    print("date {} has {} sales for rows {} and unique dates {}".format(fname, ffr.checkout_amount.sum(),
                                                                        ffr.shape[0],
                                                                        transaction_date.unique().shape[0]))
    data_report.append({"file_date": file_date, "sales": ffr.checkout_amount.sum(),
                        "n_placeholder_nan": sum(ffr.placeholder.isnull()),
                        "n_rows": ffr.shape[0],
                        "n_websites": ffr.website_id.unique().shape[0],
                        "n_customers": ffr.customer_id.unique().shape[0],
                        "n_app_versions": ffr.app_version.unique().shape[0],
                        "n_dates": transaction_date.unique().shape[0]})
    fr = fr.append(final_fr)
fr.reset_index(drop=True, inplace=True)
item_lists = list(set([item for item in item_lists if "error" not in item]))
fr.shape
Out[23]:
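Growing fr with append inside the loop copies the accumulated frame on every iteration. A sketch of the usual alternative, collecting frames in a list and concatenating once (shown on the raw files only, without the per-file enrichment above):
frames = [pd.read_csv(os.path.join("data", fname),
                      header=0, names=col_names,
                      infer_datetime_format=True, parse_dates=[0])
          for fname in os.listdir("data")]
fr_alt = pd.concat(frames, ignore_index=True)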
In [8]:
fr.head()
Out[8]:
In [12]:
sns.boxplot(x="transaction_date", y="checkout_amount", data=fr)
Out[12]:
In [317]:
pt = pd.pivot_table(fr, values="checkout_amount", index="transaction_date", columns="domain_name",
aggfunc=[np.sum], margins=False)
In [9]:
from scipy.stats import linregress
pt = pd.pivot_table(fr, values="checkout_amount", index="transaction_date", columns="domain_name",
aggfunc=[np.sum], margins=False)
pt.columns = ['example.com', 'xyz.com']
pt.index = pd.DatetimeIndex(pt.index)
idx = pd.date_range(pt.index.min(), pt.index.max())
pt = pt.reindex(index=idx)
# ordinal day index (1..n) over the reindexed date range, used as the regression x variable
pt.insert(pt.shape[1],
          'row_count',
          pt.index.value_counts().sort_index().cumsum())
slope, intercept, r_value, p_value, std_err = linregress(x=pt["row_count"].values,
y=pt["example.com"].values)
pt["example.com regression line"] = intercept + slope * pt["row_count"]
pt[["example.com", "example.com regression line"]].plot()
# ax = sns.regplot(data=pt, x="row_count", y="example.com")
# xticks = ax.get_xticks()
# labels = ['' for item in ax.get_xticklabels()]
# labels[1] = pt.index[0].date()
# labels[5] = pt.index[1].date()
# labels[9] = pt.index[2].date()
# ax.set_xticklabels(labels)
# plt.xticks(rotation=40)
# plt.xlabel("Transaction Date")
# plt.show()
# pt
# correlation heatmap of the pivoted daily sales (sns is already imported above; corr is computed on pt)
corr = pt.corr()
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values)
Out[9]:
In [344]:
slope, intercept, r_value, p_value, std_err = linregress(x=pt["row_count"].values,
                                                         y=pt["example.com"].values)
print("Predicted sales 7/4/2014 is {}".format(intercept + slope * 4))
In [375]:
# ax.get_xticks()  # only valid when the sns.regplot block above is uncommented
In [376]:
pt.row_count
Out[376]:
In [18]:
def convert_list(x):
    def handle_error(z, col):
        if "error" in col:
            return 1 if "True" in z else 0
        return z
    dd = parse_qs(x)
    return {k: int(handle_error(dd[k][0], k)) for k in dd}
In [35]:
url_results = fr.url.apply(lambda x: urlparse(x))
domain_names = url_results.apply(lambda x: x[1])
item_query = url_results.apply(lambda x: x[4])
qq = item_query.apply(lambda x: convert_list(x))
qq = qq.apply(pd.Series).fillna(value=0)
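As the docstring on convert_list warns, per-row parse_qs can become a bottleneck on larger files. If only the error flag were needed, a vectorized check would be far cheaper; a sketch (not a replacement for the full item parse):
error_flag = item_query.str.contains("error=True").astype(int)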
In [33]:
dd = parse_qs(item_query[20])
ddd = {k: int(dd[k][0]) for k in dd}
ddd
# for k in dd:
# dd[k] = int(dd[k][0])
Out[33]:
In [36]:
qq.head()
Out[36]:
In [95]:
for it in item_query:
    if "True" in it:
        print(it)
In [11]:
url_results.shape
Out[11]:
In [116]:
url_results = pd.concat([fr.timestamp, fr.customer_id, qq], axis=1)
In [117]:
url_results.head(10)
Out[117]:
In [27]:
int(url_results.url[0]["Ume"][0])
Out[27]:
In [30]:
url_results["url"].apply(pd.Series) #.apply(lambda x: int(0 if np.isnan(x[0]) else x[0])) #.fillna(value=0) #.apply(lambda x: int(x[0]))
Out[30]:
In [31]:
new_cols = url_results["url"].apply(pd.Series).fillna(value=0)
In [49]:
new_cols.shape
Out[49]:
In [57]:
for col in new_cols.columns:
    print(new_cols[col].unique())
In [44]:
int(new_cols["Bignay"][4][0])
Out[44]:
In [53]:
def convert_to_num(x):
    """Unwrap single-element lists produced by parse_qs into ints."""
    if isinstance(x, list):
        try:
            return int(x[0])
        except Exception:
            print(x[0])  # log the value that failed to convert, then re-raise
            raise
    return x
In [54]:
frames = [new_cols[col].apply(convert_to_num) for col in new_cols.columns]
In [124]:
pr = urlparse("http://xyz.com/checkout?Ume=1")
In [127]:
pr
Out[127]:
In [37]:
fr.head()
Out[37]:
In [30]:
item_cols = new_cols.columns.tolist()
In [13]:
fr[item_lists + ['checkout_amount', 'timestamp', 'domain_name', 'transaction_date', 'customer_id']]
Out[13]:
In [64]:
# data structure
# - transaction_timestamp
# - domain name
# - item
# - price
#ffr = fr[item_lists + ['checkout_amount', 'timestamp', 'domain_name', 'transaction_date', 'customer_id']]
fr[fr[item_lists].astype(bool).sum(axis=1) == 1].drop(["error"], axis=1).head()
Out[64]:
In [183]:
cols = [item for item in item_lists if not "error" in item]
pricing_temp = fr[fr[cols].astype(bool).sum(axis=1) == 1].drop(["error"], axis=1)
pricing_temp.drop_duplicates(subset=cols + ["domain_name", "transaction_date"], inplace=True)
pricing_temp
Out[183]:
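The trick in the cell above: for transactions containing exactly one item type, the unit price is simply checkout_amount divided by the quantity. A toy illustration using two item names that appear in the data (Ume, Bignay):
toy = pd.DataFrame({"Ume": [2, 0], "Bignay": [0, 3], "checkout_amount": [10.0, 7.5]})
single = toy[toy[["Ume", "Bignay"]].astype(bool).sum(axis=1) == 1]
single["checkout_amount"] / single[["Ume", "Bignay"]].max(axis=1)  # unit prices: 5.0 and 2.5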
In [180]:
pricing_temp.transpose()
Out[180]:
In [191]:
# data structure
# - transaction_timestamp
# - domain name
# - item
# - price
price_cols = []
for col in cols:
    price_cols.append(pricing_temp["checkout_amount"] / pricing_temp[col])
pricing = pd.concat(price_cols, axis=1)
pricing.columns = cols
price_cols = [col + "_price" for col in cols]
dfr = pricing_temp.join(pricing, rsuffix="_price")[price_cols + ["transaction_date", "domain_name"]]
dfr = dfr.replace([np.inf, -np.inf], np.nan).fillna(value=0)
pd.pivot_table(dfr, values=price_cols, index="transaction_date", columns="domain_name", aggfunc=np.max).transpose()
# pd.pivot_table(dfr, values=price_cols, index=["transaction_date"], columns=["domain_name"])
# dfr = dfr.replace([np.inf, -np.inf], np.nan).fillna(value=0)
# dfrt = dfr.transpose()
# t_new_cols = ["col_" + str(col) for col in dfrt.columns.tolist()]
# dfrt.columns = t_new_cols
# dfrt = dfrt.reset_index(drop=False).rename(columns={"index": "temp"})
# dfrt.transpose()
# pd.pivot_table(dfrt, columns=["transaction_date", "domain_name"], values=t_new_cols, index="temp")
# dfr["0"]
# px_cols = [col for col in dfr.columns if not "temp" in col]
# px_cols
# pd.pivot_table(dfr, columns=index, index=["transaction_date", "domain_nam"], values=px_cols, aggfunc=np.max)
# pricing["checkout_amount"].div(pricing[cols], axis="index")
# pricing["checkout_amount"] #.div(pricing[cols], axis=0)
# there are now values > 1 for the items, so we replace them with 1
# pricing[cols] > 1
# pricing[pricing[cols] > 1]
# pricing.transpose()
Out[191]:
In [74]:
dd = {}
for col in item_lists:
    if "error" not in col:
        dd[col + "_price"] = np.max(pricing[col] * pricing["checkout_amount"])
dd
Out[74]:
In [30]:
# per-domain daily sales of the single-item transactions (index assumed to be domain_name)
pd.pivot_table(pricing_temp, values="checkout_amount", index="domain_name", columns="transaction_date")
Out[30]:
In [38]:
pricing.transpose()
Out[38]:
In [18]:
pd.pivot_table(fr, values="checkout_amount", index="transaction_date", columns="domain_name",
aggfunc=[np.sum], margins=True)
In [20]:
# print(df.loc[df['B'].isin(['one','three'])])
for col in dfr.columns.tolist():
    dfr[col]
In [21]:
two = pd.DataFrame({'one': pd.Series([10, 0, 10, 9], index=['a', 'b', 'c', 'd']),
                    'two': pd.Series([0, 0, 10., 4.6], index=['a', 'b', 'c', 'd']),
                    'three': pd.Series([5, -1, 7, -1], index=['a', 'b', 'c', 'd'])})
In [22]:
two
Out[22]:
In [26]:
dfr[dfr.astype(bool).sum(axis=1) == 1]
Out[26]:
In [25]:
gr = fr[item_lists + ["transaction_date", "domain_name", "checkout_amount"]]
In [26]:
frame = fr[item_lists + ["transaction_date", "domain_name", "checkout_amount"]]
gr = frame.groupby(["transaction_date", "domain_name"])
In [33]:
gb_frames = []
for name, group in gr:
    gb_frames.append({"name": name, "frame": group})
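Materializing every group into a list works for iteration, but a single (transaction_date, domain_name) slice can also be fetched directly; a sketch using the first group's key:
first_key = gb_frames[0]["name"]  # a (transaction_date, domain_name) tuple
gr.get_group(first_key).head()    # same frame as gb_frames[0]["frame"]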
In [35]:
gb_frames[0]["name"][0]
Out[35]:
In [37]:
corr = gb_frames[0]["frame"][item_lists].corr()
ax = sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values, linewidths=.5,
cmap="YlGnBu")
ax = plt.title("Test")
plt.tight_layout()