In [3]:
#!sudo pip install pymc3
import pymc3 as pm
import pandas as pd
import numpy as np
import psycopg2
import os
import json
from tpot import TPOTClassifier
from sklearn.metrics import classification_report
In [4]:
# Redshift connection — credentials come from the environment, never hardcoded.
conn = psycopg2.connect(
    user=os.environ['REDSHIFT_USER'],
    password=os.environ['REDSHIFT_PASS'],
    port=os.environ['REDSHIFT_PORT'],
    host=os.environ['REDSHIFT_HOST'],
    database='tradesy',
)
# Feature sample for the saleability model (capped at 50k rows).
query = """
select
purchase_dummy
,shipping_price_ratio
,asking_price
,price_level
,brand_score
,brand_size
,a_over_b
,favorite_count
,has_blurb
,has_image
,seasonal_component
,description_length
,product_category_accessories
,product_category_shoes
,product_category_bags
,product_category_tops
,product_category_dresses
,product_category_weddings
,product_category_bottoms
,product_category_outerwear
,product_category_jeans
,product_category_activewear
,product_category_suiting
,product_category_swim
from saleability_model_v2
limit 50000
"""
df = pd.read_sql(query, conn)
target = 'purchase_dummy'
# Materialize the feature list explicitly: under Python 3, filter() returns a
# lazy one-shot iterator; a list comprehension behaves identically on 2 and 3.
domain = [col for col in df.columns.values if col != target]
# Cast everything to float so downstream models get a uniform numeric frame.
df = df.astype(float)
In [25]:
# What fraction of purchase_dummy's distinct values are exact integers?
observed_values = pd.Series(df['purchase_dummy'].value_counts().index.values)
is_whole_number = observed_values == observed_values.apply(int)
is_whole_number.value_counts(normalize=True)
Out[25]:
In [45]:
def infer_data_types(df):
    """Heuristically classify each column of ``df`` by statistical type.

    For every column, looks at the number of distinct observed values:
    1 -> Constant, 2 -> Binary (Bernoulli), <10 -> Categorical
    (general-bernoulli); otherwise the column is Count/Integer when more
    than 90% of its distinct values are whole numbers, else Float.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric columns to classify.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with columns 'Type' and 'Dist'.
    """
    # Share of distinct values that must be whole numbers to call a
    # high-cardinality column an integer/count column.
    INTEGER_SHARE_THRESHOLD = 0.9
    data_types = {}
    for col in df.columns.values:
        uniques = pd.Series(df[col].value_counts().index.values)
        n_uniques = uniques.shape[0]
        if n_uniques == 1:
            col_type, col_dist = 'Constant', 'Constant'
        elif n_uniques == 2:
            col_type, col_dist = 'Binary', 'Bernoulli'
        elif n_uniques < 10:
            col_type, col_dist = 'Categorical', 'general-bernoulli'
        else:
            # Guard with isfinite: int(x) raises on NaN/inf, which would
            # crash the whole scan on a single bad value.
            is_integer = uniques.apply(
                lambda x: bool(np.isfinite(x)) and float(x) == int(x))
            # mean() of a boolean series == share of True values, which is
            # what value_counts(normalize=True).loc[True] computed before.
            if is_integer.mean() > INTEGER_SHARE_THRESHOLD:
                col_type, col_dist = 'Count/Integer', 'Unknown'
            else:
                col_type, col_dist = 'Float', 'Unknown'
        data_types[col] = {'Type': col_type, 'Dist': col_dist}
    return pd.DataFrame(data_types).T
In [46]:
# Build the column -> {Type, Dist} classification table for the loaded frame.
data_dict = infer_data_types(df)
In [62]:
def data_dictionary_to_pymc_mixture(data_dict, n_components=1):
    """Generate pymc3 likelihood code lines for a mixture model.

    For every 'Binary' row in ``data_dict`` (as produced by
    ``infer_data_types``), prints and collects a ``pm.Bernoulli`` definition
    string wired to a per-component probability and the mixture indicator.

    Parameters
    ----------
    data_dict : pandas.DataFrame
        Indexed by variable name, with 'Type' and 'Dist' columns. Previously
        this was read from a global while the parameter was silently ignored
        (the caller passes it positionally) — now it is an explicit argument.
    n_components : int
        Number of mixture components (reserved; not yet used in the template).

    Returns
    -------
    list of str
        The generated code lines, in data_dict order.
    """
    definitions = []
    for i in range(len(data_dict)):
        row = data_dict.iloc[i]
        var_name = data_dict.index.values[i]
        Dist, Type = row.loc['Dist'], row.loc['Type']
        if Type == 'Binary':
            # {0} repeats the variable name in all four slots, exactly as the
            # original four-argument format() did.
            definition = ("{0} = pm.Bernoulli('{0}', p_{0}[mixture_component], "
                          "observed = df['{0}'])").format(var_name)
            # print() with one argument is valid on both Python 2 and 3.
            print(definition)
            definitions.append(definition)
    return definitions
In [63]:
# Emit the generated pm.Bernoulli code for every Binary column in data_dict.
data_dictionary_to_pymc_mixture(data_dict)
In [69]:
# np.log1p is the vectorized equivalent of apply(lambda x: np.log(x + 1))
# and avoids a Python-level call per row.
df['log_price'] = np.log1p(df['asking_price'])
In [94]:
# NOTE(review): sklearn.mixture.GMM was deprecated in scikit-learn 0.18 and
# removed in 0.20 — modern code should use sklearn.mixture.GaussianMixture.
from sklearn.mixture import GMM
# Trailing '?' is IPython help syntax — exploratory leftover, not plain Python.
model = GMM?
In [95]:
# Fit a 10-component Gaussian mixture over all (float-cast) feature columns.
# NOTE(review): 10 is a magic number — consider a named constant in a config
# cell, or model selection (e.g. BIC) to choose the component count.
model = GMM(10)
model.fit(df)
Out[95]:
In [103]:
# predict() expects a 2-D array of samples; df.iloc[[0]] keeps a one-row
# DataFrame, whereas df.iloc[0] yields a 1-D Series (rejected by modern
# scikit-learn, deprecation-warned by older versions).
model.predict(df.iloc[[0]])
Out[103]:
In [79]:
n_components = 2
# Two-component mixture: a discrete component indicator plus per-component
# parameters for one Bernoulli feature and a Gaussian over log price.
with pm.Model() as model:
    #w = pm.Beta('prior_mixture' ,1.0, 1.0, )
    # Uniform prior over which mixture component generated the data.
    # NOTE(review): 'component' is a scalar draw, so every row shares the same
    # component; a per-row mixture would need shape=len(df) — confirm intent.
    mixture_component = pm.Categorical('component', np.ones(n_components) / n_components)
    # Per-component success probability for the accessories indicator.
    p_product_category_accessories = pm.Beta('prior_accessories', 1.0, 1.0, shape = n_components)
    # Per-component mean and spread for log asking price.
    asking_price_mean = pm.Normal('ap_mean_prior', 1, 1, shape = n_components)
    asking_price_sd = pm.HalfCauchy('ap_sd_prior', 1, shape = n_components)
    # Observed likelihoods, indexed by the sampled component.
    product_category_accessories = pm.Bernoulli('product_category_accessories', p_product_category_accessories[mixture_component], observed = df['product_category_accessories'])
    asking_price = pm.Normal('asking_price', asking_price_mean[mixture_component], asking_price_sd[mixture_component],observed = df['log_price'])
In [83]:
with model:
    # MAP estimate used as the sampler's starting point.
    start = pm.find_MAP()
    # NOTE(review): released pymc3 NUTS takes `scaling=`, not `state=` —
    # confirm against the installed pymc3 version. NUTS also cannot sample the
    # discrete 'component' variable; pymc3 would normally assign a Metropolis
    # step to it via a compound step method.
    step = pm.NUTS(state=start)
    trace = pm.sample(2000, step, start=start, progressbar=True)
In [86]:
# Names of all variables recorded in the trace.
trace.varnames
Out[86]:
In [92]:
# Trace plot of the per-component log-price means (one line per component).
pd.DataFrame(trace.get_values('ap_mean_prior')).plot()
Out[92]:
In [93]:
# Empirical mean log price by accessories flag — sanity check against the
# component means inferred above.
df.groupby('product_category_accessories')['log_price'].mean()
Out[93]:
In [90]:
# Trace plot of the per-component accessories probabilities.
pd.DataFrame(trace.get_values('prior_accessories')).plot()
Out[90]:
In [72]:
%matplotlib inline
df.log_price.hist(bins = [j for j in range(30)])
Out[72]:
In [68]:
# IPython help lookup — exploratory leftover; remove from the final notebook.
pm.Lognormal?
In [ ]: