In [1]:
import pandas as pd
In [2]:
import numpy as np
In [3]:
import matplotlib.pyplot as plt
In [4]:
%matplotlib inline
In [5]:
prod_df = pd.read_csv('/home/saisons/Code/zazzle-product-analysis/inputs/cl_pr_lst.csv',
dtype={'category_id': np.str, 'product_id': np.str, 'num_of_products': np.int},
keep_default_na=False)
prod_df.head()
Out[5]:
In [6]:
cats_df = pd.read_csv('/home/saisons/Code/zazzle-product-analysis/inputs/cl_cat_lst.csv',
dtype={'category_id': np.str, 'product_id': np.str},
keep_default_na=False)
cats_df.head()
Out[6]:
In [7]:
sale_df = pd.read_csv('/home/saisons/Code/zazzle-product-analysis/inputs/cur_zsales.csv',
dtype={'category_id': np.str, 'product_id': np.str, 'num_of_products': np.int},#, 'royalty_usd': np.float},
keep_default_na=False)
sale_df.head()
Out[7]:
In [8]:
saletotesprod_df = pd.read_csv('/home/saisons/Code/zazzle-product-analysis/inputs/zsales_totes.csv',
dtype={'category_id': np.str, 'product_id': np.str},
keep_default_na=False)
saletotesprod_df.head()
Out[8]:
prod_df_example = prod_df.head(0)
cats_df_example = cats_df.head(0)
sale_df_example = sale_df.head(0)
saletotesprod_df_example = saletotesprod_df.head(0)
prod_df_example.to_csv("/home/saisons/Code/zazzle-product-analysis/inputs/prod_df_example.csv")
cats_df_example.to_csv("/home/saisons/Code/zazzle-product-analysis/inputs/cats_df_example.csv")
sale_df_example.to_csv("/home/saisons/Code/zazzle-product-analysis/inputs/sale_df_example.csv")
saletotesprod_df_example.to_csv("/home/saisons/Code/zazzle-product-analysis/inputs/saletotes_prod_df_example.csv")
top_30 = pd.read_csv('/home/saisons/Downloads/top_sellers30_srtd.csv', dtype={'category_id': np.str})
In [9]:
prod_df.info()
In [10]:
cats_df.info()
In [11]:
sale_df.info()
In [12]:
saletotesprod_df.info()
top_30.info()
this line not needed, was used to confirm category ID data fix cats_df[cats_df.category_id == '196094447518203739']
In [13]:
prodjn1 = pd.merge(prod_df, cats_df, on='category_id', left_index=True, how='outer')
prodjn1.head()
Out[13]:
In [14]:
prodjn1.info()
In [15]:
prodjn1.drop(['store_y'],inplace=True,axis=1)
prodjn1.drop(['category_string_y'],inplace=True,axis=1)
prodjn1.drop(['category_1_y'],inplace=True,axis=1)
prodjn1.drop(['category_2_y'],inplace=True,axis=1)
prodjn1.drop(['category_3_y'],inplace=True,axis=1)
prodjn1.drop(['category_4_y'],inplace=True,axis=1)
prodjn1.head()
Out[15]:
In [16]:
prodjn1.info()
In [17]:
prodjn2 = pd.merge(prodjn1, saletotesprod_df, on='product_id', left_index=True, how='outer')
prodjn2.head()
Out[17]:
In [18]:
prodjn2.info()
In [19]:
prod_info1 = prodjn2.groupby(
["product_id", "category_id"]
).num_sales.sum().reset_index().sort_values("num_sales", ascending=False)
prod_info1.head()
Out[19]:
In [20]:
prod_info2 = prodjn2.groupby(
["product_id", "category_id"]
).quant_sold.sum().reset_index().sort_values("quant_sold", ascending=False)
prod_info2.head()
Out[20]:
In [21]:
prod_info3 = prodjn2.groupby(
["product_id", "category_id"]
).royal_total.sum().reset_index().sort_values("royal_total", ascending=False)
prod_info3.head()
Out[21]:
In [22]:
prod_info1.info()
In [23]:
prod_info2.info()
In [24]:
prod_info3.info()
prod_info1_fix = prod_info1.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False) prod_info1_fix.info()
prod_info2_fix = prod_info2.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False) prod_info2_fix.info()
prod_info3_fix = prod_info3.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False) prod_info3_fix.info()
In [25]:
prod_info_jn1 = pd.merge(prod_info1, prod_info2, on='product_id', left_index=True, how='outer')
prod_info_jn1.drop(['category_id_y'],inplace=True,axis=1)
prod_info_jn1.head()
Out[25]:
In [26]:
prod_info_jn1.info()
In [27]:
prod_info_jn2 = pd.merge(prod_info_jn1, prod_info3, on='product_id', left_index=True, how='outer')
prod_info_jn2.drop(['category_id_x'],inplace=True,axis=1)
prod_info_jn2.head()
Out[27]:
In [28]:
prod_info_jn2.info()
In [29]:
cat_info_1 = prodjn2.groupby(
["category_id"]
).royal_total.sum().reset_index().sort_values("royal_total", ascending=False)
cat_info_1.head()
Out[29]:
In [30]:
cat_info_1.info()
In [31]:
cat_info_2 = prodjn2.groupby(
["category_id"]
).num_sales.sum().reset_index().sort_values("num_sales", ascending=False)
cat_info_2.head()
Out[31]:
In [32]:
cat_info_2.info()
In [33]:
cat_info_3 = prodjn2.groupby(
["category_id"]
).quant_sold.sum().reset_index().sort_values("quant_sold", ascending=False)
cat_info_3.head()
Out[33]:
In [34]:
cat_info_3.info()
In [35]:
cat_info_jn1 = pd.merge(cat_info_1, cat_info_2, on='category_id', left_index=True, how='outer')
#prod_info_jn2.drop(['category_id_x'],inplace=True,axis=1)
cat_info_jn1.head()
Out[35]:
In [36]:
cat_info_jn1.info()
In [37]:
cat_info_jn2 = pd.merge(cat_info_jn1, cat_info_3, on='category_id', left_index=True, how='outer')
#prod_info_jn2.drop(['category_id_x'],inplace=True,axis=1)
cat_info_jn2.head()
Out[37]:
In [38]:
cat_info_jn2.info()
In [39]:
cat_info_jn3 = pd.merge(cats_df, cat_info_jn2, on='category_id', left_index=True, how='outer')
#prod_info_jn2.drop(['category_id_x'],inplace=True,axis=1)
cat_info_jn3.head()
Out[39]:
In [40]:
cat_info_jn3.info()
In [41]:
cat_info_jn3_srt = cat_info_jn3.sort_values("quant_sold", ascending=False)
# cat_info_jn3_srt.drop([1,inplace=True,axis=1)
# cat_info_jn3_srt.drop(['index'],inplace=True,axis=1)
cat_info_jn3.head()
Out[41]:
In [42]:
cat_info_jn3_srt.info()
In [43]:
cat_info_jn3.to_csv("/home/saisons/Code/zazzle-product-analysis/outputs/cat_info_jn3.csv")
In [44]:
prodjn2.to_csv("/home/saisons/Code/zazzle-product-analysis/outputs/final_output_products.csv")
In [45]:
cat_list = ['196717574142457136', '196739766766069967', '196990207237662364', '196959432771907138', '196189645389229864']
In [46]:
five_cats_list = prodjn2[prodjn2['category_id'].isin(cat_list)]
five_cats_list.head(5)
Out[46]:
In [47]:
five_cats_list.info()
In [48]:
five_cats_list.to_csv("/home/saisons/Code/zazzle-product-analysis/outputs/five_cats_prods_list.csv")
In [51]:
full_prod_list = prodjn2.groupby(
["final_product_type"]
).quant_sold.sum().reset_index().sort_values("quant_sold", ascending=False)
full_prod_list.head()
Out[51]:
In [52]:
full_prod_list.info()
In [53]:
full_prod_list.to_csv("/home/saisons/Code/zazzle-product-analysis/outputs/full_prod_list.csv")
In [ ]:
prodjnfix = pd.merge(cats_df, prod_df, on="category_id", left_index=True, right_index=True, how='outer')
prodjnfix.head()
top_sellers = prodjn2.groupby( ["category_id", "num_of_products"] ).royal_total.sum().reset_index().sort_values("royal_total", ascending=False).head(50)
top_sellers.head(5).reset_index()
merged1 = pd.merge(prod_df, cats_df, on="category_id") merged1 = pd.merge(prod_df, cats_df[['#_of_products']], left_index=True, right_index=True, how='outer')
chart3 = top_sellers.head(5).plot(x="category_id", y="royal_total")
top_sellers_50 = top_sellers.head(50)
top_sellers_50.reset_index()
top_50_stats = pd.merge(top_sellers_50, cats_df, on='category_id', right_index=True, how='outer')
top_50_stats.head(5)
top_50_stats.info()
top_50_stats.drop(['num_of_products_y'],inplace=True,axis=1)
top_50_stats.info()
top_50_stats.sort_values("num_of_products_x", ascending=False)
top_50_stats.to_csv("top_50_stats.csv")
weight = top_50_stats.assign(weighted_value = lambda x: x.royal_total/x.num_of_products_x)
top_30_designs = pd.merge(top_30, cats_df, on='category_id', left_index=True, right_index=True, how='outer')
top_30_designs.info()
top_30_designs.head(5)
top_30_designs.info()
top_30_designs.head()
weight.sort_values("Weighted_Value", ascending=False).head(5)
weight.to_csv("weighted_prods.csv")
fig = plt.figure() # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.
width = 0.4
top_sellers.head(5).Num_of_products.plot(kind='line', color='red', ax=ax, width=width, position=1) top_sellers.head(5).royal_total.plot(kind='line', color='blue', ax=ax2, width=width, position=0)
ax.set_ylabel('Num_of_Products') ax2.set_ylabel('royal_total')
plt.show()
top_sellers.head(5).to_csv("top_supporters5.csv")
fig, ax1 = plt.subplots() x = ()"category_id') ax1.plot('category_id') ax1.set_xlabel('time (s)')
ax1.set_ylabel('exp', color='b') ax1.tick_params('y', colors='b')
ax2 = ax1.twinx() s2 = np.sin(2 np.pi t) ax2.plot(t, s2, 'r.') ax2.set_ylabel('sin', color='r') ax2.tick_params('y', colors='r')
fig.tight_layout() plt.show()
chart3 = top_sellers.groupby("category_id").head(5) x=top_sellers.grouby("category_id").head(5) fig,ax=plt.subplots() ax2=ax.twinx() x.plot(ax=ax) y.plot(ax=ax2,style='g:') plt.show()
merged2.groupby("category_id").royal_total.sum().reset_index().sort_values("royal_total", ascending=False)
Below line # out as only needed to verify data integrity of column delete
del merged1['category_string_y'] del merged1['store_y'] del merged1['category_1_y'] del merged1['category_2_y'] del merged1['category_3_y'] del merged1['category_4_y']
merged3 = pd.merge(cats_df, saletotesprod_df, on="product_id")
merged2.sort_values("royal_total", ascending=False)
def combine_names(row): if row.true_final.startswith('Business'): return 'Business Card' if row.true_final.startswith('Coaster'): return 'Coaster' if row.true_final.startswith('Coaster'): return 'Coaster' return row.true_final return row.true_final
def combine_names1(row): if row.true_final.startswith('Coaster'): return 'Coaster' return row.true_final
merged2.apply(combine_names, axis=1)
merged2.apply(combine_names1, axis=1)
~~ Need to Create:~~
Separate Table of Category Data Totals from Pivot Table of Sale Info
print(merged2).reset_index()
merged1.to_csv('merged1.csv')
merged2.to_csv('merged2.csv')
merged3
merged2.info()
prod_df.info()
merged1.info()