To use the Shopsense API, you first need to sign up for an API key at http://shopsense.shopstyle.com/landing.
To understand more about the structure of their data output, take a look at their documentation.
I saved my API key in mykeys.py.
In [1]:
import mykeys
In [2]:
import urllib2
import json
In [3]:
# Base endpoint of the ShopStyle v2 REST API.
url = "http://api.shopstyle.com/api/v2/"
# Query for the men's ties category, authenticated with the key stored in mykeys.py.
ties = "{}products?pid={}&cat=mens-ties&limit=100".format(url, mykeys.apiKey)
jsonResponse = urllib2.urlopen(ties)
try:
    # Parse the JSON payload straight from the response stream.
    data = json.load(jsonResponse)
finally:
    # Release the HTTP connection even when the payload cannot be parsed.
    jsonResponse.close()
We want some of the information that they give us about how much data is available to collect.
In [4]:
# Paging information the API returned alongside the first page of products.
metadata = data['metadata']
total = metadata['total']
limit = metadata['limit']
offset = metadata['offset']
# Number of *full* pages. Floor division keeps this an int even under true
# division (Python 3 or `from __future__ import division`), which range()
# requires further down.
pages = total // limit
print("{} total, {} per page. {} pages to process".format(total, limit, pages))
In [5]:
import pandas as pd

# Frame of the first page only; not used below (presumably kept for
# interactive inspection).
tmp = pd.DataFrame(data['products'])

# One DataFrame per downloaded page, keyed by page number.
dfs = {}
page = 0
fetched = 0  # number of products received so far == offset of the next request
while True:
    allTies = "{}products?pid={}&cat=mens-ties&limit=100&offset={}&sort=popular".format(
        url, mykeys.apiKey, fetched)
    jsonResponse = urllib2.urlopen(allTies)
    try:
        data = json.load(jsonResponse)
    finally:
        # Always release the connection, even if the body is not valid JSON.
        jsonResponse.close()
    products = data['products']
    dfs[page] = pd.DataFrame(products)
    # Advance by what the API actually returned rather than by a hard-coded
    # step, so consecutive pages neither overlap nor leave gaps regardless of
    # the page size the server honours.
    fetched += len(products)
    # Stop when everything has been retrieved, or when the API hands back an
    # empty page (which would otherwise loop forever).
    if not products or fetched >= total:
        break
    page += 1

# Stack all pages into a single frame with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)
In [6]:
# Cleaning records, removing duplicates
df = df.drop_duplicates('id')
# priceLabel arrives as text with a dollar sign. Strip the currency symbol and
# any thousands separators (e.g. "$1,250.00") so the float conversion cannot
# fail on four-digit prices.
priceText = df['priceLabel'].str.replace('$', '').str.replace(',', '')
df['priceLabel'] = priceText.astype(float)
In [7]:
# Display the dtype of every column (shown as the cell's output).
df.dtypes
Out[7]:
In [8]:
#split brand into 2 columns
def breakId(x, y=0):
    """Return the ``"id"`` entry of a brand record.

    x -- the brand value of one product, expected to be a dict-like object.
    y -- fallback returned when ``x`` has no ``"id"`` key or cannot be
         subscripted at all (for example ``None`` or a float NaN).
    """
    try:
        return x["id"]
    except (KeyError, IndexError, TypeError):
        # Only lookup failures fall back to the default; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return y
def breakName(x, y=""):
    """Return the ``"name"`` entry of a brand record.

    x -- the brand value of one product, expected to be a dict-like object.
    y -- fallback returned when ``x`` has no ``"name"`` key or cannot be
         subscripted at all (for example ``None`` or a float NaN).
    """
    try:
        return x["name"]
    except (KeyError, IndexError, TypeError):
        # Only lookup failures fall back to the default; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return y
# Flatten the nested brand record into two plain columns.
brandRecords = df['brand']
df['brandId'] = brandRecords.map(breakId)
df['brandName'] = brandRecords.map(breakName)
In [9]:
def breakCanC(x, y=""):
    """Return the name of the first canonical color of the first color entry.

    x -- the colors value of one product, expected to be a sequence of dicts
         whose ``"canonicalColors"`` entry is itself a sequence of dicts.
    y -- fallback returned when any level of that structure is missing,
         empty, or not subscriptable (for example ``None`` or a float NaN).
    """
    try:
        return x[0]["canonicalColors"][0]["name"]
    except (KeyError, IndexError, TypeError):
        # Only lookup failures fall back to the default; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return y
def breakColorName(x, y=""):
    """Return the ``"name"`` of the first color entry.

    x -- the colors value of one product, expected to be a sequence of dicts.
    y -- fallback returned when the sequence is empty, the entry has no
         ``"name"`` key, or ``x`` is not subscriptable (for example ``None``
         or a float NaN).
    """
    try:
        return x[0]["name"]
    except (KeyError, IndexError, TypeError):
        # Only lookup failures fall back to the default; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return y
def breakColorId(x, y=""):
    """Return the id of the first canonical color of the first color entry.

    x -- the colors value of one product, expected to be a sequence of dicts
         whose ``"canonicalColors"`` entry is itself a sequence of dicts.
    y -- fallback returned when any level of that structure is missing,
         empty, or not subscriptable (for example ``None`` or a float NaN).
    """
    try:
        return x[0]["canonicalColors"][0]["id"]
    except (KeyError, IndexError, TypeError):
        # Only lookup failures fall back to the default; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return y
# Flatten the nested color records into three plain columns.
colorRecords = df['colors']
df['colorId'] = colorRecords.map(breakColorId)
df['colorFamily'] = colorRecords.map(breakCanC)
df['colorNamed'] = colorRecords.map(breakColorName)
In [10]:
# Preview the first rows to check the newly derived brand and color columns.
df.head()
Out[10]:
I'm going to save the pertinent columns to a tab-separated values (.tsv) file to make it easier to work with the data locally. It'll be quicker than waiting for the connection to the API.
In [11]:
# Only the flattened, analysis-ready columns are written; the nested
# brand/colors records are left out of the export.
exportColumns = [
    'id', 'priceLabel', 'name',
    'brandId', 'brandName',
    'colorId', 'colorFamily', 'colorNamed',
]
df.to_csv("tieColors_cleaned.txt", sep='\t', encoding='utf-8', columns=exportColumns)
In [13]:
import pandas
def openWithPandas(filename):
    """Load the tab-separated file at ``filename`` and return it as a DataFrame."""
    return pandas.read_table(filename, sep='\t')
In [14]:
# From here on, work from the local export instead of the API response.
df = openWithPandas('tieColors_cleaned.txt')
# Display the column dtypes of the reloaded frame.
df.dtypes
Out[14]:
In [16]:
# Descriptive statistics of the price, one group per color family.
bycolor = df.groupby('colorFamily')
pricesByColor = bycolor['priceLabel']
byColorSummary = pricesByColor.describe()
byColorSummary
Out[16]:
In [17]:
%pylab inline
bycolor = df.groupby('colorFamily')
p1 = bycolor['priceLabel'].mean().order()
plot1 = p1.plot(kind='bar', figsize=(20, 10), title="Average Prices By Color of Ties", color='grey')
plot1.set_ylabel("Average Price ($)")
plot1.set_xlabel("Tie Color")
plt.savefig('colors.png', bbox_inches='tight')
In [18]:
# Display the sorted per-color mean prices that back the chart.
p1
Out[18]:
In [19]:
# Maximum and mean price per color family, ordered by the mean (ascending).
# sort_values() replaces DataFrame.sort(), which has been removed from pandas.
p2 = bycolor['priceLabel'].agg([np.max, np.mean]).sort_values('mean')
p2
Out[19]:
In [20]:
# Bar chart of the per-color price statistics currently held in p2.
plot2 = p2.plot(kind='bar', figsize=(20, 10), title="Price info By Color of Ties")
plot2.set_xlabel("Tie Color")
plot2.set_ylabel("Price ($)")
plt.savefig('color-stats.png')
In [21]:
# Count, mean, standard deviation, min and max of the price per color family,
# ordered by how many ties each family has (ascending).
# sort_values() replaces DataFrame.sort(), which has been removed from pandas.
p2 = bycolor['priceLabel'].agg(['count', np.mean, np.std, np.min, np.max]).sort_values('count')
p2
Out[21]:
In [22]:
# Count, mean, min and max of the price for every brand.
byBrand = df.groupby('brandName')
brandPrices = byBrand['priceLabel']
bb2 = brandPrices.agg(['count', np.mean, np.min, np.max])
In [23]:
# Brands whose mean price is at least $250, highest mean first.
# sort_values() replaces DataFrame.sort(), which has been removed from pandas.
p4 = bb2[bb2['mean'] >= 250].sort_values('mean', ascending=False)
p4
Out[23]:
In [24]:
# Bar chart of every statistic column in p4, one group of bars per brand.
p4.plot(kind='bar', figsize=(12, 10), title="Prices of Luxury Ties")
Out[24]:
In [38]:
# Hand-picked brand ids; presumably those of the high-priced brands found
# above (verify against the brandId column before reusing).
LuxuryBrandIds = [957, 2258, 1452, 3297, 8296, 29961, 14635]
# Keep only the ties whose brand is in that list.
isLuxuryBrand = df['brandId'].isin(LuxuryBrandIds)
luxuryBrandTies = df[isLuxuryBrand]
luxuryBrandTies
Out[38]:
In [44]:
# Count, mean, min and max of the price per color family, restricted to the
# ties selected into luxuryBrandTies.
luxuryColors = luxuryBrandTies.groupby('colorFamily')
luxuryPrices = luxuryColors['priceLabel']
lc2 = luxuryPrices.agg(['count', np.mean, np.min, np.max])
lc2
Out[44]:
In [45]:
# Bar chart of every statistic column in lc2, one group of bars per color family.
lc2.plot(kind='bar', figsize=(12, 10), title="Prices")
Out[45]:
In [ ]: