In [75]:
import numpy as np
import pandas as pd
import yaml
import seaborn
import pylab as pl
%pylab inline
In [80]:
# Read the category names to keep: one name per line of the text file.
with open("./cat_food.txt", 'r') as category_file:
    food = category_file.read().splitlines()

# A hand-written list would also work here, e.g.:
# target_cat = ['Restaurants', 'Food'] # to be continued...
target_cat = food
In [81]:
# Load the business table and keep only the rows whose categories share at
# least one entry with target_cat.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that were produced by a trusted source.
df = pd.read_pickle("./UC01_df_uc_open.p")
print(df.shape)  # shape before filtering

# Build the lookup set once instead of once per row inside the lambda.
target_cat_set = set(target_cat)
df = df[df.categories.apply(lambda cats: not target_cat_set.isdisjoint(cats))]
print(df.shape)  # shape after filtering
For now, start from here...
In [83]:
# Re-index the business table by business_id and parse the attributes column.
# (Reload from disk if needed:)
# df = pd.read_pickle("./UC01_df_uc_open.p")
df.index = df.business_id.values
att = df.attributes

# Each attributes entry is joined with commas and wrapped in [...] so that it
# parses as a single YAML flow sequence.
# presumably every entry is a list of "key: value" strings, giving a list of
# one-item dicts per business -- TODO confirm against the data.
#
# yaml.safe_load is used instead of a bare yaml.load(...): calling yaml.load
# without an explicit Loader is deprecated since PyYAML 5.1, raises a
# TypeError on PyYAML 6, and the default loader of older releases can
# construct arbitrary Python objects from the input text.
a = att.apply(lambda entry: yaml.safe_load('[' + ','.join(entry) + ']'))
In [84]:
# Collect the name of every attribute that appears for any business.
# A plain attribute contributes its own name; an attribute whose value is a
# dict contributes one '<attribute>_<sub-attribute>' name per sub-key.
att_all = set()
for parsed_row in a:
    for entry in parsed_row:
        # Only the first key/value pair of each entry is looked at.
        name, value = list(entry.items())[0]
        if isinstance(value, dict):
            att_all.update(name + '_' + sub_name for sub_name in value)
        else:
            att_all.add(name)
len(att_all)
Out[84]:
In [86]:
# Build the full attribute table:
#   index   = business_id (taken from df.index)
#   columns = every name in att_all, in sorted order
#   cells   = the attribute value, NaN where a business lacks the attribute
def _flatten_attributes(entries):
    """Flatten one business's parsed attribute entries into a single dict.

    entries: iterable of dicts; only the first key/value pair of each dict
        is used. A non-dict value is stored under the key itself; a dict
        value is expanded into one '<key>_<sub-key>' item per sub-key.

    Returns a {column_name: value} dict. An empty `entries` gives {}.
    """
    flat = {}
    for entry in entries:
        name, value = list(entry.items())[0]
        if isinstance(value, dict):
            for sub_name, sub_value in value.items():
                flat[name + '_' + sub_name] = sub_value
        else:
            flat[name] = value
    return flat


# The frame is built in one go from per-business dicts rather than written
# cell by cell through .loc, which is far slower on large tables. Rows are
# matched to df.index by position, so repeated business_ids cannot turn a
# row lookup into a multi-row result.
# columns is a sorted list rather than the raw set: a set has no defined
# order, and newer pandas releases refuse a set for `columns`.
# dtype=object keeps the parsed values as they are instead of letting pandas
# infer a numeric dtype per column.
tab = pd.DataFrame(
    [_flatten_attributes(entries) for entries in a],
    index=df.index,
    columns=sorted(att_all),
    dtype=object,
)
In [88]:
# Number of businesses (rows) in the attribute table.
print(tab.shape[0])
# The 20 attributes that are filled in for the most businesses.
# head(20) is used because a positional slice starting at 1 would silently
# drop the top-ranked attribute from the listing.
tab.count(axis=0).sort_values(ascending=False).head(20)
# 729 * 50% = 360 -> 3 attributes -> RestaurantsPriceRange2 / BusinessParking / BikeParking
Out[88]:
In [89]:
# Number of attribute columns in the table.
print(tab.shape[1])
# The 20 businesses with the most attributes filled in.
# head(20) is used because a positional slice starting at 1 would silently
# drop the top-ranked business from the listing.
tab.count(axis=1).sort_values(ascending=False).head(20)
Out[89]:
In [90]:
# Horizontal violin plot of how many non-null attributes each business has.
pl.violinplot(
    tab.count(axis=1).values,
    vert=False,
)
Out[90]:
In [91]:
# Put the attribute columns in alphabetical order.
# sort_index(axis=1) moves each column's data together with its label.
# Assigning a sorted label list to tab.columns would only rename the columns
# in place, leaving every sorted name attached to some other column's data.
tab = tab.sort_index(axis=1)
# Shapes of both tables before joining.
print("%s %s" % (df.shape, tab.shape))
# Join the two tables; both are indexed by business_id and join() aligns
# rows on the index.
df_with_attribute = df.join(tab)
In [94]:
# Write the joined business + attribute table to disk as a pickle.
df_with_attribute.to_pickle(path="./UC02_df_uc_food_att.p")
Label:
Filter:
Transform:
Useful Attribute:
Not Useful:
Idea from last discussion: