In [75]:

    
import numpy as np
import pandas as pd
import yaml
import seaborn
import pylab as pl
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib

Part 1 - Filter Categories



In [80]:

    
with open("./cat_food.txt", 'r') as fi:
    food = fi.read().splitlines()       
# Example: 
# target_cat = ['Restaurants', 'Food'] # to be continued...
target_cat = food



In [81]:

    
df = pd.read_pickle("./UC01_df_uc_open.p")
print df.shape
df = df[df.categories.apply(lambda x: not set(x).isdisjoint(set(target_cat)))]
print df.shape









    



(729, 16)
(334, 16)

For now, start from here...

Part 2 - Build Attribute Table



In [83]:

    
# load data
# df = pd.read_pickle("./UC01_df_uc_open.p") 

df.index = df.business_id.values
att = df.attributes

# extract attributes
a = att.apply(lambda x: yaml.load('['+','.join(x)+']'))



In [84]:

    
# find full-size attribute set
# if subattribute exists, use '_' to connect them.
att_all = set()
for row in a:
    for i in row:
        if not isinstance(i.values()[0], dict):
            att_all.add(i.keys()[0])
        else:
            prefix = i.keys()[0]
            for k in i.values()[0].iterkeys():
                suffix = k
                temp = prefix + '_' + suffix
                att_all.add(temp)
len(att_all)









    Out[84]:





73



In [86]:

    
# create full-size attribute table
# index = business_id
# col = att_all
tab = pd.DataFrame(columns=att_all, index=df.index)

for ind in tab.index:
    for j in a[ind]:
        if not isinstance(j.values()[0], dict):
            tab.loc[ind, j.keys()[0]] = j.values()[0]
        else:
            prefix = j.keys()[0]
            for k, v in j.values()[0].iteritems():
                suffix = k
                temp = prefix + '_' + suffix
                tab.loc[ind, temp] = v

Part 3 - Missing Values

(missing, not False)



In [88]:

    
print tab.shape[0]
tab.count(axis=0).sort_values(ascending=False)[1:20]
# 729 * 50% = 360 -> 3 attributes -> RestaurantsPriceRange2 / BusinessParking / BikeParking









    



334






    Out[88]:





RestaurantsPriceRange2       330
BusinessParking_lot          304
BusinessParking_valet        304
BusinessParking_validated    304
BusinessParking_garage       304
BusinessParking_street       304
BikeParking                  296
RestaurantsTakeOut           292
OutdoorSeating               271
RestaurantsGoodForGroups     261
RestaurantsReservations      255
Alcohol                      254
RestaurantsDelivery          253
HasTV                        250
Ambience_touristy            248
Ambience_trendy              248
Ambience_upscale             248
Ambience_casual              248
Ambience_hipster             248
dtype: int64



In [89]:

    
print tab.shape[1]
tab.count(axis=1).sort_values(ascending=False)[1:20]









    



73






    Out[89]:





L2c-qKZWumCmOCR-dqBLrg    58
jeTfL2kCyBtmFGSrSQHqVw    58
UUsS7SrTQQ2AvvpslfrvFA    57
-Jhlh8Scjy669NdtCfKSSg    57
D6SCh4BwNb52wZIqXdS4JQ    57
ODURDhURWmZr6b--O4nt3w    56
7B-suS0RHxqxZBBfVfiPLA    56
dn9lwYUxmhs_mLKPu7L25Q    56
S_9OaE_RBWBIXnUKvxJKsQ    56
iZleRh4SOtf4GquA5L8WXw    56
uvVOuCbo4_5z_J5WJVG4QQ    56
RQxLNvAra_MUUPHdhukilA    55
F1qPjasn0R6-j8sa6iYNmA    55
vqEtZ7CtNB9G8WMzjwJA2g    55
JbnNk6do3aHF3Kr1bX4HzQ    55
mklo4H47YWK7f18YXTCEoA    55
HbK6IfznbVToEaKVC0WcSg    55
_TWp4gqGF7tQUGloTClcNg    55
PmVxbit6HDDsEUS-j9aDfg    54
dtype: int64



In [90]:

    
pl.violinplot(tab.count(axis=1).values,vert=False)









    Out[90]:





{u'bodies': [<matplotlib.collections.PolyCollection at 0x1102a1590>],
 u'cbars': <matplotlib.collections.LineCollection at 0x11107fe50>,
 u'cmaxes': <matplotlib.collections.LineCollection at 0x11037e350>,
 u'cmins': <matplotlib.collections.LineCollection at 0x11005b7d0>}

Part 4 - Join & output



In [91]:

    
# sort column by alphabeta
tab.columns = tab.columns.sort_values()
# shape
print df.shape, tab.shape
# join two table
df_with_attribute = df.join(tab)









    



(334, 16) (334, 73)



In [94]:

    
df_with_attribute.to_pickle("./UC02_df_uc_food_att.p")

Summary

Label:

stars

Filter:

city - u & c
is_open == 1
categories <- target category list

Transform:

attributes

Useful Attribute:

From attributes
review_count
hours - not preprocess
lat/long, address, postal_code
(maybe) name - text?

Not Useful:

state - all identical
type - all identical
neighborhood - all nan

Idea from last discussion:

make full use of remaining categories