To-Do:

category [list]
attribute
- transform 'attribute' to col
- threshold (value/proportion) -> missing value
hours



In [1]:

    
import seaborn
import pandas as pd
import pylab as pl
import yaml
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
df = pd.read_pickle("../yelp-challenge/data_urbana_champaign/business_urbana_champaign.p")
df.reset_index(drop=True, inplace=True)
print df.shape
print df.columns.values
len(df.business_id.unique())









    



(1556, 16)
[u'address' u'attributes' u'business_id' u'categories' u'city' u'hours'
 u'is_open' u'latitude' u'longitude' u'name' u'neighborhood' u'postal_code'
 u'review_count' u'stars' u'state' u'type']






    Out[2]:





1556

geo: u'address', u'latitude', u'longitude'

No None geo-data
No None name



In [3]:

    
df.head(2)









    Out[3]:






  
    
      
      address
      attributes
      business_id
      categories
      city
      hours
      is_open
      latitude
      longitude
      name
      neighborhood
      postal_code
      review_count
      stars
      state
      type
    
  
  
    
      0
      2501 Fields S Dr
      [BusinessAcceptsCreditCards: True]
      J3EckozG83hEvm267MOeHQ
      [Community Service/Non-Profit, Local Services]
      Champaign
      [Monday 5:0-22:0, Tuesday 5:0-22:0, Wednesday ...
      1
      40.081839
      -88.308425
      YMCA
      
      61820
      7
      3.5
      IL
      business
    
    
      1
      44 E Main St
      None
      3dzPclva9uHp-22TbheqlA
      [Trainers, Pilates, Active Life, Fitness & Ins...
      Champaign
      None
      1
      40.117598
      -88.241400
      Dulak Pilates Center
      
      61820
      4
      5.0
      IL
      business

1. 'city'



In [4]:

    
print df.city.unique(), '\n', len(df.city.unique())









    



[u'Champaign' u'Tuscola' u'Urbana' u'Monticello' u'Savoy' u'Mahomet'
 u'St Joseph' u'Rantoul' u'Philo' u'Tolono' u'Villa Grove' u'Broadlands'
 u'Saint Joseph' u'Gifford' u'Dewey' u'Mansfield' u'Ogden' u'Fithian'
 u'Elk Grove Villa' u'Sidney' u'Homer' u'St. Joseph'] 
22



In [28]:

    
city_count = df.city.groupby(df.city).apply(lambda x:x.count())
uc_sum = city_count[city_count.keys().isin(['Champaign', 'Urbana'])].sum()
100.0 * uc_sum / city_count.sum()









    Out[28]:





87.01799485861183

we only consider Champaign and Urbana as our target in this dataset



In [3]:

    
df_uc = df[df.city.isin(['Champaign', 'Urbana'])]
df_uc.shape









    Out[3]:





(1354, 16)

2. 'is_open'



In [33]:

    
print "{}% ({}/{}) business_id(s) are open".format(100.0 * sum(df_uc.is_open == 1) / len(df_uc), sum(df_uc.is_open == 1), len(df_uc))









    



82.0531757755% (1111/1354) business_id(s) are open

we only consider those with is_open == 1



In [11]:

    
df_uc_open = df_uc[df_uc.is_open == 1]
df_uc_open.reset_index(drop=True, inplace=True)
df_uc_open.shape









    Out[11]:





(1111, 16)

3. 'Hour'



In [112]:

    
100.0 * df_uc_open.hours.dropna().shape[0] / df_uc_open.shape[0]









    Out[112]:





73.26732673267327

Drop all records with missing 'Hour'



In [12]:

    
df_uc_open = df_uc_open.dropna(subset=['hours'])
df_uc_open.reset_index(drop=True, inplace=True)
df_uc_open.shape









    Out[12]:





(814, 16)

4. 'postal_code'



In [116]:

    
len(df_uc_open.postal_code.unique())









    Out[116]:





8

5. 'review_count'



In [117]:

    
df.review_count.describe()









    Out[117]:





count    1556.000000
mean       19.199229
std        37.116487
min         3.000000
25%         4.000000
50%         8.000000
75%        19.000000
max       697.000000
Name: review_count, dtype: float64



In [118]:

    
df_uc_open.review_count.describe()









    Out[118]:





count    814.000000
mean      26.356265
std       48.775645
min        3.000000
25%        5.000000
50%       10.000000
75%       27.000000
max      697.000000
Name: review_count, dtype: float64



In [119]:

    
df_uc_open.ix[df_uc_open.review_count.argmax()]









    Out[119]:





address                                        201 N Broadway Ave
attributes      [Alcohol: full_bar, Ambience: {'romantic': Fal...
business_id                                9MnbQg7kfb_WgxoV0hXKSQ
categories                                [Barbeque, Restaurants]
city                                                       Urbana
hours           [Monday 11:0-22:0, Tuesday 11:0-22:0, Wednesda...
is_open                                                         1
latitude                                                  40.1138
longitude                                                -88.2077
name                                  Black Dog Smoke & Ale House
neighborhood                                                     
postal_code                                                 61801
review_count                                                  697
stars                                                         4.5
state                                                          IL
type                                                     business
Name: 601, dtype: object

6. 'categories'



In [121]:

    
cat = {}
none_num = 0
for i in df_uc_open.categories:
    if not i:
        none_num += 1
        continue
    for j in i:
        cat[j] = cat.get(j,0) + 1



In [122]:

    
none_num









    Out[122]:





0



In [123]:

    
sorted(cat.iteritems(), key=lambda x: x[1], reverse=True)









    Out[123]:





[(u'Restaurants', 256),
 (u'Shopping', 177),
 (u'Food', 125),
 (u'Home Services', 98),
 (u'Beauty & Spas', 87),
 (u'Local Services', 72),
 (u'Automotive', 68),
 (u'Nightlife', 63),
 (u'Event Planning & Services', 55),
 (u'Home & Garden', 54),
 (u'Bars', 54),
 (u'Auto Repair', 50),
 (u'Fashion', 40),
 (u'American (Traditional)', 38),
 (u'Hotels & Travel', 36),
 (u'Real Estate', 35),
 (u'Hair Salons', 35),
 (u'Sandwiches', 34),
 (u'Fast Food', 34),
 (u'Health & Medical', 33),
 (u'Chinese', 29),
 (u'Pizza', 29),
 (u'Mexican', 28),
 (u'Apartments', 27),
 (u'Active Life', 26),
 (u'American (New)', 25),
 (u'Nail Salons', 22),
 (u'Coffee & Tea', 22),
 (u'Contractors', 22),
 (u'Furniture Stores', 22),
 (u'Breakfast & Brunch', 22),
 (u'Pets', 22),
 (u'Bakeries', 22),
 (u'Specialty Food', 21),
 (u'Home Decor', 20),
 (u'Tires', 20),
 (u'Burgers', 19),
 (u'Hotels', 19),
 (u'Grocery', 18),
 (u'Pet Services', 18),
 (u'Auto Parts & Supplies', 17),
 (u'Professional Services', 17),
 (u'Arts & Entertainment', 17),
 (u'Sporting Goods', 17),
 (u'Italian', 17),
 (u'Hair Removal', 16),
 (u'Delis', 16),
 (u'Massage', 15),
 (u'Oil Change Stations', 15),
 (u'Ice Cream & Frozen Yogurt', 15),
 (u'Barbers', 15),
 (u'Venues & Event Spaces', 14),
 (u'Department Stores', 14),
 (u'Property Management', 14),
 (u'Building Supplies', 13),
 (u'Appliances', 13),
 (u'Cosmetics & Beauty Supply', 13),
 (u'Skin Care', 13),
 (u'Hair Stylists', 13),
 (u'Salad', 13),
 (u'Pet Boarding/Pet Sitting', 12),
 (u'Chicken Wings', 12),
 (u'Mattresses', 12),
 (u'Arts & Crafts', 12),
 (u'Flowers & Gifts', 12),
 (u'Mags', 12),
 (u'Desserts', 12),
 (u'Music & Video', 12),
 (u'Books', 12),
 (u'Fitness & Instruction', 11),
 (u'Transportation', 11),
 (u'Day Spas', 11),
 (u'Caterers', 11),
 (u'Wedding Planning', 11),
 (u'Veterinarians', 11),
 (u'Drugstores', 10),
 (u'Hardware Stores', 10),
 (u'Barbeque', 10),
 (u'Waxing', 10),
 (u'Sushi Bars', 10),
 (u'University Housing', 10),
 (u'Thai', 10),
 (u'Pet Groomers', 10),
 (u'Shoe Stores', 10),
 (u'Cafes', 10),
 (u"Women's Clothing", 10),
 (u'Car Dealers', 10),
 (u'Lounges', 9),
 (u'Food Trucks', 9),
 (u'Sports Bars', 9),
 (u'Wine & Spirits', 9),
 (u'Nurseries & Gardening', 9),
 (u'Party & Event Planning', 9),
 (u'Asian Fusion', 9),
 (u'Massage Therapy', 9),
 (u'Beer', 9),
 (u'Florists', 9),
 (u'Gyms', 8),
 (u'Diners', 8),
 (u'Home Cleaning', 8),
 (u'Movers', 8),
 (u"Men's Clothing", 8),
 (u'Auto Detailing', 8),
 (u'Printing Services', 8),
 (u'Heating & Air Conditioning/HVAC', 8),
 (u'Food Delivery Services', 8),
 (u'Towing', 8),
 (u'Steakhouses', 8),
 (u'Dentists', 8),
 (u'Real Estate Services', 8),
 (u'Convenience Stores', 8),
 (u'Pubs', 8),
 (u'Electronics', 8),
 (u'Japanese', 7),
 (u'Optometrists', 7),
 (u'Body Shops', 7),
 (u'Tex-Mex', 7),
 (u'Plumbing', 7),
 (u'IT Services & Computer Repair', 7),
 (u'Airport Shuttles', 7),
 (u'Doctors', 7),
 (u'Pet Stores', 7),
 (u'General Dentistry', 7),
 (u'Photography Stores & Services', 7),
 (u'Dry Cleaning & Laundry', 6),
 (u'Juice Bars & Smoothies', 6),
 (u'Bikes', 6),
 (u'Yoga', 6),
 (u'Eyewear & Opticians', 6),
 (u'Trainers', 6),
 (u'Taxis', 6),
 (u'Ethnic Food', 6),
 (u'Sewing & Alterations', 6),
 (u'Bagels', 6),
 (u'Mobile Phones', 6),
 (u'Thrift Stores', 6),
 (u'Jewelry', 6),
 (u'Sports Wear', 6),
 (u'Korean', 6),
 (u'Financial Services', 6),
 (u'Tanning', 5),
 (u"Men's Hair Salons", 5),
 (u'Hobby Shops', 5),
 (u'Art Galleries', 5),
 (u'Limos', 5),
 (u'Tobacco Shops', 5),
 (u'Music Venues', 5),
 (u'Accessories', 5),
 (u'Car Wash', 5),
 (u'Appliances & Repair', 5),
 (u'Used', 5),
 (u'Indian', 5),
 (u'Bubble Tea', 5),
 (u'Bookstores', 5),
 (u'Self Storage', 5),
 (u'Vintage & Consignment', 5),
 (u'Bridal', 5),
 (u'Baby Gear & Furniture', 5),
 (u'Eyelash Service', 5),
 (u'Roofing', 4),
 (u'Antiques', 4),
 (u'Interior Design', 4),
 (u'Handyman', 4),
 (u'Vegetarian', 4),
 (u'Piercing', 4),
 (u'Seafood', 4),
 (u'Discount Store', 4),
 (u'Music & DVDs', 4),
 (u'Pet Training', 4),
 (u'Oral Surgeons', 4),
 (u'Pool Halls', 4),
 (u'Vape Shops', 4),
 (u'Photographers', 4),
 (u'Tree Services', 4),
 (u'Chocolatiers & Shops', 4),
 (u'Art Supplies', 4),
 (u'Recycling Center', 4),
 (u'Dance Clubs', 4),
 (u'Makeup Artists', 4),
 (u'Mobile Phone Repair', 4),
 (u'Wine Bars', 4),
 (u'Shipping Centers', 4),
 (u'Local Flavor', 4),
 (u'Soup', 4),
 (u'Notaries', 4),
 (u'Jewelry Repair', 4),
 (u'Karaoke', 4),
 (u'Musical Instruments & Teachers', 4),
 (u'Kitchen & Bath', 4),
 (u'Health Markets', 4),
 (u'Cosmetic Dentists', 4),
 (u'Tattoo', 4),
 (u'Electronics Repair', 4),
 (u'Carpet Cleaning', 4),
 (u'Landscaping', 4),
 (u'Real Estate Agents', 4),
 (u'Auto Glass Services', 4),
 (u'Truck Rental', 3),
 (u'Creperies', 3),
 (u'Formal Wear', 3),
 (u'Fabric Stores', 3),
 (u'Outdoor Gear', 3),
 (u'Orthodontists', 3),
 (u'Shoe Repair', 3),
 (u'Cajun/Creole', 3),
 (u'Greek', 3),
 (u'Ophthalmologists', 3),
 (u'Couriers & Delivery Services', 3),
 (u'Office Cleaning', 3),
 (u'Shaved Ice', 3),
 (u'Chiropractors', 3),
 (u'Dive Bars', 3),
 (u'Acupuncture', 3),
 (u'Damage Restoration', 3),
 (u'Szechuan', 3),
 (u'Keys & Locksmiths', 3),
 (u'Recreation Centers', 3),
 (u'Computers', 3),
 (u'Pest Control', 3),
 (u'Comfort Food', 3),
 (u'Hair Extensions', 3),
 (u'Session Photography', 3),
 (u'Mediterranean', 3),
 (u'Education', 3),
 (u'Insurance', 3),
 (u'Gutter Services', 3),
 (u'Internet Service Providers', 3),
 (u'Car Rental', 3),
 (u'Toy Stores', 3),
 (u'Imported Food', 3),
 (u'Banks & Credit Unions', 3),
 (u'Security Systems', 3),
 (u'Vegan', 3),
 (u'Windshield Installation & Repair', 3),
 (u'Snow Removal', 3),
 (u'Cheese Shops', 3),
 (u'Bike Repair/Maintenance', 3),
 (u'Medical Supplies', 3),
 (u'Buffets', 2),
 (u'Auto Insurance', 2),
 (u'Community Service/Non-Profit', 2),
 (u'Herbs & Spices', 2),
 (u'Noodles', 2),
 (u'Watch Repair', 2),
 (u'Empanadas', 2),
 (u'Libraries', 2),
 (u'Taiwanese', 2),
 (u'Vietnamese', 2),
 (u'Video Game Stores', 2),
 (u'Air Duct Cleaning', 2),
 (u'Transmission Repair', 2),
 (u'Blow Dry/Out Services', 2),
 (u'Beer Gardens', 2),
 (u'Hot Dogs', 2),
 (u'Pediatric Dentists', 2),
 (u'Laundry Services', 2),
 (u'Watches', 2),
 (u'Lawyers', 2),
 (u'Tapas Bars', 2),
 (u'Public Transportation', 2),
 (u'Cupcakes', 2),
 (u'Martial Arts', 2),
 (u'Television Service Providers', 2),
 (u'Knitting Supplies', 2),
 (u'Arcades', 2),
 (u'Escape Games', 2),
 (u'Gluten-Free', 2),
 (u'Gelato', 2),
 (u'Motorcycle Dealers', 2),
 (u'Medical Centers', 2),
 (u'Do-It-Yourself Food', 2),
 (u'Junk Removal & Hauling', 2),
 (u'Gas & Service Stations', 2),
 (u'Fish & Chips', 2),
 (u'Farmers Market', 2),
 (u'Beer Bar', 2),
 (u'Tours', 2),
 (u'Commercial Real Estate', 2),
 (u'Guns & Ammo', 2),
 (u'Southern', 2),
 (u'Candy Stores', 2),
 (u'Vinyl Records', 2),
 (u'Donuts', 2),
 (u'Painters', 2),
 (u'Counseling & Mental Health', 2),
 (u'Motorcycle Repair', 2),
 (u'Street Vendors', 2),
 (u'Signmaking', 2),
 (u'Party Supplies', 2),
 (u'Public Services & Government', 2),
 (u'Carpeting', 2),
 (u'Plus Size Fashion', 2),
 (u'Event Photography', 2),
 (u'Adult Entertainment', 2),
 (u'Home & Rental Insurance', 2),
 (u'Framing', 2),
 (u'Graphic Design', 2),
 (u'Parking', 2),
 (u'Videos & Video Game Rental', 2),
 (u'Gardeners', 2),
 (u'Windows Installation', 2),
 (u'Meat Shops', 2),
 (u'Endodontists', 1),
 (u'Organic Stores', 1),
 (u'Architects', 1),
 (u'Bowling', 1),
 (u'Vinyl Siding', 1),
 (u'Vitamins & Supplements', 1),
 (u'Real Estate Law', 1),
 (u'Laser Eye Surgery/Lasik', 1),
 (u'Diagnostic Services', 1),
 (u'Food Stands', 1),
 (u'Shopping Centers', 1),
 (u'Mountain Biking', 1),
 (u'Elementary Schools', 1),
 (u'Animal Shelters', 1),
 (u'Pool & Billiards', 1),
 (u'Zoos', 1),
 (u'Hookah Bars', 1),
 (u'Middle Eastern', 1),
 (u'Trusts', 1),
 (u'Landscape Architects', 1),
 (u'Divorce & Family Law', 1),
 (u'Costumes', 1),
 (u'Souvenir Shops', 1),
 (u'Used Bookstore', 1),
 (u'International Grocery', 1),
 (u'Kosher', 1),
 (u'Climbing', 1),
 (u'Basque', 1),
 (u'Electricians', 1),
 (u'Furniture Reupholstery', 1),
 (u'Web Design', 1),
 (u'Screen Printing', 1),
 (u'Buses', 1),
 (u'Wills', 1),
 (u'Comic Books', 1),
 (u'Dance Studios', 1),
 (u'Shades & Blinds', 1),
 (u'Fitness/Exercise Equipment', 1),
 (u'Cocktail Bars', 1),
 (u'Mortgage Brokers', 1),
 (u'Batting Cages', 1),
 (u'Shutters', 1),
 (u'Window Washing', 1),
 (u'Estate Planning Law', 1),
 (u'Mobile Phone Accessories', 1),
 (u'Dog Walkers', 1),
 (u'CSA', 1),
 (u'Museums', 1),
 (u'Performing Arts', 1),
 (u'Office Equipment', 1),
 (u'Naturopathic/Holistic', 1),
 (u'Smokehouse', 1),
 (u'Garage Door Services', 1),
 (u'& Probates', 1),
 (u'Outlet Stores', 1),
 (u'Trampoline Parks', 1),
 (u'Aquariums', 1),
 (u'Trailer Repair', 1),
 (u'Embroidery & Crochet', 1),
 (u'Jazz & Blues', 1),
 (u'Reflexology', 1),
 (u'Lingerie', 1),
 (u'Criminal Defense Law', 1),
 (u'Motorcycle Gear', 1),
 (u'Health Retreats', 1),
 (u'Newspapers & Magazines', 1),
 (u'Irish', 1),
 (u'Amusement Parks', 1),
 (u'Door Sales/Installation', 1),
 (u'Spray Tanning', 1),
 (u'Specialty Schools', 1),
 (u'Fertility', 1),
 (u'Personal Assistants', 1),
 (u'Personal Chefs', 1),
 (u'Caribbean', 1),
 (u'Roadside Assistance', 1),
 (u'Airports', 1),
 (u"Children's Clothing", 1),
 (u'Utilities', 1),
 (u'Argentine', 1),
 (u'Screen Printing/T-Shirt Printing', 1),
 (u'Middle Schools & High Schools', 1),
 (u'Cinema', 1),
 (u'Gun/Rifle Ranges', 1),
 (u'Gay Bars', 1),
 (u'Religious Organizations', 1),
 (u'Physical Therapy', 1),
 (u'Hot Pot', 1),
 (u'Casinos', 1),
 (u'Cards & Stationery', 1),
 (u'Irrigation', 1),
 (u'Mongolian', 1),
 (u'Life Insurance', 1),
 (u'Carpenters', 1),
 (u'RV Repair', 1),
 (u'Luggage', 1),
 (u'Masonry/Concrete', 1),
 (u'Appraisal Services', 1),
 (u'Chimney Sweeps', 1),
 (u'Wholesale Stores', 1),
 (u'Arabian', 1),
 (u'Churches', 1),
 (u'Wigs', 1),
 (u'Skating Rinks', 1),
 (u'Brewing Supplies', 1),
 (u'Immigration Law', 1),
 (u'Obstetricians & Gynecologists', 1),
 (u'Musical Instrument Services', 1),
 (u'Laboratory Testing', 1),
 (u'Train Stations', 1),
 (u'Bike Rentals', 1),
 (u'Wineries', 1),
 (u'Lighting Fixtures & Equipment', 1),
 (u'Wheel & Rim Repair', 1),
 (u'Unofficial Yelp Events', 1),
 (u'Cabinetry', 1),
 (u'Periodontists', 1),
 (u'Pakistani', 1),
 (u'Piano Stores', 1),
 (u'Mini Golf', 1),
 (u'Car Stereo Installation', 1),
 (u'Breweries', 1),
 (u'Malaysian', 1),
 (u'Custom Cakes', 1),
 (u'DUI Law', 1),
 (u'Golf', 1),
 (u'Seafood Markets', 1),
 (u'Flooring', 1),
 (u'Parks', 1),
 (u'Glass & Mirrors', 1),
 (u'Home Staging', 1),
 (u'Olive Oil', 1),
 (u'Coffee Roasteries', 1),
 (u'Ticket Sales', 1),
 (u'Traditional Chinese Medicine', 1),
 (u'Spanish', 1)]

Here we should set the list of categories we are interested.



In [ ]:

7. 'Neighborhood'



In [124]:

    
df_uc_open.neighborhood.unique()









    Out[124]:





array([u''], dtype=object)

All 'Neighborhood' == None

8. 'Attribute'

First of all, remove records with None 'Attribute'



In [13]:

    
df_uc_open = df_uc_open.dropna(subset=['attributes'])
df_uc_open.reset_index(drop=True, inplace=True)
df_uc_open.shape
# 814 -> ?









    Out[13]:





(729, 16)



In [204]:

    
# df_uc_open.attributes
# import yaml



In [202]:

    
att_count = {}
for i in df_uc_open.attributes:
    data = yaml.load('['+','.join(i)+']')
    for d in data:
        att_count[d.keys()[0]] = att_count.get(d.keys()[0], 0) + 1



In [203]:

    
sorted(att_count.iteritems(), key=lambda x: x[1], reverse=True)









    Out[203]:





[('BusinessAcceptsCreditCards', 690),
 ('RestaurantsPriceRange2', 553),
 ('BusinessParking', 507),
 ('BikeParking', 454),
 ('WheelchairAccessible', 329),
 ('GoodForKids', 300),
 ('RestaurantsTakeOut', 292),
 ('OutdoorSeating', 274),
 ('RestaurantsGoodForGroups', 264),
 ('RestaurantsDelivery', 261),
 ('Alcohol', 257),
 ('RestaurantsReservations', 255),
 ('WiFi', 253),
 ('HasTV', 252),
 ('Ambience', 250),
 ('NoiseLevel', 242),
 ('RestaurantsAttire', 240),
 ('GoodForMeal', 240),
 ('RestaurantsTableService', 223),
 ('Caters', 202),
 ('ByAppointmentOnly', 169),
 ('BusinessAcceptsBitcoin', 68),
 ('GoodForDancing', 57),
 ('DogsAllowed', 56),
 ('CoatCheck', 55),
 ('HappyHour', 54),
 ('Music', 52),
 ('Smoking', 49),
 ('BestNights', 46),
 ('BYOBCorkage', 39),
 ('DriveThru', 35),
 ('BYOB', 31),
 ('AcceptsInsurance', 22),
 ('Corkage', 19),
 ('RestaurantsCounterService', 12),
 ('HairSpecializesIn', 10),
 ('Open24Hours', 3),
 ('AgesAllowed', 3),
 ('DietaryRestrictions', 2)]



In [207]:

    
df_uc_open.shape[0]









    Out[207]:





729

We need a threshold to decide which attributes we use to build model.



In [212]:

    
data = yaml.load('['+','.join(df_uc_open.attributes[452])+']')
data









    Out[212]:





[{'Alcohol': 'full_bar'},
 {'Ambience': {'casual': False,
   'classy': False,
   'divey': False,
   'hipster': False,
   'intimate': False,
   'romantic': False,
   'touristy': False,
   'trendy': False,
   'upscale': False}},
 {'BusinessAcceptsCreditCards': True},
 {'NoiseLevel': 'average'},
 {'RestaurantsAttire': 'dressy'},
 {'RestaurantsGoodForGroups': True},
 {'RestaurantsPriceRange2': 3},
 {'RestaurantsReservations': True},
 {'RestaurantsTableService': True},
 {'BikeParking': True},
 {'GoodForKids': False},
 {'OutdoorSeating': True},
 {'BusinessParking': {'garage': False,
   'lot': False,
   'street': True,
   'valet': False,
   'validated': False}},
 {'HasTV': True},
 {'RestaurantsTakeOut': True},
 {'RestaurantsDelivery': True},
 {'Caters': True},
 {'WheelchairAccessible': True},
 {'WiFi': 'free'},
 {'GoodForMeal': {'breakfast': False,
   'brunch': False,
   'dessert': False,
   'dinner': True,
   'latenight': False,
   'lunch': False}},
 {'BestNights': {'friday': False,
   'monday': False,
   'saturday': False,
   'sunday': False,
   'thursday': True,
   'tuesday': True,
   'wednesday': True}},
 {'CoatCheck': True},
 {'GoodForDancing': False},
 {'Music': {'background_music': True,
   'dj': False,
   'jukebox': False,
   'karaoke': False,
   'live': False,
   'no_music': False,
   'video': False}},
 {'Smoking': False}]

There are 2 kinds of attributes:

* Binary 
    * 0 or 1
    * True or False
* Multiple
    * dict eg. 'GoodForMeal'
        * if ALL False, ?
        * if more than ONE True, ? eg. 'BestNights'
    * multi-value eg. 'RestaurantsPriceRange2' 1,2,3... or 'Alcohol'
* Anything Else


* How to deal with missing values?

9. 'Stars'



In [252]:

    
pl.hist(df_uc_open.stars)









    Out[252]:





(array([  12.,   19.,   32.,   55.,    0.,  127.,  162.,  138.,   94.,   90.]),
 array([ 1. ,  1.4,  1.8,  2.2,  2.6,  3. ,  3.4,  3.8,  4.2,  4.6,  5. ]),
 <a list of 10 Patch objects>)



In [228]:

    
df_uc_open.stars.describe()









    Out[228]:





count    729.000000
mean       3.587106
std        0.934106
min        1.000000
25%        3.000000
50%        3.500000
75%        4.500000
max        5.000000
Name: stars, dtype: float64



In [242]:

    
s = 3.9
1.0 * sum(df_uc_open.stars > s) / 729









    Out[242]:





0.44170096021947874

If we would like to build a binary classification model, here we could set threshold 'star' as '<4.0'



In [241]:

    
df_uc_open.stars.value_counts() /729









    Out[241]:





3.5    0.222222
4.0    0.189300
3.0    0.174211
4.5    0.128944
5.0    0.123457
2.5    0.075446
2.0    0.043896
1.5    0.026063
1.0    0.016461
Name: stars, dtype: float64



In [275]:

    
pl.hist?



In [276]:

    
c = ['Champaign', 'Urbana']
print c[0], '\n',df_uc_open[df_uc_open.city == c[0]].stars.describe()
pl.hist(df_uc_open[df_uc_open.city == c[0]].stars, normed=True)









    



Champaign 
count    559.000000
mean       3.567084
std        0.943507
min        1.000000
25%        3.000000
50%        3.500000
75%        4.000000
max        5.000000
Name: stars, dtype: float64






    Out[276]:





(array([ 0.04472272,  0.07155635,  0.1118068 ,  0.18783542,  0.        ,
         0.45169946,  0.5411449 ,  0.48300537,  0.29964222,  0.30858676]),
 array([ 1. ,  1.4,  1.8,  2.2,  2.6,  3. ,  3.4,  3.8,  4.2,  4.6,  5. ]),
 <a list of 10 Patch objects>)



In [277]:

    
print c[1], '\n',df_uc_open[df_uc_open.city == c[1]].stars.describe()
pl.hist(df_uc_open[df_uc_open.city == c[1]].stars, normed=True)









    



Urbana 
count    170.000000
mean       3.652941
std        0.902080
min        1.000000
25%        3.000000
50%        3.500000
75%        4.500000
max        5.000000
Name: stars, dtype: float64






    Out[277]:





(array([ 0.02941176,  0.04411765,  0.10294118,  0.19117647,  0.        ,
         0.38235294,  0.60294118,  0.44117647,  0.39705882,  0.30882353]),
 array([ 1. ,  1.4,  1.8,  2.2,  2.6,  3. ,  3.4,  3.8,  4.2,  4.6,  5. ]),
 <a list of 10 Patch objects>)

output df_uc_open



In [253]:

    
print df.shape, df_uc.shape, df_uc_open.shape









    



(1556, 16) (1354, 16) (729, 16)



In [15]:

    
df_uc_open.to_pickle("UC01_df_uc_open.p")

	address	attributes	business_id	categories	city	hours	is_open	latitude	longitude	name	neighborhood	postal_code	review_count	stars	state	type
0	2501 Fields S Dr	[BusinessAcceptsCreditCards: True]	J3EckozG83hEvm267MOeHQ	[Community Service/Non-Profit, Local Services]	Champaign	[Monday 5:0-22:0, Tuesday 5:0-22:0, Wednesday ...	1	40.081839	-88.308425	YMCA		61820	7	3.5	IL	business
1	44 E Main St	None	3dzPclva9uHp-22TbheqlA	[Trainers, Pilates, Active Life, Fitness & Ins...	Champaign	None	1	40.117598	-88.241400	Dulak Pilates Center		61820	4	5.0	IL	business