To-Do:

  • category [list]
  • attribute
    • transform 'attribute' to col
    • threshold (value/proportion) -> missing value
  • hours

In [1]:
import seaborn
import pandas as pd
import pylab as pl
import yaml
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
# Load the Urbana-Champaign business table and renumber its rows from 0.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
df = pd.read_pickle(
    "../yelp-challenge/data_urbana_champaign/business_urbana_champaign.p"
).reset_index(drop=True)
print(df.shape)
print(df.columns.values)
# Number of distinct business ids, to compare with the row count printed above.
df['business_id'].unique().size


(1556, 16)
[u'address' u'attributes' u'business_id' u'categories' u'city' u'hours'
 u'is_open' u'latitude' u'longitude' u'name' u'neighborhood' u'postal_code'
 u'review_count' u'stars' u'state' u'type']
Out[2]:
1556

geo: u'address', u'latitude', u'longitude'

No missing (None) geo data.
No missing (None) names.


In [3]:
# Preview the first two rows to see what each column holds.
df.head(2)


Out[3]:
address attributes business_id categories city hours is_open latitude longitude name neighborhood postal_code review_count stars state type
0 2501 Fields S Dr [BusinessAcceptsCreditCards: True] J3EckozG83hEvm267MOeHQ [Community Service/Non-Profit, Local Services] Champaign [Monday 5:0-22:0, Tuesday 5:0-22:0, Wednesday ... 1 40.081839 -88.308425 YMCA 61820 7 3.5 IL business
1 44 E Main St None 3dzPclva9uHp-22TbheqlA [Trainers, Pilates, Active Life, Fitness & Ins... Champaign None 1 40.117598 -88.241400 Dulak Pilates Center 61820 4 5.0 IL business

1. 'city'


In [4]:
print df.city.unique(), '\n', len(df.city.unique())


[u'Champaign' u'Tuscola' u'Urbana' u'Monticello' u'Savoy' u'Mahomet'
 u'St Joseph' u'Rantoul' u'Philo' u'Tolono' u'Villa Grove' u'Broadlands'
 u'Saint Joseph' u'Gifford' u'Dewey' u'Mansfield' u'Ogden' u'Fithian'
 u'Elk Grove Villa' u'Sidney' u'Homer' u'St. Joseph'] 
22

In [28]:
# Percentage of businesses located in Champaign or Urbana.
# value_counts() yields the per-city row counts directly, replacing the
# hand-rolled groupby(...).apply(lambda x: x.count()).
city_count = df.city.value_counts()
uc_sum = city_count[city_count.index.isin(['Champaign', 'Urbana'])].sum()
100.0 * uc_sum / city_count.sum()


Out[28]:
87.01799485861183

We only consider Champaign and Urbana as our targets in this dataset; together they account for about 87% of the businesses.


In [3]:
# Keep only the rows whose city is Champaign or Urbana.
df_uc = df.loc[df['city'].isin(['Champaign', 'Urbana'])]
df_uc.shape


Out[3]:
(1354, 16)

2. 'is_open'


In [33]:
# Report what share of the Champaign/Urbana businesses is flagged as open.
# The open count is computed once with the vectorised Series.sum() instead of
# walking the boolean Series twice with the builtin sum().
n_open = int((df_uc.is_open == 1).sum())
n_total = len(df_uc)
print("{}% ({}/{}) business_id(s) are open".format(100.0 * n_open / n_total, n_open, n_total))


82.0531757755% (1111/1354) business_id(s) are open

We only consider businesses with is_open == 1.


In [11]:
# Restrict to businesses flagged as open and renumber the rows from 0.
df_uc_open = df_uc.loc[df_uc['is_open'] == 1].reset_index(drop=True)
df_uc_open.shape


Out[11]:
(1111, 16)

3. 'Hour'


In [112]:
# Percentage of open businesses that have a non-missing 'hours' entry.
n_with_hours = len(df_uc_open['hours'].dropna())
100.0 * n_with_hours / len(df_uc_open)


Out[112]:
73.26732673267327

Drop all records with missing 'hours'.


In [12]:
# Remove rows without 'hours' and renumber the remaining rows from 0.
df_uc_open = (
    df_uc_open
    .dropna(subset=['hours'])
    .reset_index(drop=True)
)
df_uc_open.shape


Out[12]:
(814, 16)

4. 'postal_code'


In [116]:
# Number of distinct postal codes among the remaining businesses.
df_uc_open['postal_code'].unique().size


Out[116]:
8

5. 'review_count'


In [117]:
# Summary statistics of review_count over the full, unfiltered table.
df['review_count'].describe()


Out[117]:
count    1556.000000
mean       19.199229
std        37.116487
min         3.000000
25%         4.000000
50%         8.000000
75%        19.000000
max       697.000000
Name: review_count, dtype: float64

In [118]:
# Summary statistics of review_count over the filtered table, for comparison
# with the unfiltered figures.
df_uc_open['review_count'].describe()


Out[118]:
count    814.000000
mean      26.356265
std       48.775645
min        3.000000
25%        5.000000
50%       10.000000
75%       27.000000
max      697.000000
Name: review_count, dtype: float64

In [119]:
# Show the business with the highest review_count.
# idxmax() returns the row *label* of the maximum, and .loc does a label
# lookup. The original combined the deprecated .ix indexer with
# Series.argmax(), whose meaning changed from label to position in later
# pandas releases.
df_uc_open.loc[df_uc_open.review_count.idxmax()]


Out[119]:
address                                        201 N Broadway Ave
attributes      [Alcohol: full_bar, Ambience: {'romantic': Fal...
business_id                                9MnbQg7kfb_WgxoV0hXKSQ
categories                                [Barbeque, Restaurants]
city                                                       Urbana
hours           [Monday 11:0-22:0, Tuesday 11:0-22:0, Wednesda...
is_open                                                         1
latitude                                                  40.1138
longitude                                                -88.2077
name                                  Black Dog Smoke & Ale House
neighborhood                                                     
postal_code                                                 61801
review_count                                                  697
stars                                                         4.5
state                                                          IL
type                                                     business
Name: 601, dtype: object

6. 'categories'


In [121]:
# Tally how many businesses list each category; records whose 'categories'
# value is empty or None are counted separately in none_num.
cat = {}
none_num = 0
for categories in df_uc_open.categories:
    if categories:
        for name in categories:
            cat[name] = cat.get(name, 0) + 1
    else:
        none_num += 1

In [122]:
# Number of records skipped by the category tally because their 'categories'
# value was empty or None.
none_num


Out[122]:
0

In [123]:
# Categories ranked from most to least frequent, as (name, count) pairs.
sorted(cat.items(), key=lambda pair: pair[1], reverse=True)


Out[123]:
[(u'Restaurants', 256),
 (u'Shopping', 177),
 (u'Food', 125),
 (u'Home Services', 98),
 (u'Beauty & Spas', 87),
 (u'Local Services', 72),
 (u'Automotive', 68),
 (u'Nightlife', 63),
 (u'Event Planning & Services', 55),
 (u'Home & Garden', 54),
 (u'Bars', 54),
 (u'Auto Repair', 50),
 (u'Fashion', 40),
 (u'American (Traditional)', 38),
 (u'Hotels & Travel', 36),
 (u'Real Estate', 35),
 (u'Hair Salons', 35),
 (u'Sandwiches', 34),
 (u'Fast Food', 34),
 (u'Health & Medical', 33),
 (u'Chinese', 29),
 (u'Pizza', 29),
 (u'Mexican', 28),
 (u'Apartments', 27),
 (u'Active Life', 26),
 (u'American (New)', 25),
 (u'Nail Salons', 22),
 (u'Coffee & Tea', 22),
 (u'Contractors', 22),
 (u'Furniture Stores', 22),
 (u'Breakfast & Brunch', 22),
 (u'Pets', 22),
 (u'Bakeries', 22),
 (u'Specialty Food', 21),
 (u'Home Decor', 20),
 (u'Tires', 20),
 (u'Burgers', 19),
 (u'Hotels', 19),
 (u'Grocery', 18),
 (u'Pet Services', 18),
 (u'Auto Parts & Supplies', 17),
 (u'Professional Services', 17),
 (u'Arts & Entertainment', 17),
 (u'Sporting Goods', 17),
 (u'Italian', 17),
 (u'Hair Removal', 16),
 (u'Delis', 16),
 (u'Massage', 15),
 (u'Oil Change Stations', 15),
 (u'Ice Cream & Frozen Yogurt', 15),
 (u'Barbers', 15),
 (u'Venues & Event Spaces', 14),
 (u'Department Stores', 14),
 (u'Property Management', 14),
 (u'Building Supplies', 13),
 (u'Appliances', 13),
 (u'Cosmetics & Beauty Supply', 13),
 (u'Skin Care', 13),
 (u'Hair Stylists', 13),
 (u'Salad', 13),
 (u'Pet Boarding/Pet Sitting', 12),
 (u'Chicken Wings', 12),
 (u'Mattresses', 12),
 (u'Arts & Crafts', 12),
 (u'Flowers & Gifts', 12),
 (u'Mags', 12),
 (u'Desserts', 12),
 (u'Music & Video', 12),
 (u'Books', 12),
 (u'Fitness & Instruction', 11),
 (u'Transportation', 11),
 (u'Day Spas', 11),
 (u'Caterers', 11),
 (u'Wedding Planning', 11),
 (u'Veterinarians', 11),
 (u'Drugstores', 10),
 (u'Hardware Stores', 10),
 (u'Barbeque', 10),
 (u'Waxing', 10),
 (u'Sushi Bars', 10),
 (u'University Housing', 10),
 (u'Thai', 10),
 (u'Pet Groomers', 10),
 (u'Shoe Stores', 10),
 (u'Cafes', 10),
 (u"Women's Clothing", 10),
 (u'Car Dealers', 10),
 (u'Lounges', 9),
 (u'Food Trucks', 9),
 (u'Sports Bars', 9),
 (u'Wine & Spirits', 9),
 (u'Nurseries & Gardening', 9),
 (u'Party & Event Planning', 9),
 (u'Asian Fusion', 9),
 (u'Massage Therapy', 9),
 (u'Beer', 9),
 (u'Florists', 9),
 (u'Gyms', 8),
 (u'Diners', 8),
 (u'Home Cleaning', 8),
 (u'Movers', 8),
 (u"Men's Clothing", 8),
 (u'Auto Detailing', 8),
 (u'Printing Services', 8),
 (u'Heating & Air Conditioning/HVAC', 8),
 (u'Food Delivery Services', 8),
 (u'Towing', 8),
 (u'Steakhouses', 8),
 (u'Dentists', 8),
 (u'Real Estate Services', 8),
 (u'Convenience Stores', 8),
 (u'Pubs', 8),
 (u'Electronics', 8),
 (u'Japanese', 7),
 (u'Optometrists', 7),
 (u'Body Shops', 7),
 (u'Tex-Mex', 7),
 (u'Plumbing', 7),
 (u'IT Services & Computer Repair', 7),
 (u'Airport Shuttles', 7),
 (u'Doctors', 7),
 (u'Pet Stores', 7),
 (u'General Dentistry', 7),
 (u'Photography Stores & Services', 7),
 (u'Dry Cleaning & Laundry', 6),
 (u'Juice Bars & Smoothies', 6),
 (u'Bikes', 6),
 (u'Yoga', 6),
 (u'Eyewear & Opticians', 6),
 (u'Trainers', 6),
 (u'Taxis', 6),
 (u'Ethnic Food', 6),
 (u'Sewing & Alterations', 6),
 (u'Bagels', 6),
 (u'Mobile Phones', 6),
 (u'Thrift Stores', 6),
 (u'Jewelry', 6),
 (u'Sports Wear', 6),
 (u'Korean', 6),
 (u'Financial Services', 6),
 (u'Tanning', 5),
 (u"Men's Hair Salons", 5),
 (u'Hobby Shops', 5),
 (u'Art Galleries', 5),
 (u'Limos', 5),
 (u'Tobacco Shops', 5),
 (u'Music Venues', 5),
 (u'Accessories', 5),
 (u'Car Wash', 5),
 (u'Appliances & Repair', 5),
 (u'Used', 5),
 (u'Indian', 5),
 (u'Bubble Tea', 5),
 (u'Bookstores', 5),
 (u'Self Storage', 5),
 (u'Vintage & Consignment', 5),
 (u'Bridal', 5),
 (u'Baby Gear & Furniture', 5),
 (u'Eyelash Service', 5),
 (u'Roofing', 4),
 (u'Antiques', 4),
 (u'Interior Design', 4),
 (u'Handyman', 4),
 (u'Vegetarian', 4),
 (u'Piercing', 4),
 (u'Seafood', 4),
 (u'Discount Store', 4),
 (u'Music & DVDs', 4),
 (u'Pet Training', 4),
 (u'Oral Surgeons', 4),
 (u'Pool Halls', 4),
 (u'Vape Shops', 4),
 (u'Photographers', 4),
 (u'Tree Services', 4),
 (u'Chocolatiers & Shops', 4),
 (u'Art Supplies', 4),
 (u'Recycling Center', 4),
 (u'Dance Clubs', 4),
 (u'Makeup Artists', 4),
 (u'Mobile Phone Repair', 4),
 (u'Wine Bars', 4),
 (u'Shipping Centers', 4),
 (u'Local Flavor', 4),
 (u'Soup', 4),
 (u'Notaries', 4),
 (u'Jewelry Repair', 4),
 (u'Karaoke', 4),
 (u'Musical Instruments & Teachers', 4),
 (u'Kitchen & Bath', 4),
 (u'Health Markets', 4),
 (u'Cosmetic Dentists', 4),
 (u'Tattoo', 4),
 (u'Electronics Repair', 4),
 (u'Carpet Cleaning', 4),
 (u'Landscaping', 4),
 (u'Real Estate Agents', 4),
 (u'Auto Glass Services', 4),
 (u'Truck Rental', 3),
 (u'Creperies', 3),
 (u'Formal Wear', 3),
 (u'Fabric Stores', 3),
 (u'Outdoor Gear', 3),
 (u'Orthodontists', 3),
 (u'Shoe Repair', 3),
 (u'Cajun/Creole', 3),
 (u'Greek', 3),
 (u'Ophthalmologists', 3),
 (u'Couriers & Delivery Services', 3),
 (u'Office Cleaning', 3),
 (u'Shaved Ice', 3),
 (u'Chiropractors', 3),
 (u'Dive Bars', 3),
 (u'Acupuncture', 3),
 (u'Damage Restoration', 3),
 (u'Szechuan', 3),
 (u'Keys & Locksmiths', 3),
 (u'Recreation Centers', 3),
 (u'Computers', 3),
 (u'Pest Control', 3),
 (u'Comfort Food', 3),
 (u'Hair Extensions', 3),
 (u'Session Photography', 3),
 (u'Mediterranean', 3),
 (u'Education', 3),
 (u'Insurance', 3),
 (u'Gutter Services', 3),
 (u'Internet Service Providers', 3),
 (u'Car Rental', 3),
 (u'Toy Stores', 3),
 (u'Imported Food', 3),
 (u'Banks & Credit Unions', 3),
 (u'Security Systems', 3),
 (u'Vegan', 3),
 (u'Windshield Installation & Repair', 3),
 (u'Snow Removal', 3),
 (u'Cheese Shops', 3),
 (u'Bike Repair/Maintenance', 3),
 (u'Medical Supplies', 3),
 (u'Buffets', 2),
 (u'Auto Insurance', 2),
 (u'Community Service/Non-Profit', 2),
 (u'Herbs & Spices', 2),
 (u'Noodles', 2),
 (u'Watch Repair', 2),
 (u'Empanadas', 2),
 (u'Libraries', 2),
 (u'Taiwanese', 2),
 (u'Vietnamese', 2),
 (u'Video Game Stores', 2),
 (u'Air Duct Cleaning', 2),
 (u'Transmission Repair', 2),
 (u'Blow Dry/Out Services', 2),
 (u'Beer Gardens', 2),
 (u'Hot Dogs', 2),
 (u'Pediatric Dentists', 2),
 (u'Laundry Services', 2),
 (u'Watches', 2),
 (u'Lawyers', 2),
 (u'Tapas Bars', 2),
 (u'Public Transportation', 2),
 (u'Cupcakes', 2),
 (u'Martial Arts', 2),
 (u'Television Service Providers', 2),
 (u'Knitting Supplies', 2),
 (u'Arcades', 2),
 (u'Escape Games', 2),
 (u'Gluten-Free', 2),
 (u'Gelato', 2),
 (u'Motorcycle Dealers', 2),
 (u'Medical Centers', 2),
 (u'Do-It-Yourself Food', 2),
 (u'Junk Removal & Hauling', 2),
 (u'Gas & Service Stations', 2),
 (u'Fish & Chips', 2),
 (u'Farmers Market', 2),
 (u'Beer Bar', 2),
 (u'Tours', 2),
 (u'Commercial Real Estate', 2),
 (u'Guns & Ammo', 2),
 (u'Southern', 2),
 (u'Candy Stores', 2),
 (u'Vinyl Records', 2),
 (u'Donuts', 2),
 (u'Painters', 2),
 (u'Counseling & Mental Health', 2),
 (u'Motorcycle Repair', 2),
 (u'Street Vendors', 2),
 (u'Signmaking', 2),
 (u'Party Supplies', 2),
 (u'Public Services & Government', 2),
 (u'Carpeting', 2),
 (u'Plus Size Fashion', 2),
 (u'Event Photography', 2),
 (u'Adult Entertainment', 2),
 (u'Home & Rental Insurance', 2),
 (u'Framing', 2),
 (u'Graphic Design', 2),
 (u'Parking', 2),
 (u'Videos & Video Game Rental', 2),
 (u'Gardeners', 2),
 (u'Windows Installation', 2),
 (u'Meat Shops', 2),
 (u'Endodontists', 1),
 (u'Organic Stores', 1),
 (u'Architects', 1),
 (u'Bowling', 1),
 (u'Vinyl Siding', 1),
 (u'Vitamins & Supplements', 1),
 (u'Real Estate Law', 1),
 (u'Laser Eye Surgery/Lasik', 1),
 (u'Diagnostic Services', 1),
 (u'Food Stands', 1),
 (u'Shopping Centers', 1),
 (u'Mountain Biking', 1),
 (u'Elementary Schools', 1),
 (u'Animal Shelters', 1),
 (u'Pool & Billiards', 1),
 (u'Zoos', 1),
 (u'Hookah Bars', 1),
 (u'Middle Eastern', 1),
 (u'Trusts', 1),
 (u'Landscape Architects', 1),
 (u'Divorce & Family Law', 1),
 (u'Costumes', 1),
 (u'Souvenir Shops', 1),
 (u'Used Bookstore', 1),
 (u'International Grocery', 1),
 (u'Kosher', 1),
 (u'Climbing', 1),
 (u'Basque', 1),
 (u'Electricians', 1),
 (u'Furniture Reupholstery', 1),
 (u'Web Design', 1),
 (u'Screen Printing', 1),
 (u'Buses', 1),
 (u'Wills', 1),
 (u'Comic Books', 1),
 (u'Dance Studios', 1),
 (u'Shades & Blinds', 1),
 (u'Fitness/Exercise Equipment', 1),
 (u'Cocktail Bars', 1),
 (u'Mortgage Brokers', 1),
 (u'Batting Cages', 1),
 (u'Shutters', 1),
 (u'Window Washing', 1),
 (u'Estate Planning Law', 1),
 (u'Mobile Phone Accessories', 1),
 (u'Dog Walkers', 1),
 (u'CSA', 1),
 (u'Museums', 1),
 (u'Performing Arts', 1),
 (u'Office Equipment', 1),
 (u'Naturopathic/Holistic', 1),
 (u'Smokehouse', 1),
 (u'Garage Door Services', 1),
 (u'& Probates', 1),
 (u'Outlet Stores', 1),
 (u'Trampoline Parks', 1),
 (u'Aquariums', 1),
 (u'Trailer Repair', 1),
 (u'Embroidery & Crochet', 1),
 (u'Jazz & Blues', 1),
 (u'Reflexology', 1),
 (u'Lingerie', 1),
 (u'Criminal Defense Law', 1),
 (u'Motorcycle Gear', 1),
 (u'Health Retreats', 1),
 (u'Newspapers & Magazines', 1),
 (u'Irish', 1),
 (u'Amusement Parks', 1),
 (u'Door Sales/Installation', 1),
 (u'Spray Tanning', 1),
 (u'Specialty Schools', 1),
 (u'Fertility', 1),
 (u'Personal Assistants', 1),
 (u'Personal Chefs', 1),
 (u'Caribbean', 1),
 (u'Roadside Assistance', 1),
 (u'Airports', 1),
 (u"Children's Clothing", 1),
 (u'Utilities', 1),
 (u'Argentine', 1),
 (u'Screen Printing/T-Shirt Printing', 1),
 (u'Middle Schools & High Schools', 1),
 (u'Cinema', 1),
 (u'Gun/Rifle Ranges', 1),
 (u'Gay Bars', 1),
 (u'Religious Organizations', 1),
 (u'Physical Therapy', 1),
 (u'Hot Pot', 1),
 (u'Casinos', 1),
 (u'Cards & Stationery', 1),
 (u'Irrigation', 1),
 (u'Mongolian', 1),
 (u'Life Insurance', 1),
 (u'Carpenters', 1),
 (u'RV Repair', 1),
 (u'Luggage', 1),
 (u'Masonry/Concrete', 1),
 (u'Appraisal Services', 1),
 (u'Chimney Sweeps', 1),
 (u'Wholesale Stores', 1),
 (u'Arabian', 1),
 (u'Churches', 1),
 (u'Wigs', 1),
 (u'Skating Rinks', 1),
 (u'Brewing Supplies', 1),
 (u'Immigration Law', 1),
 (u'Obstetricians & Gynecologists', 1),
 (u'Musical Instrument Services', 1),
 (u'Laboratory Testing', 1),
 (u'Train Stations', 1),
 (u'Bike Rentals', 1),
 (u'Wineries', 1),
 (u'Lighting Fixtures & Equipment', 1),
 (u'Wheel & Rim Repair', 1),
 (u'Unofficial Yelp Events', 1),
 (u'Cabinetry', 1),
 (u'Periodontists', 1),
 (u'Pakistani', 1),
 (u'Piano Stores', 1),
 (u'Mini Golf', 1),
 (u'Car Stereo Installation', 1),
 (u'Breweries', 1),
 (u'Malaysian', 1),
 (u'Custom Cakes', 1),
 (u'DUI Law', 1),
 (u'Golf', 1),
 (u'Seafood Markets', 1),
 (u'Flooring', 1),
 (u'Parks', 1),
 (u'Glass & Mirrors', 1),
 (u'Home Staging', 1),
 (u'Olive Oil', 1),
 (u'Coffee Roasteries', 1),
 (u'Ticket Sales', 1),
 (u'Traditional Chinese Medicine', 1),
 (u'Spanish', 1)]

Here we should define the list of categories we are interested in.


In [ ]:

7. 'Neighborhood'


In [124]:
# Distinct values found in the 'neighborhood' column.
df_uc_open['neighborhood'].unique()


Out[124]:
array([u''], dtype=object)

All 'neighborhood' values are empty strings, so this column carries no information.

8. 'Attribute'

First of all, remove records whose 'attributes' value is None.


In [13]:
# Remove rows without 'attributes' and renumber the remaining rows from 0.
df_uc_open = (
    df_uc_open
    .dropna(subset=['attributes'])
    .reset_index(drop=True)
)
df_uc_open.shape
# Row count goes from 814 to 729 in the recorded run.


Out[13]:
(729, 16)

In [204]:
# df_uc_open.attributes
# import yaml

In [202]:
# Tally how many businesses carry each attribute key.
att_count = {}
for raw_attributes in df_uc_open.attributes:
    # Each record is a list of "Key: value" strings; joined with commas inside
    # brackets they form a YAML flow sequence of single-key mappings.
    # safe_load is used because the data only needs plain scalars and dicts:
    # yaml.load without an explicit Loader can construct arbitrary objects and
    # is rejected by recent PyYAML versions.
    parsed = yaml.safe_load('[' + ','.join(raw_attributes) + ']')
    for entry in parsed:
        # next(iter(...)) fetches the single key on both Python 2 and 3
        # (dict.keys()[0] only works on Python 2).
        key = next(iter(entry))
        att_count[key] = att_count.get(key, 0) + 1

In [203]:
# Attribute keys ranked from most to least frequent, as (key, count) pairs.
sorted(att_count.items(), key=lambda pair: pair[1], reverse=True)


Out[203]:
[('BusinessAcceptsCreditCards', 690),
 ('RestaurantsPriceRange2', 553),
 ('BusinessParking', 507),
 ('BikeParking', 454),
 ('WheelchairAccessible', 329),
 ('GoodForKids', 300),
 ('RestaurantsTakeOut', 292),
 ('OutdoorSeating', 274),
 ('RestaurantsGoodForGroups', 264),
 ('RestaurantsDelivery', 261),
 ('Alcohol', 257),
 ('RestaurantsReservations', 255),
 ('WiFi', 253),
 ('HasTV', 252),
 ('Ambience', 250),
 ('NoiseLevel', 242),
 ('RestaurantsAttire', 240),
 ('GoodForMeal', 240),
 ('RestaurantsTableService', 223),
 ('Caters', 202),
 ('ByAppointmentOnly', 169),
 ('BusinessAcceptsBitcoin', 68),
 ('GoodForDancing', 57),
 ('DogsAllowed', 56),
 ('CoatCheck', 55),
 ('HappyHour', 54),
 ('Music', 52),
 ('Smoking', 49),
 ('BestNights', 46),
 ('BYOBCorkage', 39),
 ('DriveThru', 35),
 ('BYOB', 31),
 ('AcceptsInsurance', 22),
 ('Corkage', 19),
 ('RestaurantsCounterService', 12),
 ('HairSpecializesIn', 10),
 ('Open24Hours', 3),
 ('AgesAllowed', 3),
 ('DietaryRestrictions', 2)]

In [207]:
# Number of businesses left, as the reference total for the attribute counts.
len(df_uc_open)


Out[207]:
729

We need a threshold to decide which attributes we use to build the model.


In [212]:
# Parse the attribute strings of one example record (row label 452) to inspect
# the value types that occur.
# safe_load replaces yaml.load: the data only needs plain scalars and dicts,
# while yaml.load without an explicit Loader can construct arbitrary objects
# and is rejected by recent PyYAML versions.
data = yaml.safe_load('[' + ','.join(df_uc_open.attributes[452]) + ']')
data


Out[212]:
[{'Alcohol': 'full_bar'},
 {'Ambience': {'casual': False,
   'classy': False,
   'divey': False,
   'hipster': False,
   'intimate': False,
   'romantic': False,
   'touristy': False,
   'trendy': False,
   'upscale': False}},
 {'BusinessAcceptsCreditCards': True},
 {'NoiseLevel': 'average'},
 {'RestaurantsAttire': 'dressy'},
 {'RestaurantsGoodForGroups': True},
 {'RestaurantsPriceRange2': 3},
 {'RestaurantsReservations': True},
 {'RestaurantsTableService': True},
 {'BikeParking': True},
 {'GoodForKids': False},
 {'OutdoorSeating': True},
 {'BusinessParking': {'garage': False,
   'lot': False,
   'street': True,
   'valet': False,
   'validated': False}},
 {'HasTV': True},
 {'RestaurantsTakeOut': True},
 {'RestaurantsDelivery': True},
 {'Caters': True},
 {'WheelchairAccessible': True},
 {'WiFi': 'free'},
 {'GoodForMeal': {'breakfast': False,
   'brunch': False,
   'dessert': False,
   'dinner': True,
   'latenight': False,
   'lunch': False}},
 {'BestNights': {'friday': False,
   'monday': False,
   'saturday': False,
   'sunday': False,
   'thursday': True,
   'tuesday': True,
   'wednesday': True}},
 {'CoatCheck': True},
 {'GoodForDancing': False},
 {'Music': {'background_music': True,
   'dj': False,
   'jukebox': False,
   'karaoke': False,
   'live': False,
   'no_music': False,
   'video': False}},
 {'Smoking': False}]

There are two main kinds of attributes (plus a catch-all for anything else):

* Binary 
    * 0 or 1
    * True or False
* Multiple
    * dict eg. 'GoodForMeal'
        * if ALL False, ?
        * if more than ONE True, ? eg. 'BestNights'
    * multi-value eg. 'RestaurantsPriceRange2' 1,2,3... or 'Alcohol'
* Anything Else


* How to deal with missing values?

9. 'Stars'


In [252]:
# Histogram of star ratings over all remaining businesses.
pl.hist(df_uc_open['stars'])


Out[252]:
(array([  12.,   19.,   32.,   55.,    0.,  127.,  162.,  138.,   94.,   90.]),
 array([ 1. ,  1.4,  1.8,  2.2,  2.6,  3. ,  3.4,  3.8,  4.2,  4.6,  5. ]),
 <a list of 10 Patch objects>)

In [228]:
# Summary statistics of the star ratings.
df_uc_open['stars'].describe()


Out[228]:
count    729.000000
mean       3.587106
std        0.934106
min        1.000000
25%        3.000000
50%        3.500000
75%        4.500000
max        5.000000
Name: stars, dtype: float64

In [242]:
# Fraction of businesses rated above the candidate threshold s.
s = 3.9
# The denominator is taken from the frame itself rather than a hard-coded 729,
# so the result stays correct if an earlier filtering step changes; the count
# uses the vectorised Series.sum() instead of the builtin sum().
1.0 * (df_uc_open.stars > s).sum() / len(df_uc_open)


Out[242]:
0.44170096021947874

If we would like to build a binary classification model, we could set the threshold at 4.0 stars: about 44% of the businesses have stars >= 4.0, and the rest have stars < 4.0.


In [241]:
# Relative frequency of each star rating.
# normalize=True divides by the number of counted rows, replacing the
# hard-coded 729 that silently goes stale when earlier filters change.
df_uc_open.stars.value_counts(normalize=True)


Out[241]:
3.5    0.222222
4.0    0.189300
3.0    0.174211
4.5    0.128944
5.0    0.123457
2.5    0.075446
2.0    0.043896
1.5    0.026063
1.0    0.016461
Name: stars, dtype: float64

In [275]:
# NOTE(review): IPython help lookup left over from development; it is not plain
# Python and produces no analysis result. Consider removing it before sharing.
pl.hist?

In [276]:
c = ['Champaign', 'Urbana']
print c[0], '\n',df_uc_open[df_uc_open.city == c[0]].stars.describe()
pl.hist(df_uc_open[df_uc_open.city == c[0]].stars, normed=True)


Champaign 
count    559.000000
mean       3.567084
std        0.943507
min        1.000000
25%        3.000000
50%        3.500000
75%        4.000000
max        5.000000
Name: stars, dtype: float64
Out[276]:
(array([ 0.04472272,  0.07155635,  0.1118068 ,  0.18783542,  0.        ,
         0.45169946,  0.5411449 ,  0.48300537,  0.29964222,  0.30858676]),
 array([ 1. ,  1.4,  1.8,  2.2,  2.6,  3. ,  3.4,  3.8,  4.2,  4.6,  5. ]),
 <a list of 10 Patch objects>)

In [277]:
print c[1], '\n',df_uc_open[df_uc_open.city == c[1]].stars.describe()
pl.hist(df_uc_open[df_uc_open.city == c[1]].stars, normed=True)


Urbana 
count    170.000000
mean       3.652941
std        0.902080
min        1.000000
25%        3.000000
50%        3.500000
75%        4.500000
max        5.000000
Name: stars, dtype: float64
Out[277]:
(array([ 0.02941176,  0.04411765,  0.10294118,  0.19117647,  0.        ,
         0.38235294,  0.60294118,  0.44117647,  0.39705882,  0.30882353]),
 array([ 1. ,  1.4,  1.8,  2.2,  2.6,  3. ,  3.4,  3.8,  4.2,  4.6,  5. ]),
 <a list of 10 Patch objects>)

Save df_uc_open to disk.


In [253]:
# Row/column counts after each filtering stage: raw table, Champaign/Urbana
# only, and the final filtered frame.
print(' '.join(str(frame.shape) for frame in (df, df_uc, df_uc_open)))


(1556, 16) (1354, 16) (729, 16)

In [15]:
# Write the filtered frame to a pickle file in the current working directory
# (presumably for use by later steps - confirm against the consumers).
df_uc_open.to_pickle("UC01_df_uc_open.p")