In [1]:
%pylab inline

import pandas as pd
import json
import sqlite3
import ast


Populating the interactive namespace from numpy and matplotlib

In [2]:
reviews = pd.DataFrame(json.loads(l) for l in open('/home/russ/Documents/DDL/Data/YelpData/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json'))

In [3]:
reviews['useful'] = reviews['votes'].apply(lambda v: v['useful'])

In [2]:
location_db = '/home/russ/Documents/DDL/Projects/03-censusables/databases/Censusables.db'
conn = sqlite3.connect(location_db)
reviews_db = pd.read_sql("SELECT * FROM 'Yelp.Reviews'", con=conn)

In [7]:
reviews_db.head(1)


Out[7]:
row_id type business_id user_id stars text date votes
0 1 review Xqd0DzHaiyRqVH3WRG7hzg vcNAWiLM4dR7D2nwwJ7nCA 5 dr. goldberg offers everything i look for in a... 2007-05-17 {u'funny': 0, u'useful': 2, u'cool': 1}

In [26]:
import ast
def json_convert (v):
    d = json.loads(v)
    return d['useful']

my_dict = ast.literal_eval("{u'funny': 0, u'useful': 2, u'cool': 1}")
print my_dict['useful']
#reviews_db ['useful'] = reviews_db['votes'].apply(json_convert)


2

In [3]:
reviews_db['useful'] = reviews_db['votes'].apply(lambda v: ast.literal_eval(v)['useful'])

In [4]:
reviews.head(1)


Out[4]:
business_id date review_id stars text type user_id votes useful
0 vcNAWiLM4dR7D2nwwJ7nCA 2007-05-17 15SdjuK7DmYqUAj6rjGowg 5 dr. goldberg offers everything i look for in a... review Xqd0DzHaiyRqVH3WRG7hzg {u'funny': 0, u'useful': 2, u'cool': 1} 2

In [31]:
reviews_db.head(1)


Out[31]:
row_id type business_id user_id stars text date votes useful
0 1 review Xqd0DzHaiyRqVH3WRG7hzg vcNAWiLM4dR7D2nwwJ7nCA 5 dr. goldberg offers everything i look for in a... 2007-05-17 {u'funny': 0, u'useful': 2, u'cool': 1} 2

In [4]:
ytract = pd.DataFrame(json.loads(l) for l in open('/home/russ/Documents/DDL/Data/YelpData/business_tract.json'))
business = pd.DataFrame(json.loads(l) for l in open('/home/russ/Documents/DDL/Data/YelpData/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json'))

In [3]:
business.head(1)


Out[3]:
attributes business_id categories city full_address hours latitude longitude name neighborhoods open review_count stars state type
0 {u'By Appointment Only': True} vcNAWiLM4dR7D2nwwJ7nCA [Doctors, Health & Medical] Phoenix 4840 E Indian School Rd\nSte 101\nPhoenix, AZ ... {u'Thursday': {u'close': u'17:00', u'open': u'... 33.499313 -111.983758 Eric Goldberg, MD [] True 9 3.5 AZ business

In [5]:
business = business.merge(ytract)

In [ ]:


In [46]:
categories = {}
for d in (json.loads(l) for l in open('/home/russ/Documents/DDL/Data/YelpData/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json')):
    for c in d['categories']:
        if c in categories:
            categories[c] += 1
        else:
            categories[c] = 1

In [30]:
category_df = pd.DataFrame(categories.items(),columns=['Category','Count'])
category_df = category_df.sort('Count',ascending=False)
category_df


Out[30]:
Category Count
31 Restaurants 21892
454 Shopping 8919
348 Food 7862
239 Beauty & Spas 4738
215 Nightlife 4340
663 Bars 3628
548 Health & Medical 3213
63 Automotive 2965
147 Home Services 2853
614 Fashion 2566
230 Active Life 2470
626 Event Planning & Services 2467
749 Fast Food 2383
150 Pizza 2223
773 Mexican 2208
365 Local Services 2144
558 Hotels & Travel 2131
332 American (Traditional) 2113
193 Sandwiches 1981
391 Arts & Entertainment 1946
251 Coffee & Tea 1890
419 Italian 1633
86 Chinese 1496
440 American (New) 1494
272 Burgers 1481
247 Hair Salons 1388
533 Hotels 1307
59 Nail Salons 1256
23 Grocery 1233
329 Auto Repair 1220
... ... ...
122 Pole Dancing Classes 1
124 Pita 1
667 Pop-up Shops 1
109 Donairs 1
131 Bus Tours 1
132 Haitian 1
625 Mosques 1
187 Surf Shop 1
597 Day Camps 1
192 Oriental 1
194 Swiss Food 1
198 Eastern German 1
583 Bartending Schools 1
71 Walking Tours 1
64 Eastern European 1
211 Basketball Courts 1
568 Used Bookstore 1
212 Bike Sharing 1
58 Firearm Training 1
545 Walk-in Clinics 1
725 Czech 1
534 Golf Equipment Shops 1
520 Castles 1
737 Hang Gliding 1
44 Customized Merchandise 1
741 Psychologists 1
282 Furniture Repair 1
283 Bavarian 1
289 Hospice 1
782 Cabaret 1

783 rows × 2 columns


In [ ]:
category_df.to_csv('counts_by_category.csv')

In [11]:
"""cats = []
for ind, row in business.iterrows():
    #cats.append(dict(business_id = business['business_id'], category=c))
    print row
    break

#cats = pd.DataFrame(cats)   """


attributes                          {u'By Appointment Only': True}
business_id                                 vcNAWiLM4dR7D2nwwJ7nCA
categories                             [Doctors, Health & Medical]
city                                                       Phoenix
full_address     4840 E Indian School Rd\nSte 101\nPhoenix, AZ ...
hours            {u'Thursday': {u'close': u'17:00', u'open': u'...
latitude                                                  33.49931
longitude                                                -111.9838
name                                             Eric Goldberg, MD
neighborhoods                                                   []
open                                                          True
review_count                                                     9
stars                                                          3.5
state                                                           AZ
type                                                      business
GISJOIN                                             G0400130108200
Name: 0, dtype: object

In [93]:
"""def SeriesFromSubList(aList):
    return pd.Series(aList)


df_orgz = pd.concat(dict([ (ind,row.apply(SeriesFromSubList)) for ind,row in business.iterrows() ]))"""


Out[93]:
'def SeriesFromSubList(aList):\n    return pd.Series(aList)\n\n\ndf_orgz = pd.concat(dict([ (ind,row.apply(SeriesFromSubList)) for ind,row in business.iterrows() ]))'

In [6]:
#Create BusinessID by Individual Category
cats = []
for d in (json.loads(l) for l in open('/home/russ/Documents/DDL/Data/YelpData/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json')):
    for c in d['categories']:
        cats.append(dict(business_id = d['business_id'], category=c))
cats = pd.DataFrame(cats)

In [9]:
cat_lookup = pd.read_csv('/home/russ/Documents/DDL/Data/YelpData/counts_by_category_recat.csv', header=0, usecols=['Category','Count','New'])
cat_lookup.columns = ['OldCategory','Count','Category']

In [6]:
cat_lookup.head(1)


Out[6]:
OldCategory Count Category
0 Restaurants 21892 Restaurants

In [10]:
cat_combined = pd.merge(cat_lookup[['OldCategory','Count']], cat_lookup[['OldCategory','Category']],left_on='OldCategory', right_on='Category')

In [9]:
cat_combined.head(3)


Out[9]:
OldCategory_x Count OldCategory_y Category
0 Restaurants 21892 Restaurants Restaurants
1 Restaurants 21892 Food Restaurants
2 Restaurants 21892 Pizza Restaurants

In [11]:
cat_combined.drop(['OldCategory_x'],axis=1, inplace=True)
cat_combined.columns = ['Count','OldCategory','Category']
cat_combined


Out[11]:
Count OldCategory Category
0 21892 Restaurants Restaurants
1 21892 Food Restaurants
2 21892 Pizza Restaurants
3 21892 Mexican Restaurants
4 21892 American (Traditional) Restaurants
5 21892 Sandwiches Restaurants
6 21892 Coffee & Tea Restaurants
7 21892 Italian Restaurants
8 21892 Chinese Restaurants
9 21892 American (New) Restaurants
10 21892 Burgers Restaurants
11 21892 Breakfast & Brunch Restaurants
12 21892 Specialty Food Restaurants
13 21892 Ice Cream & Frozen Yogurt Restaurants
14 21892 Cafes Restaurants
15 21892 Japanese Restaurants
16 21892 Sushi Bars Restaurants
17 21892 Delis Restaurants
18 21892 Steakhouses Restaurants
19 21892 Seafood Restaurants
20 21892 Mediterranean Restaurants
21 21892 Barbeque Restaurants
22 21892 Thai Restaurants
23 21892 Asian Fusion Restaurants
24 21892 French Restaurants
25 21892 Buffets Restaurants
26 21892 Indian Restaurants
27 21892 Greek Restaurants
28 21892 Juice Bars & Smoothies Restaurants
29 21892 Diners Restaurants
... ... ... ...
731 271 Bike Sharing Transportation
732 229 Caterers Caterers
733 225 Casinos Casinos
734 216 Shipping Centers Shipping Centers
735 210 Party & Event Planning Party & Event Planning
736 206 Yoga Yoga
737 195 Tobacco Shops Tobacco Shops
738 178 Cinema Cinema
739 167 Religious Organizations Religious Organizations
740 167 Churches Religious Organizations
741 167 Hindu Temples Religious Organizations
742 167 Buddhist Temples Religious Organizations
743 167 Synagogues Religious Organizations
744 167 Mosques Religious Organizations
745 165 Photographers Photographers
746 157 Museums Museums
747 156 Nurseries & Gardening Nurseries & Gardening
748 151 Tattoo Tattoo
749 151 Piercing Tattoo
750 151 Permanent Makeup Tattoo
751 151 Tattoo Removal Tattoo
752 132 Breweries Breweries
753 132 Wineries Breweries
754 132 Distilleries Breweries
755 128 Self Storage Self Storage
756 93 Landmarks & Historical Buildings Landmarks & Historical Buildings
757 73 Stadiums & Arenas Stadiums & Arenas
758 70 Dance Studios Dance Studios
759 18 Zoos Zoos
760 18 Aquariums Zoos

761 rows × 3 columns


In [ ]:
#cats = cats.merge(ytract)

#counts_by_geocat.to_csv('counts_by_geocat.csv')

In [12]:
business_category_count = pd.merge (cats[['business_id','category']],cat_combined[['OldCategory','Category','Count']],left_on='category',right_on='OldCategory').sort(['Count','OldCategory'],ascending=[0,1])

In [13]:
business_category_count.drop(['category'],axis=1,inplace=True)

In [14]:
business_category_count


Out[14]:
business_id OldCategory Category Count
171030 0pXk2JkijBpi2Iz0D5pFXA Afghan Restaurants 21892
171031 vt0zrlzlHOcvXZsZ0Y-D_g Afghan Restaurants 21892
171032 s5yYbHcxvHnNyHF7uVNUbg Afghan Restaurants 21892
171033 uR96BDuI8YR7gqbc-Ncg9A Afghan Restaurants 21892
171034 9qi6ZfYzjGuwEzQ_FtBBgw Afghan Restaurants 21892
171035 pvMeUi5Lw2O5vIXg9_EvDg Afghan Restaurants 21892
171036 saON8-nDZi5W64u1xGtmPA Afghan Restaurants 21892
171037 VH2sLFMP8gWQm0skMZqaVw Afghan Restaurants 21892
171038 mFcB-ohmJIi1xuZadKbuww Afghan Restaurants 21892
171039 TZhv0cOt0UHoORlG0I5JHg Afghan Restaurants 21892
171040 dcoG-HYQEAgvFzcBtQb_yg Afghan Restaurants 21892
171041 0IV5L36XeQShzWiUhom9bQ Afghan Restaurants 21892
171042 rRsAuY4e9T5EscglUSRU6w Afghan Restaurants 21892
170917 rgSSCx0tb9AQyCkAvMSIoQ African Restaurants 21892
170918 5fqbCefPHwNrCzuxnhCHgg African Restaurants 21892
170919 9nC58L8jwheiihZfc3poJA African Restaurants 21892
170920 ShObkipoAQV3I_vn_f-mDg African Restaurants 21892
170921 -3hB9COlwRVhTf1PLD7sUw African Restaurants 21892
170922 vNL25b0LJj38Z2_H1aT3bA African Restaurants 21892
170923 i2IHXrDgGumZ32MdZLIgaA African Restaurants 21892
170924 jgUtE_lFxqhMcG4ubWCQLg African Restaurants 21892
170925 7wzVfsgF47C7fUunqGdahA African Restaurants 21892
170926 8w7Q0cSYssJGDI9WVxJB_w African Restaurants 21892
170927 UACSWwOwCFg58ycvb32-RQ African Restaurants 21892
170928 Xk5iznUPktX7wx3dlRVWUA African Restaurants 21892
170929 W0-FuAN2yFAsJf4FPB9J-Q African Restaurants 21892
170930 3Dk5qrbYaP7lCYyjmo0JRg African Restaurants 21892
170931 BPmeW2bZBY4aQbu_E4Co8w African Restaurants 21892
170932 gNyaalWR6m06rig3c7h0bQ African Restaurants 21892
170933 lTvW5TcIVIVNmE4_LYQ4eA African Restaurants 21892
... ... ... ... ...
170231 eHTykuLzs6DiDyw7t2g6DQ Aquariums Zoos 18
170232 NPDlIsGY50IfLx4fO95vCg Aquariums Zoos 18
170233 NBFu9Ia-KzpsByWtoB6JhA Aquariums Zoos 18
170234 s6DB2eqwmzV6sz_dNMyZBg Aquariums Zoos 18
170235 YEBetzOA7seX-jtiCTrdJA Aquariums Zoos 18
170236 4hmmTmWdC2d7kXhCmWZ8kg Aquariums Zoos 18
170237 GfebULgs3_mmwvPSSehxog Aquariums Zoos 18
170238 18_OGahBYBWbXYLeI0jzzw Aquariums Zoos 18
170239 J-aSZ9bF66K29siVCwZXMQ Aquariums Zoos 18
170240 LByAiRSiU7zRLBREskpIrg Aquariums Zoos 18
170241 yPcxbIJ-sdzxcrjPFvi3Sw Aquariums Zoos 18
170242 yZSeLYxsxYM3tobk2NLDkw Aquariums Zoos 18
164938 1Ap6ZNCvyLLKHP0wvCk9yA Zoos Zoos 18
164939 kEsKrIJ6M1KOb8UALrpP7g Zoos Zoos 18
164940 m0R4GViXeLOWuOlzoCv3OQ Zoos Zoos 18
164941 eHTykuLzs6DiDyw7t2g6DQ Zoos Zoos 18
164942 wTVgDIHonZITOqrFfVZ0gg Zoos Zoos 18
164943 QpglndObav8JRGqA-HkupA Zoos Zoos 18
164944 eMnFm6h_rzraY1OflB-npw Zoos Zoos 18
164945 s6DB2eqwmzV6sz_dNMyZBg Zoos Zoos 18
164946 hvRndaCCYmhdemoNMraSqA Zoos Zoos 18
164947 E2xH3yV4Esp2aQElwqDTbg Zoos Zoos 18
164948 s4TOMVsHoQxmrx1mEzTgaA Zoos Zoos 18
164949 hyCBns-n8V1N2CSgSt-ogg Zoos Zoos 18
164950 QcCrY-qvQatWpJaHIlhX9Q Zoos Zoos 18
164951 972NN0GHeKb-KP2HQpS-Cg Zoos Zoos 18
164952 uXticN42QnpqzoRB1Htc1Q Zoos Zoos 18
164953 M99oA5-pXyqQCAPgiPMuxA Zoos Zoos 18
164954 16xuPTYUnhopeb-4k209iw Zoos Zoos 18
164955 mVwUmftw4KcmLepuCyBosA Zoos Zoos 18

172603 rows × 4 columns


In [171]:
#business_grouping = business_category_count[['business_id','Category','Count']].groupby(['business_id','Category']).max()

In [182]:
#business_grouping

In [74]:
business_category_count[business_category_count['business_id'] == '--1emggGHgoG6ipd_RMb-g']


Out[74]:
business_id OldCategory Category Count
67077 --1emggGHgoG6ipd_RMb-g Food Restaurants 21892
153290 --1emggGHgoG6ipd_RMb-g Convenience Stores Convenience Stores 578

In [16]:
business_category_count.groupby('business_id').apply(lambda t: t[t.Count==t.Count.max()])


Out[16]:
business_id OldCategory Category Count
business_id
--1emggGHgoG6ipd_RMb-g 67077 --1emggGHgoG6ipd_RMb-g Food Restaurants 21892
--4Pe8BZ6gj57VFL5mUE8g 139801 --4Pe8BZ6gj57VFL5mUE8g Office Equipment Shopping 8919
16332 --4Pe8BZ6gj57VFL5mUE8g Shopping Shopping 8919
--5jkZ3-nUPZxUvtcbr8Uw 136920 --5jkZ3-nUPZxUvtcbr8Uw Greek Restaurants 21892
131648 --5jkZ3-nUPZxUvtcbr8Uw Mediterranean Restaurants 21892
45339 --5jkZ3-nUPZxUvtcbr8Uw Restaurants Restaurants 21892
--7PRjnsjMA6uhPK8mW13Q 4337 --7PRjnsjMA6uhPK8mW13Q Nightlife Nightlife 4340
--BlvDO_RG2yElKu9XA1_g 81852 --BlvDO_RG2yElKu9XA1_g Chinese Restaurants 21892
166938 --BlvDO_RG2yElKu9XA1_g Hawaiian Restaurants 21892
44644 --BlvDO_RG2yElKu9XA1_g Restaurants Restaurants 21892
137660 --BlvDO_RG2yElKu9XA1_g Sushi Bars Restaurants 21892
--Dl2rW_xO8GuYBomlg9zw 3120 --Dl2rW_xO8GuYBomlg9zw Health & Medical Health & Medical 3213
161866 --Dl2rW_xO8GuYBomlg9zw Medical Centers Health & Medical 3213
--NCcmQqh_NZbjW1S5XsGQ 66086 --NCcmQqh_NZbjW1S5XsGQ Food Restaurants 21892
--Ol5mVSMaW8ExtmWRUmKA 114567 --Ol5mVSMaW8ExtmWRUmKA Beauty & Spas Beauty & Spas 4738
--UE_y6auTgq3FXlvUMkbw 134041 --UE_y6auTgq3FXlvUMkbw Diners Restaurants 21892
29725 --UE_y6auTgq3FXlvUMkbw Restaurants Restaurants 21892
--WLj8iK9DUKxkPnMHAoeA 74053 --WLj8iK9DUKxkPnMHAoeA Event Planning & Services Event Planning & Services 2467
--XBxRlD92RaV6TyUnP8Ow 13386 --XBxRlD92RaV6TyUnP8Ow Shopping Shopping 8919
--Y_2lDOtVDioX5bwF6GIw 144256 --Y_2lDOtVDioX5bwF6GIw Comfort Food Restaurants 21892
47846 --Y_2lDOtVDioX5bwF6GIw Restaurants Restaurants 21892
--jFTZmywe7StuZ2hEjxyA 36461 --jFTZmywe7StuZ2hEjxyA Restaurants Restaurants 21892
103601 --jFTZmywe7StuZ2hEjxyA Sandwiches Restaurants 21892
--m1g9P1wxNblrLANfVqlA 16152 --m1g9P1wxNblrLANfVqlA Shopping Shopping 8919
--nQiUBxtpjd_ZBuO_PH6w 165054 --nQiUBxtpjd_ZBuO_PH6w Child Care & Day Care Local Services 2144
79295 --nQiUBxtpjd_ZBuO_PH6w Local Services Local Services 2144
--pOlFxITWnhzc7SHSIP0A 29288 --pOlFxITWnhzc7SHSIP0A American (New) Restaurants 21892
48371 --pOlFxITWnhzc7SHSIP0A Restaurants Restaurants 21892
--qeSYxyn62mMjWvznNTdg 81286 --qeSYxyn62mMjWvznNTdg Chinese Restaurants 21892
35768 --qeSYxyn62mMjWvznNTdg Restaurants Restaurants 21892
... ... ... ... ... ...
zzUUj9BUNn0_0F5hwy7RRw 81744 zzUUj9BUNn0_0F5hwy7RRw Chinese Restaurants 21892
42846 zzUUj9BUNn0_0F5hwy7RRw Restaurants Restaurants 21892
zzW_Gu-O5UpieKKp0Vd_bQ 108872 zzW_Gu-O5UpieKKp0Vd_bQ Children's Clothing Shopping 8919
12593 zzW_Gu-O5UpieKKp0Vd_bQ Shopping Shopping 8919
zzZizlQn7Wfi-tFG9OiJ9w 74369 zzZizlQn7Wfi-tFG9OiJ9w Event Planning & Services Event Planning & Services 2467
zze1eI588TqdJLs8xOzMxw 64397 zze1eI588TqdJLs8xOzMxw Food Restaurants 21892
zzgXEteMduykqN8JAUG4CA 29137 zzgXEteMduykqN8JAUG4CA American (New) Restaurants 21892
89814 zzgXEteMduykqN8JAUG4CA Coffee & Tea Restaurants 21892
66555 zzgXEteMduykqN8JAUG4CA Food Restaurants 21892
46734 zzgXEteMduykqN8JAUG4CA Restaurants Restaurants 21892
zzhSCOVTZ7MUKOAfwT-Bsw 11596 zzhSCOVTZ7MUKOAfwT-Bsw Shopping Shopping 8919
zzhykRiQh2FyrYEPMfBw0A 156452 zzhykRiQh2FyrYEPMfBw0A Baby Gear & Furniture Shopping 8919
108944 zzhykRiQh2FyrYEPMfBw0A Children's Clothing Shopping 8919
18107 zzhykRiQh2FyrYEPMfBw0A Shopping Shopping 8919
zzhzW3DImwhSwpcK1Yuy7Q 60067 zzhzW3DImwhSwpcK1Yuy7Q Breakfast & Brunch Restaurants 21892
89383 zzhzW3DImwhSwpcK1Yuy7Q Coffee & Tea Restaurants 21892
65022 zzhzW3DImwhSwpcK1Yuy7Q Food Restaurants 21892
43030 zzhzW3DImwhSwpcK1Yuy7Q Restaurants Restaurants 21892
zzlpdQ2GPcXj7mo0lut3Vg 150444 zzlpdQ2GPcXj7mo0lut3Vg Gift Shops Shopping 8919
17975 zzlpdQ2GPcXj7mo0lut3Vg Shopping Shopping 8919
zzrm5HEoYKEsfdi8XxSXuQ 388 zzrm5HEoYKEsfdi8XxSXuQ Doctors Health & Medical 3213
160718 zzrm5HEoYKEsfdi8XxSXuQ Family Practice Health & Medical 3213
2015 zzrm5HEoYKEsfdi8XxSXuQ Health & Medical Health & Medical 3213
zzs-gtrsGyu2_o8L6vpaIQ 79546 zzs-gtrsGyu2_o8L6vpaIQ Dentists Health & Medical 3213
80286 zzs-gtrsGyu2_o8L6vpaIQ General Dentistry Health & Medical 3213
1296 zzs-gtrsGyu2_o8L6vpaIQ Health & Medical Health & Medical 3213
zztd3r04hvMTZh3ol7VC4g 69800 zztd3r04hvMTZh3ol7VC4g Cafes Restaurants 21892
38764 zztd3r04hvMTZh3ol7VC4g Restaurants Restaurants 21892
zzucnDH33KlmbkI4rLyQTA 11755 zzucnDH33KlmbkI4rLyQTA Shopping Shopping 8919
109004 zzucnDH33KlmbkI4rLyQTA Sporting Goods Shopping 8919

118451 rows × 4 columns


In [17]:
#Get record that has the Max Count of each business_id group
business_new_cat = business_category_count[['business_id','Category','Count']].groupby('business_id').apply(lambda t: t[t.Count==t.Count.max()])

In [18]:
#Drop Duplicates
business_new_cat[['business_id','Category']].drop_duplicates()


Out[18]:
business_id Category
business_id
--1emggGHgoG6ipd_RMb-g 67077 --1emggGHgoG6ipd_RMb-g Restaurants
--4Pe8BZ6gj57VFL5mUE8g 139801 --4Pe8BZ6gj57VFL5mUE8g Shopping
--5jkZ3-nUPZxUvtcbr8Uw 136920 --5jkZ3-nUPZxUvtcbr8Uw Restaurants
--7PRjnsjMA6uhPK8mW13Q 4337 --7PRjnsjMA6uhPK8mW13Q Nightlife
--BlvDO_RG2yElKu9XA1_g 81852 --BlvDO_RG2yElKu9XA1_g Restaurants
--Dl2rW_xO8GuYBomlg9zw 3120 --Dl2rW_xO8GuYBomlg9zw Health & Medical
--NCcmQqh_NZbjW1S5XsGQ 66086 --NCcmQqh_NZbjW1S5XsGQ Restaurants
--Ol5mVSMaW8ExtmWRUmKA 114567 --Ol5mVSMaW8ExtmWRUmKA Beauty & Spas
--UE_y6auTgq3FXlvUMkbw 134041 --UE_y6auTgq3FXlvUMkbw Restaurants
--WLj8iK9DUKxkPnMHAoeA 74053 --WLj8iK9DUKxkPnMHAoeA Event Planning & Services
--XBxRlD92RaV6TyUnP8Ow 13386 --XBxRlD92RaV6TyUnP8Ow Shopping
--Y_2lDOtVDioX5bwF6GIw 144256 --Y_2lDOtVDioX5bwF6GIw Restaurants
--jFTZmywe7StuZ2hEjxyA 36461 --jFTZmywe7StuZ2hEjxyA Restaurants
--m1g9P1wxNblrLANfVqlA 16152 --m1g9P1wxNblrLANfVqlA Shopping
--nQiUBxtpjd_ZBuO_PH6w 165054 --nQiUBxtpjd_ZBuO_PH6w Local Services
--pOlFxITWnhzc7SHSIP0A 29288 --pOlFxITWnhzc7SHSIP0A Restaurants
--qeSYxyn62mMjWvznNTdg 81286 --qeSYxyn62mMjWvznNTdg Restaurants
--sSW-WY3vyASh_eVPGUAw 20646 --sSW-WY3vyASh_eVPGUAw Home Services
--zgHBiQpr8H2ZqSdGmguQ 157045 --zgHBiQpr8H2ZqSdGmguQ Restaurants
-024YEtnIsPQCrMSHCKLQw 69760 -024YEtnIsPQCrMSHCKLQw Restaurants
-05qMWbhAtrD6EiV-UElPg 23924 -05qMWbhAtrD6EiV-UElPg Professional Services
-0Ackw6MF82PXO9f9Jh_Kg 96921 -0Ackw6MF82PXO9f9Jh_Kg Performing Arts
-0D_CYhlD2ILkmLR0pBmnA 65990 -0D_CYhlD2ILkmLR0pBmnA Restaurants
-0GkcDiIgVm0XzDZC8RFOg 51915 -0GkcDiIgVm0XzDZC8RFOg Restaurants
-0HGqwlfw3I8nkJyMHxAsQ 58031 -0HGqwlfw3I8nkJyMHxAsQ Restaurants
-0QBrNvhrPQCaeo7mTo0zQ 33042 -0QBrNvhrPQCaeo7mTo0zQ Restaurants
-0VK5Z1BfUHUYq4PoBYNLw 7681 -0VK5Z1BfUHUYq4PoBYNLw Nightlife
-0ZDRoepf3gwd9fpsw1bRQ 113956 -0ZDRoepf3gwd9fpsw1bRQ Beauty & Spas
-0bUDim5OGuv8R0Qqq6J4A 59609 -0bUDim5OGuv8R0Qqq6J4A Restaurants
-0bl9EllYlei__4dl1W00Q 60098 -0bl9EllYlei__4dl1W00Q Restaurants
... ... ... ...
zxzoq5E2-r4e26kXgoierg 85783 zxzoq5E2-r4e26kXgoierg Restaurants
zyCrtvCPdSZDhUrH86vbeA 96900 zyCrtvCPdSZDhUrH86vbeA Performing Arts
zyD03W9T-raaNmoEdodFQA 20120 zyD03W9T-raaNmoEdodFQA Shopping
zyFBiG_6CkXnGOQwLjD4sg 9973 zyFBiG_6CkXnGOQwLjD4sg Active Life
zyFZme7L_6kfvex2sgPfCA 64092 zyFZme7L_6kfvex2sgPfCA Restaurants
zyI8_HQPEwkJ22WMb9ILVQ 53231 zyI8_HQPEwkJ22WMb9ILVQ Restaurants
zyIELEt6zTdBvGgxReq86A 67828 zyIELEt6zTdBvGgxReq86A Restaurants
zyV7n8eXVTAtJrryRswHvA 67526 zyV7n8eXVTAtJrryRswHvA Restaurants
zyYRWnimLbQt9aCCI_2MZw 113804 zyYRWnimLbQt9aCCI_2MZw Beauty & Spas
zyZ883vct64bKzNv4wy6Yg 78310 zyZ883vct64bKzNv4wy6Yg Local Services
zyjyDSdyw0eUqxAWnM_fdA 2806 zyjyDSdyw0eUqxAWnM_fdA Health & Medical
zypmytGA92yk45AYn60ZHw 24326 zypmytGA92yk45AYn60ZHw Shopping
zyvujnBGGpG3WWiDPkil9w 85761 zyvujnBGGpG3WWiDPkil9w Restaurants
zz4HioU_qObV2kuOsIMUhw 88506 zz4HioU_qObV2kuOsIMUhw Restaurants
zz5eC4ux3D8AOrKKYCU0YA 12903 zz5eC4ux3D8AOrKKYCU0YA Shopping
zzIP8b0fpMEacKp5KzuLrA 18457 zzIP8b0fpMEacKp5KzuLrA Shopping
zzK7j06Pg8YWkQHzgsTuQA 7386 zzK7j06Pg8YWkQHzgsTuQA Nightlife
zzUUj9BUNn0_0F5hwy7RRw 81744 zzUUj9BUNn0_0F5hwy7RRw Restaurants
zzW_Gu-O5UpieKKp0Vd_bQ 108872 zzW_Gu-O5UpieKKp0Vd_bQ Shopping
zzZizlQn7Wfi-tFG9OiJ9w 74369 zzZizlQn7Wfi-tFG9OiJ9w Event Planning & Services
zze1eI588TqdJLs8xOzMxw 64397 zze1eI588TqdJLs8xOzMxw Restaurants
zzgXEteMduykqN8JAUG4CA 29137 zzgXEteMduykqN8JAUG4CA Restaurants
zzhSCOVTZ7MUKOAfwT-Bsw 11596 zzhSCOVTZ7MUKOAfwT-Bsw Shopping
zzhykRiQh2FyrYEPMfBw0A 156452 zzhykRiQh2FyrYEPMfBw0A Shopping
zzhzW3DImwhSwpcK1Yuy7Q 60067 zzhzW3DImwhSwpcK1Yuy7Q Restaurants
zzlpdQ2GPcXj7mo0lut3Vg 150444 zzlpdQ2GPcXj7mo0lut3Vg Shopping
zzrm5HEoYKEsfdi8XxSXuQ 388 zzrm5HEoYKEsfdi8XxSXuQ Health & Medical
zzs-gtrsGyu2_o8L6vpaIQ 79546 zzs-gtrsGyu2_o8L6vpaIQ Health & Medical
zztd3r04hvMTZh3ol7VC4g 69800 zztd3r04hvMTZh3ol7VC4g Restaurants
zzucnDH33KlmbkI4rLyQTA 11755 zzucnDH33KlmbkI4rLyQTA Shopping

60667 rows × 2 columns


In [19]:
business_cat_dedup = business_new_cat[['business_id','Category']].drop_duplicates()

In [20]:
business_cat_dedup[business_cat_dedup['business_id'] == 'rgSSCx0tb9AQyCkAvMSIoQ']


Out[20]:
business_id Category
business_id
rgSSCx0tb9AQyCkAvMSIoQ 170917 rgSSCx0tb9AQyCkAvMSIoQ Restaurants

In [77]:
#TESTING UNSTACK
business_cat_dedup.unstack(level=0)


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-77-d8f861a8bd0b> in <module>()
----> 1 business_cat_dedup.unstack(level=0)

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in unstack(self, level)
   3603         """
   3604         from pandas.core.reshape import unstack
-> 3605         return unstack(self, level)
   3606 
   3607     #----------------------------------------------------------------------

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in unstack(obj, level)
    396     if isinstance(obj, DataFrame):
    397         if isinstance(obj.index, MultiIndex):
--> 398             return _unstack_frame(obj, level)
    399         else:
    400             return obj.T.stack(dropna=False)

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in _unstack_frame(obj, level)
    436     else:
    437         unstacker = _Unstacker(obj.values, obj.index, level=level,
--> 438                                value_columns=obj.columns)
    439         return unstacker.get_result()
    440 

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in __init__(self, values, index, level, value_columns)
     96 
     97         self._make_sorted_values_labels()
---> 98         self._make_selectors()
     99 
    100     def _make_sorted_values_labels(self):

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in _make_selectors(self)
    130 
    131         selector = self.sorted_labels[-1] + stride * comp_index + self.lift
--> 132         mask = np.zeros(np.prod(self.full_shape), dtype=bool)
    133         mask.put(selector, True)
    134 

MemoryError: 

In [21]:
business = business.merge(ytract)

In [23]:
business = business.merge(business_cat_dedup)

In [25]:
business.head(1)


Out[25]:
attributes business_id categories city full_address hours latitude longitude name neighborhoods open review_count stars state type GISJOIN Category
0 {u'By Appointment Only': True} vcNAWiLM4dR7D2nwwJ7nCA [Doctors, Health & Medical] Phoenix 4840 E Indian School Rd\nSte 101\nPhoenix, AZ ... {u'Thursday': {u'close': u'17:00', u'open': u'... 33.499313 -111.983758 Eric Goldberg, MD [] True 9 3.5 AZ business G0400130108200 Health & Medical

In [26]:
cat_count = business[['business_id','Category']].groupby(['Category']).count()

In [27]:
cat_count


Out[27]:
business_id
Category
Active Life 1952
Arts & Crafts 89
Automotive 2889
Bars 6
Beauty & Spas 4048
Education 255
Event Planning & Services 1418
Financial Services 407
Health & Medical 2519
Home Services 2759
Hotels 306
Local Services 1968
Nightlife 1520
Performing Arts 589
Pets 1080
Professional Services 107
Public Services & Government 100
Religious Organizations 138
Restaurants 22828
Shopping 7443

In [28]:
cat_count.sort('business_id',ascending=False).plot(kind='bar')


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f13f3f43310>

In [29]:
business.head(1)


Out[29]:
attributes business_id categories city full_address hours latitude longitude name neighborhoods open review_count stars state type GISJOIN Category
0 {u'By Appointment Only': True} vcNAWiLM4dR7D2nwwJ7nCA [Doctors, Health & Medical] Phoenix 4840 E Indian School Rd\nSte 101\nPhoenix, AZ ... {u'Thursday': {u'close': u'17:00', u'open': u'... 33.499313 -111.983758 Eric Goldberg, MD [] True 9 3.5 AZ business G0400130108200 Health & Medical

In [78]:
reviews.head(1)


Out[78]:
business_id date review_id stars text type user_id votes
0 vcNAWiLM4dR7D2nwwJ7nCA 2007-05-17 15SdjuK7DmYqUAj6rjGowg 5 dr. goldberg offers everything i look for in a... review Xqd0DzHaiyRqVH3WRG7hzg {u'funny': 0, u'useful': 2, u'cool': 1}

In [30]:
business[['business_id','GISJOIN','Category','open']].merge(reviews[['business_id', 'date','stars','useful']])


Out[30]:
business_id GISJOIN Category open date stars useful
0 vcNAWiLM4dR7D2nwwJ7nCA G0400130108200 Health & Medical True 2007-05-17 5 2
1 vcNAWiLM4dR7D2nwwJ7nCA G0400130108200 Health & Medical True 2010-03-22 2 2
2 vcNAWiLM4dR7D2nwwJ7nCA G0400130108200 Health & Medical True 2012-02-14 4 1
3 vcNAWiLM4dR7D2nwwJ7nCA G0400130108200 Health & Medical True 2012-03-02 4 0
4 vcNAWiLM4dR7D2nwwJ7nCA G0400130108200 Health & Medical True 2012-05-15 4 2
5 vcNAWiLM4dR7D2nwwJ7nCA G0400130108200 Health & Medical True 2013-04-19 1 0
6 vcNAWiLM4dR7D2nwwJ7nCA G0400130108200 Health & Medical True 2014-01-02 5 0
7 vcNAWiLM4dR7D2nwwJ7nCA G0400130108200 Health & Medical True 2014-01-08 5 0
8 vcNAWiLM4dR7D2nwwJ7nCA G0400130108200 Health & Medical True 2014-08-01 1 1
9 vcNAWiLM4dR7D2nwwJ7nCA G0400130108200 Health & Medical True 2014-12-12 5 0
10 UsFtqoBl7naz8AVUBZMjQQ G4200030487000 Nightlife True 2013-11-08 5 0
11 UsFtqoBl7naz8AVUBZMjQQ G4200030487000 Nightlife True 2014-03-29 3 0
12 UsFtqoBl7naz8AVUBZMjQQ G4200030487000 Nightlife True 2014-10-29 2 0
13 UsFtqoBl7naz8AVUBZMjQQ G4200030487000 Nightlife True 2014-11-28 4 1
14 cE27W9VPgO88Qxe4ol6y_g G4200030475401 Shopping False 2012-08-19 3 1
15 cE27W9VPgO88Qxe4ol6y_g G4200030475401 Shopping False 2013-04-18 1 1
16 cE27W9VPgO88Qxe4ol6y_g G4200030475401 Shopping False 2013-07-14 1 0
17 cE27W9VPgO88Qxe4ol6y_g G4200030475401 Shopping False 2013-08-16 4 0
18 cE27W9VPgO88Qxe4ol6y_g G4200030475401 Shopping False 2014-07-11 4 0
19 HZdLhv6COCleJMo7nPl-RA G4200030474101 Shopping True 2013-06-10 5 7
20 HZdLhv6COCleJMo7nPl-RA G4200030474101 Shopping True 2013-12-23 2 1
21 HZdLhv6COCleJMo7nPl-RA G4200030474101 Shopping True 2014-09-04 4 0
22 mVHrayjG3uZ_RLHkLj-AMg G4200030512900 Restaurants True 2012-12-01 5 5
23 mVHrayjG3uZ_RLHkLj-AMg G4200030512900 Restaurants True 2013-03-15 5 0
24 mVHrayjG3uZ_RLHkLj-AMg G4200030512900 Restaurants True 2013-03-30 5 2
25 mVHrayjG3uZ_RLHkLj-AMg G4200030512900 Restaurants True 2013-10-20 4 1
26 mVHrayjG3uZ_RLHkLj-AMg G4200030512900 Restaurants True 2013-11-07 5 0
27 mVHrayjG3uZ_RLHkLj-AMg G4200030512900 Restaurants True 2014-03-22 5 2
28 mVHrayjG3uZ_RLHkLj-AMg G4200030512900 Restaurants True 2014-09-29 4 0
29 mVHrayjG3uZ_RLHkLj-AMg G4200030512900 Restaurants True 2014-09-29 5 0
... ... ... ... ... ... ... ...
1488598 7gmzKhAr6-A0DCGT4HN-ig G3200030006700 Restaurants True 2014-06-11 4 0
1488599 7gmzKhAr6-A0DCGT4HN-ig G3200030006700 Restaurants True 2014-12-24 4 0
1488600 7gmzKhAr6-A0DCGT4HN-ig G3200030006700 Restaurants True 2014-12-24 4 4
1488601 7gmzKhAr6-A0DCGT4HN-ig G3200030006700 Restaurants True 2014-12-28 2 0
1488602 7gmzKhAr6-A0DCGT4HN-ig G3200030006700 Restaurants True 2014-12-31 1 1
1488603 7gmzKhAr6-A0DCGT4HN-ig G3200030006700 Restaurants True 2015-01-01 4 0
1488604 7gmzKhAr6-A0DCGT4HN-ig G3200030006700 Restaurants True 2015-01-04 5 0
1488605 7gmzKhAr6-A0DCGT4HN-ig G3200030006700 Restaurants True 2015-01-05 5 0
1488606 7gmzKhAr6-A0DCGT4HN-ig G3200030006700 Restaurants True 2015-01-06 4 0
1488607 7gmzKhAr6-A0DCGT4HN-ig G3200030006700 Restaurants True 2015-01-08 2 0
1488608 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2012-10-02 5 3
1488609 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2013-02-05 5 1
1488610 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2013-03-22 5 1
1488611 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2013-04-03 5 1
1488612 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2013-06-10 5 0
1488613 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2013-06-13 5 0
1488614 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2013-06-18 5 1
1488615 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2013-07-12 5 0
1488616 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2013-09-22 5 0
1488617 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2013-11-19 5 0
1488618 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2014-01-15 5 0
1488619 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2014-03-26 5 1
1488620 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2014-03-28 5 0
1488621 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2014-04-01 5 0
1488622 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2014-04-04 1 2
1488623 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2014-04-24 5 0
1488624 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2014-04-29 1 2
1488625 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2014-05-14 5 0
1488626 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2014-06-03 5 0
1488627 6TPxhpHqFedjMvBuw6pF3w G3200030005822 Home Services True 2014-06-25 1 8

1488628 rows × 7 columns


In [31]:
geo_review = business[['business_id','GISJOIN','Category','open']].merge(reviews[['business_id', 'date','stars','useful']])

In [ ]:
#review_cats = reviews[['business_id', 'stars', 'useful']].merge(cats)

In [37]:
business_review_years = geo_review[['business_id','date']].groupby('business_id').agg({'date': [np.min,np.max]})

In [69]:
business_review_years


Out[69]:
date years
amin amax
business_id
--1emggGHgoG6ipd_RMb-g 2013-03-30 2014-06-13 NaN
--4Pe8BZ6gj57VFL5mUE8g 2009-01-13 2014-07-31 NaN
--5jkZ3-nUPZxUvtcbr8Uw 2011-07-31 2014-12-27 NaN
--7PRjnsjMA6uhPK8mW13Q 2010-10-22 2013-11-17 NaN
--BlvDO_RG2yElKu9XA1_g 2011-02-13 2014-12-04 NaN
--Dl2rW_xO8GuYBomlg9zw 2010-05-04 2012-10-31 NaN
--Ol5mVSMaW8ExtmWRUmKA 2010-04-10 2014-11-17 NaN
--UE_y6auTgq3FXlvUMkbw 2007-09-10 2015-01-08 NaN
--WLj8iK9DUKxkPnMHAoeA 2013-02-25 2014-11-18 NaN
--XBxRlD92RaV6TyUnP8Ow 2009-04-22 2014-09-27 NaN
--Y_2lDOtVDioX5bwF6GIw 2012-12-06 2014-11-20 NaN
--jFTZmywe7StuZ2hEjxyA 2012-10-24 2014-08-11 NaN
--nQiUBxtpjd_ZBuO_PH6w 2014-09-25 2014-11-05 NaN
--pOlFxITWnhzc7SHSIP0A 2013-03-18 2015-01-08 NaN
--qeSYxyn62mMjWvznNTdg 2011-05-14 2013-11-02 NaN
--sSW-WY3vyASh_eVPGUAw 2010-09-09 2014-04-30 NaN
--zgHBiQpr8H2ZqSdGmguQ 2009-04-10 2011-12-11 NaN
-05qMWbhAtrD6EiV-UElPg 2012-03-24 2014-07-31 NaN
-0Ackw6MF82PXO9f9Jh_Kg 2009-05-24 2014-03-24 NaN
-0D_CYhlD2ILkmLR0pBmnA 2011-07-29 2014-06-06 NaN
-0GkcDiIgVm0XzDZC8RFOg 2006-04-21 2014-11-30 NaN
-0HGqwlfw3I8nkJyMHxAsQ 2014-02-10 2014-12-01 NaN
-0QBrNvhrPQCaeo7mTo0zQ 2011-06-26 2012-11-29 NaN
-0VK5Z1BfUHUYq4PoBYNLw 2013-11-08 2014-10-24 NaN
-0ZDRoepf3gwd9fpsw1bRQ 2009-05-07 2014-12-06 NaN
-0bUDim5OGuv8R0Qqq6J4A 2010-03-16 2012-12-03 NaN
-0lOuL7RkZQnjAl96dXTvA 2014-05-16 2014-12-16 NaN
-0lRv242OzyPUB8c5E_UhQ 2011-06-07 2014-03-13 NaN
-0xwm_AwS0d_5rwUprHGuw 2008-07-04 2014-11-16 NaN
-1B-DEGkLE1kDj5ENAF2NQ 2011-06-23 2015-01-04 NaN
... ... ... ...
zxjMiqlfUuIQRltSuj9FpQ 2009-05-13 2014-12-19 NaN
zxo9nZwhlW1ZM7OUJP_1ag 2011-07-25 2014-10-16 NaN
zxodsuIjtv3kx8mNa3sptQ 2008-08-23 2014-12-27 NaN
zxqvU415r_RtZRKDtdbIKQ 2006-01-11 2015-01-07 NaN
zxviVY0ypMZMkZDfE0Ffqw 2012-09-11 2015-01-05 NaN
zxzoq5E2-r4e26kXgoierg 2014-01-24 2014-12-31 NaN
zyCrtvCPdSZDhUrH86vbeA 2009-03-03 2014-07-31 NaN
zyD03W9T-raaNmoEdodFQA 2014-06-08 2014-06-19 NaN
zyFZme7L_6kfvex2sgPfCA 2010-12-23 2011-09-15 NaN
zyI8_HQPEwkJ22WMb9ILVQ 2007-07-31 2014-12-01 NaN
zyIELEt6zTdBvGgxReq86A 2013-12-01 2014-06-08 NaN
zyV7n8eXVTAtJrryRswHvA 2013-07-03 2015-01-06 NaN
zyYRWnimLbQt9aCCI_2MZw 2010-04-02 2012-10-15 NaN
zyZ883vct64bKzNv4wy6Yg 2009-10-01 2014-12-30 NaN
zyjyDSdyw0eUqxAWnM_fdA 2013-04-15 2013-09-17 NaN
zypmytGA92yk45AYn60ZHw 2011-05-22 2014-08-12 NaN
zyvujnBGGpG3WWiDPkil9w 2013-11-21 2015-01-02 NaN
zz4HioU_qObV2kuOsIMUhw 2011-03-02 2014-03-10 NaN
zz5eC4ux3D8AOrKKYCU0YA 2010-05-22 2014-12-07 NaN
zzIP8b0fpMEacKp5KzuLrA 2011-07-05 2014-11-09 NaN
zzK7j06Pg8YWkQHzgsTuQA 2011-10-08 2014-11-30 NaN
zzW_Gu-O5UpieKKp0Vd_bQ 2012-10-14 2014-09-22 NaN
zzZizlQn7Wfi-tFG9OiJ9w 2011-02-26 2011-05-22 NaN
zze1eI588TqdJLs8xOzMxw 2011-04-05 2012-01-30 NaN
zzgXEteMduykqN8JAUG4CA 2012-12-09 2014-12-30 NaN
zzhSCOVTZ7MUKOAfwT-Bsw 2011-04-03 2013-05-01 NaN
zzhykRiQh2FyrYEPMfBw0A 2011-05-29 2014-10-13 NaN
zzrm5HEoYKEsfdi8XxSXuQ 2010-10-12 2014-06-16 NaN
zzs-gtrsGyu2_o8L6vpaIQ 2010-11-04 2014-11-06 NaN
zzucnDH33KlmbkI4rLyQTA 2008-11-05 2014-12-29 NaN

52392 rows × 3 columns


In [ ]:
review_cats['wstars'] = review_cats.stars * (review_cats.useful + 1)

In [73]:
#reviews['useful'] = reviews['votes'].apply(lambda v: v['useful'])
business_review_years['years'] = business_review_years[['date']].stack(level=0)[['amax']]
#business_review_years[['date']].stack(level=0)[['amin','amax']]
#business_review_years.stack(level=0)[['amin']]


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-73-5d1199b9217a> in <module>()
      1 #reviews['useful'] = reviews['votes'].apply(lambda v: v['useful'])
----> 2 business_review_years['years'] = business_review_years[['date']].stack(level=0)[['amax']]
      3 #business_review_years[['date']].stack(level=0)[['amin','amax']]
      4 #business_review_years.stack(level=0)[['amin']]

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in __setitem__(self, key, value)
   2125         else:
   2126             # set column
-> 2127             self._set_item(key, value)
   2128 
   2129     def _setitem_slice(self, key, value):

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _set_item(self, key, value)
   2202 
   2203         self._ensure_valid_index(value)
-> 2204         value = self._sanitize_column(key, value)
   2205         NDFrame._set_item(self, key, value)
   2206 

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _sanitize_column(self, key, value)
   2351                         value = value.reindex_axis(cols, axis=1)
   2352             # now align rows
-> 2353             value = reindexer(value).T
   2354 
   2355         elif isinstance(value, Categorical):

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in reindexer(value)
   2334 
   2335                     # other
-> 2336                     raise TypeError('incompatible index of inserted column '
   2337                                     'with frame index')
   2338             return value

TypeError: incompatible index of inserted column with frame index

In [66]:
business_review_years


Out[66]:
date years
amin amax
business_id
--1emggGHgoG6ipd_RMb-g 2013-03-30 2014-06-13 NaN
--4Pe8BZ6gj57VFL5mUE8g 2009-01-13 2014-07-31 NaN
--5jkZ3-nUPZxUvtcbr8Uw 2011-07-31 2014-12-27 NaN
--7PRjnsjMA6uhPK8mW13Q 2010-10-22 2013-11-17 NaN
--BlvDO_RG2yElKu9XA1_g 2011-02-13 2014-12-04 NaN
--Dl2rW_xO8GuYBomlg9zw 2010-05-04 2012-10-31 NaN
--Ol5mVSMaW8ExtmWRUmKA 2010-04-10 2014-11-17 NaN
--UE_y6auTgq3FXlvUMkbw 2007-09-10 2015-01-08 NaN
--WLj8iK9DUKxkPnMHAoeA 2013-02-25 2014-11-18 NaN
--XBxRlD92RaV6TyUnP8Ow 2009-04-22 2014-09-27 NaN
--Y_2lDOtVDioX5bwF6GIw 2012-12-06 2014-11-20 NaN
--jFTZmywe7StuZ2hEjxyA 2012-10-24 2014-08-11 NaN
--nQiUBxtpjd_ZBuO_PH6w 2014-09-25 2014-11-05 NaN
--pOlFxITWnhzc7SHSIP0A 2013-03-18 2015-01-08 NaN
--qeSYxyn62mMjWvznNTdg 2011-05-14 2013-11-02 NaN
--sSW-WY3vyASh_eVPGUAw 2010-09-09 2014-04-30 NaN
--zgHBiQpr8H2ZqSdGmguQ 2009-04-10 2011-12-11 NaN
-05qMWbhAtrD6EiV-UElPg 2012-03-24 2014-07-31 NaN
-0Ackw6MF82PXO9f9Jh_Kg 2009-05-24 2014-03-24 NaN
-0D_CYhlD2ILkmLR0pBmnA 2011-07-29 2014-06-06 NaN
-0GkcDiIgVm0XzDZC8RFOg 2006-04-21 2014-11-30 NaN
-0HGqwlfw3I8nkJyMHxAsQ 2014-02-10 2014-12-01 NaN
-0QBrNvhrPQCaeo7mTo0zQ 2011-06-26 2012-11-29 NaN
-0VK5Z1BfUHUYq4PoBYNLw 2013-11-08 2014-10-24 NaN
-0ZDRoepf3gwd9fpsw1bRQ 2009-05-07 2014-12-06 NaN
-0bUDim5OGuv8R0Qqq6J4A 2010-03-16 2012-12-03 NaN
-0lOuL7RkZQnjAl96dXTvA 2014-05-16 2014-12-16 NaN
-0lRv242OzyPUB8c5E_UhQ 2011-06-07 2014-03-13 NaN
-0xwm_AwS0d_5rwUprHGuw 2008-07-04 2014-11-16 NaN
-1B-DEGkLE1kDj5ENAF2NQ 2011-06-23 2015-01-04 NaN
... ... ... ...
zxjMiqlfUuIQRltSuj9FpQ 2009-05-13 2014-12-19 NaN
zxo9nZwhlW1ZM7OUJP_1ag 2011-07-25 2014-10-16 NaN
zxodsuIjtv3kx8mNa3sptQ 2008-08-23 2014-12-27 NaN
zxqvU415r_RtZRKDtdbIKQ 2006-01-11 2015-01-07 NaN
zxviVY0ypMZMkZDfE0Ffqw 2012-09-11 2015-01-05 NaN
zxzoq5E2-r4e26kXgoierg 2014-01-24 2014-12-31 NaN
zyCrtvCPdSZDhUrH86vbeA 2009-03-03 2014-07-31 NaN
zyD03W9T-raaNmoEdodFQA 2014-06-08 2014-06-19 NaN
zyFZme7L_6kfvex2sgPfCA 2010-12-23 2011-09-15 NaN
zyI8_HQPEwkJ22WMb9ILVQ 2007-07-31 2014-12-01 NaN
zyIELEt6zTdBvGgxReq86A 2013-12-01 2014-06-08 NaN
zyV7n8eXVTAtJrryRswHvA 2013-07-03 2015-01-06 NaN
zyYRWnimLbQt9aCCI_2MZw 2010-04-02 2012-10-15 NaN
zyZ883vct64bKzNv4wy6Yg 2009-10-01 2014-12-30 NaN
zyjyDSdyw0eUqxAWnM_fdA 2013-04-15 2013-09-17 NaN
zypmytGA92yk45AYn60ZHw 2011-05-22 2014-08-12 NaN
zyvujnBGGpG3WWiDPkil9w 2013-11-21 2015-01-02 NaN
zz4HioU_qObV2kuOsIMUhw 2011-03-02 2014-03-10 NaN
zz5eC4ux3D8AOrKKYCU0YA 2010-05-22 2014-12-07 NaN
zzIP8b0fpMEacKp5KzuLrA 2011-07-05 2014-11-09 NaN
zzK7j06Pg8YWkQHzgsTuQA 2011-10-08 2014-11-30 NaN
zzW_Gu-O5UpieKKp0Vd_bQ 2012-10-14 2014-09-22 NaN
zzZizlQn7Wfi-tFG9OiJ9w 2011-02-26 2011-05-22 NaN
zze1eI588TqdJLs8xOzMxw 2011-04-05 2012-01-30 NaN
zzgXEteMduykqN8JAUG4CA 2012-12-09 2014-12-30 NaN
zzhSCOVTZ7MUKOAfwT-Bsw 2011-04-03 2013-05-01 NaN
zzhykRiQh2FyrYEPMfBw0A 2011-05-29 2014-10-13 NaN
zzrm5HEoYKEsfdi8XxSXuQ 2010-10-12 2014-06-16 NaN
zzs-gtrsGyu2_o8L6vpaIQ 2010-11-04 2014-11-06 NaN
zzucnDH33KlmbkI4rLyQTA 2008-11-05 2014-12-29 NaN

52392 rows × 3 columns


In [ ]:
stars_by_geocat = review_cats[['stars']].groupby([review_cats.category, review_cats.GISJOIN]).mean()
stars_by_geocat = stars_by_geocat.unstack(level=0).dropna(how='all').dropna(axis=1, thresh=99)
stars_by_geocat.columns = stars_by_geocat.columns.get_level_values(1)
stars_by_geocat.reset_index(inplace=True)
stars_by_geocat.to_csv('stars_by_geocat.csv')

wstars_by_geocat = review_cats[['wstars']].groupby([review_cats.category, review_cats.GISJOIN]).mean()
wstars_by_geocat = wstars_by_geocat.unstack(level=0).dropna(how='all').dropna(axis=1, thresh=99)
wstars_by_geocat.columns = wstars_by_geocat.columns.get_level_values(1)
wstars_by_geocat.reset_index(inplace=True)
wstars_by_geocat.to_csv('wstars_by_geocat.csv')

counts_by_geocat = cats[['business_id']].groupby([cats.category, cats.GISJOIN]).count()
counts_by_geocat.columns = ['business_count']
counts_by_geocat = counts_by_geocat.unstack(level=0).dropna(how='all').dropna(axis=1, thresh=99)
counts_by_geocat.columns = counts_by_geocat.columns.get_level_values(1)
counts_by_geocat.reset_index(inplace=True)
counts_by_geocat.to_csv('counts_by_geocat.csv')

In [ ]:
data = pandas.read_csv('/home/russ/Documents/DDL/ACS-5/nhgis0002_csv/nhgis0002_ds201_20135_2013_tract.csv', skiprows=[1])

In [ ]:
data['pc_income'] = data['UJAE001']
data['employed_percent'] = data['UJ8E002'] / data['UJ8E001']
data['median_age'] = data['UEFE001']
data['median_gross_rent'] = data['UL9E001']
data['median_family_income'] = data['UINE001']
econ = data[['GISJOIN', 'pc_income', 'employed_percent', 'median_age', 'median_gross_rent', 'median_family_income']]

In [ ]:
wcorrs = econ.merge(wstars_by_geocat).corr().unstack().reset_index()
wcorrs.columns = ['econ', 'cat', 'r']
wcorrs = wcorrs[wcorrs.econ.apply(lambda e: e.islower())]
wcorrs = wcorrs[wcorrs.cat.apply(lambda e: not e.islower())]
wcorrs[wcorrs.r.apply(lambda v: abs(v) > .3)]

In [ ]:
scorrs = econ.merge(stars_by_geocat).corr().unstack().reset_index()
scorrs.columns = ['econ', 'cat', 'r']
scorrs = scorrs[scorrs.econ.apply(lambda e: e.islower())]
scorrs = scorrs[scorrs.cat.apply(lambda e: not e.islower())]
scorrs[scorrs.r.apply(lambda v: abs(v) > .3)]

In [ ]:
ccorrs = econ.merge(counts_by_geocat).corr().unstack().reset_index()
ccorrs.columns = ['econ', 'cat', 'r']
ccorrs = ccorrs[ccorrs.econ.apply(lambda e: e.islower())]
ccorrs = ccorrs[ccorrs.cat.apply(lambda e: not e.islower())]
ccorrs[ccorrs.r.apply(lambda v: abs(v) > .3)]

In [ ]: