In [1]:
import pandas as pd

In [ ]:
table_names = ['business', 'review', 'user', 'checkin', 'tip']
original_csv_filepath = '/home/amlanlimaye/yelp-dataset-challenge/data/interim/original_csv/'
    
for tbl_name in table_names:
    globals()[tbl_name] = pd.read_csv(original_csv_filepath + "{}".format(tbl_name))

In [5]:
US_business = business.loc[business['state'].isin(['AZ', 'NV', 'NC', 'PA', 'WI', 'IL', 'SC', 'OH'])]


Out[5]:
address attributes business_id categories city hours is_open latitude longitude name neighborhood postal_code review_count stars state type
0 227 E Baseline Rd, Ste J2 [BikeParking: True, BusinessAcceptsBitcoin: Fa... 0DI8Dt2PJp07XkVvIElIcQ [Tobacco Shops, Nightlife, Vape Shops, Shopping] Tempe [Monday 11:0-21:0, Tuesday 11:0-21:0, Wednesda... 0 33.378214 -111.936102 Innovative Vapors NaN 85283 17 4.5 AZ business
1 495 S Grand Central Pkwy [BusinessAcceptsBitcoin: False, BusinessAccept... LTlCaCGZE14GuaUXUGbamg [Caterers, Grocery, Food, Event Planning & Ser... Las Vegas [Monday 0:0-0:0, Tuesday 0:0-0:0, Wednesday 0:... 1 36.192284 -115.159272 Cut and Taste NaN 89106 9 5.0 NV business
3 7014 Steubenville Pike [AcceptsInsurance: False, BusinessAcceptsCredi... cnGIivYRLxpF7tBVR_JwWA [Hair Removal, Beauty & Spas, Blow Dry/Out Ser... Oakdale [Tuesday 10:0-21:0, Wednesday 10:0-21:0, Thurs... 1 40.444544 -80.174540 Plush Salon and Spa NaN 15071 4 4.0 PA business
6 10875 N Frankloyd Wright Blvd [BikeParking: True, BusinessAcceptsCreditCards... Cu4_Fheh7IrzGiK-Pc79ig [Baby Gear & Furniture, Shopping] Scottsdale [Monday 10:0-18:0, Tuesday 10:0-18:0, Wednesda... 1 33.585271 -111.834954 Boomerang Baby NaN 85259 8 3.5 AZ business
7 11072 No Frank Lloyd Wright [Alcohol: none, Ambience: {'romantic': False, ... GDnbt3isfhd57T1QqU6flg [Tex-Mex, Mexican, Fast Food, Restaurants] Scottsdale [Monday 10:0-22:0, Tuesday 10:0-22:0, Wednesda... 1 33.586710 -111.835410 Taco Bell NaN 85259 9 2.5 AZ business

In [6]:
US_review = review.loc[review['business_id'].isin(US_business.business_id.unique())]


Out[6]:
business_id cool date funny review_id stars text type useful user_id
0 2aFiy99vNLklCx3T_tGS9A 0 2011-10-10 0 NxL8SIC5yqOdnlXCg18IBg 5.0 If you enjoy service by someone who is as comp... review 0.0 KpkOkG6RIf4Ra25Lhhxf1A
1 2aFiy99vNLklCx3T_tGS9A 0 2010-12-29 0 pXbbIgOXvLuTi_SPs1hQEQ 5.0 After being on the phone with Verizon Wireless... review 1.0 bQ7fQq1otn9hKX-gXRsrgA
2 2aFiy99vNLklCx3T_tGS9A 0 2011-04-29 0 wslW2Lu4NYylb1jEapAGsw 5.0 Great service! Corey is very service oriented.... review 0.0 r1NUhdNmL6yU9Bn-Yx6FTw
3 2LfIuF3_sX6uwe-IR-P0jQ 1 2014-07-14 0 GP6YEearUWrzPtQYSF1vVg 5.0 Highly recommended. Went in yesterday looking ... review 0.0 aW3ix1KNZAvoM8q-WghA3Q
4 2LfIuF3_sX6uwe-IR-P0jQ 0 2014-01-15 0 25RlYGq2s5qShi-pn3ufVA 4.0 I walked in here looking for a specific piece ... review 0.0 YOo-Cip8HqvKp_p9nEGphw

In [7]:
US_tip = tip.loc[tip['business_id'].isin(US_business.business_id.unique())]


Out[7]:
business_id date likes text type user_id
2 dAa0hB2yrnHzVmsCkN4YvQ 2014-06-20 0 Nice place. Great staff. A fixture in the tow... tip oaYhjqBbh18ZhU0bpyzSuw
3 dAa0hB2yrnHzVmsCkN4YvQ 2016-10-12 0 Happy hour 5-7 Monday - Friday tip ulQ8Nyj7jCUR8M83SUMoRQ
4 SqW3igh1_Png336VIb5DUA 2016-07-03 0 Come early on Sunday's to avoid the rush tip ulQ8Nyj7jCUR8M83SUMoRQ
5 KNpcPGqDORDdvtekXd348w 2016-01-07 0 Love their soup! tip ulQ8Nyj7jCUR8M83SUMoRQ
6 KNpcPGqDORDdvtekXd348w 2016-05-22 0 Soups are fantastic! tip ulQ8Nyj7jCUR8M83SUMoRQ

In [8]:
US_checkin = checkin.loc[checkin['business_id'].isin(US_business.business_id.unique())]


Out[8]:
business_id time type
0 7KPBkxAOEtb3QeIL9PEErg [Fri-0:2, Sat-0:1, Sun-0:1, Wed-0:2, Sat-1:2, ... checkin
1 kREVIrSBbtqBhIYkTccQUg [Mon-13:1, Thu-13:1, Sat-16:1, Wed-17:1, Sun-1... checkin
3 nhZ1HGWD8lMErdn3FuWuTQ [Fri-0:1, Sat-0:1, Sun-0:1, Thu-0:1, Wed-0:1, ... checkin
4 8bY6M2yiWOF2ilfmGS34Fw [Sat-11:1, Fri-13:1, Thu-14:1] checkin
5 zNVot5_XHsxwfKdiFjk_aA [Mon-0:1, Fri-1:1, Sun-1:1, Tue-1:1, Wed-11:1,... checkin

In [9]:
US_users_list = US_tip.user_id.append(US_review.user_id).unique()

In [10]:
US_user = user.loc[user['user_id'].isin(US_users_list)]


Out[10]:
average_stars compliment_cool compliment_cute compliment_funny compliment_hot compliment_list compliment_more compliment_note compliment_photos compliment_plain ... elite fans friends funny name review_count type useful user_id yelping_since
0 3.59 4192 79 4192 3904 19 305 4705 1347 2617 ... [2017, 2015, 2016, 2014, 2011, 2013, 2012] 298 [iJg9ekPzF9lkMuvjKYX6uA, ctWAuzS04Xu0lke2Rop4l... 12316 Rob 761 user 18456 EZmocAborM6z66rTzeZxzQ 2009-09-12
1 4.29 144 11 144 64 1 4 97 24 129 ... [None] 34 [r2UUCzGxqI6WPsiWPgqG2A, qewG3X2O4X6JKskxyyqFw... 28 Vivian 80 user 117 myql3o3x22_ygECb8gVo7A 2009-06-27
2 4.15 36 1 36 14 2 10 21 4 23 ... [2011, 2016, 2017, 2013, 2014, 2015, 2012] 48 [qewG3X2O4X6JKskxyyqFwQ, -50XWnmQGqBgEI-9ANvLl... 6 Carol 841 user 58 FIk4lQQu1eTe2EpzQ4xhBA 2010-08-26
3 3.82 54 6 54 32 0 5 13 1 29 ... [2014, 2010, 2017, 2015, 2011, 2016, 2013, 201... 28 [AIhfuFmX62k7a22gXXAB2Q, kQ1jU2rDpFD_q54edEmwI... 51 Miss Maggie 376 user 42 ojovtd9c8GIeDiB8e0mq2w 2008-05-31
5 3.76 19 0 19 7 2 1 9 5 14 ... [2011, 2016, 2014, 2017, 2012, 2013, 2015] 9 [YZ4dz1QD-muThmGMYoTnjA, qewG3X2O4X6JKskxyyqFw... 0 Ramsey 194 user 5 TprC8sujz8MkwuomrqUSiw 2009-01-06

5 rows × 23 columns


In [11]:
# Writing US cities' csv files

US_business.to_csv('../../data/interim/US_cities_only/business_US.csv', encoding='utf-8', index=False)
US_review.to_csv('../../data/interim/US_cities_only/review_US.csv', encoding='utf-8', index=False)
US_tip.to_csv('../../data/interim/US_cities_only/tip_US.csv', encoding='utf-8', index=False)
US_checkin.to_csv('../../data/interim/US_cities_only/checkin_US.csv', encoding='utf-8', index=False)
US_user.to_csv('../../data/interim/US_cities_only/user_US.csv', encoding='utf-8', index=False)