0.

  • Directory
  • Business_id for other cuisine types

1. Load 'review'


In [1]:
import pandas as pd

# The review dump holds one JSON object per line, so let pandas parse it as
# line-delimited JSON directly. Building a single "[...]" string by hand kept
# several copies of the file in memory and breaks on Python 3, where lines
# read in 'rb' mode are bytes and cannot be joined with a str separator.
rw = pd.read_json('./yelp_academic_dataset_review.json', lines=True)

2. Load 'cuisine' (e.g. 'Chinese Cuisine')


In [74]:
# Read the business-id list for the cuisine of interest (Japanese, going by
# the file name) and preview the first three rows.
all_jpn = pd.read_csv(filepath_or_buffer="./ALL_Japanese_Business_ID.csv")
all_jpn.head(n=3)


Out[74]:
business_id
Eq3qA7F5uZBUbcYXROzntA Eq3qA7F5uZBUbcYXROzntA
Ld2hhA3q3cdkptwS1fsYEg Ld2hhA3q3cdkptwS1fsYEg
tGBeFfwXCUZOsb0YWiMWIA tGBeFfwXCUZOsb0YWiMWIA

3. Merge 1&2


In [75]:
# Attach the reviews to the listed businesses: the index of `all_jpn` is
# matched against rw['business_id'], and how='left' keeps every row of
# `all_jpn` even when no review matches it.
all_rw_jpn = all_jpn.merge(rw, how='left', left_index=True, right_on='business_id')

# Row counts before and after the merge, as a quick sanity check. The
# parenthesised single-argument form prints identically on Python 2 and 3.
print(len(rw))
print(len(all_jpn))
print(len(all_rw_jpn))


4153150
1625
185275

4. Merge 'reviews' for each 'business_id'


In [76]:
# Trimmed working frame: drop the suffixed duplicates of the key column and
# keep the un-suffixed 'business_id'. drop() returns a new frame, so
# `all_rw_jpn` itself is left untouched.
test = all_rw_jpn.drop(['business_id_x', 'business_id_y'], axis=1)
# Index the rows by their business id (unnamed index, same as before).
test.index = test['business_id'].values

# One string per business: all of its review texts joined with '. '.
# Missing texts (NaN) are skipped first -- the upstream merge is a left join,
# so a business without any review would otherwise make str.join raise
# a TypeError.
output = test.groupby('business_id')['text'].apply(
    lambda texts: '. '.join(texts.dropna())
)

# Flatten to a two-column frame (text, business_id) with a fresh 0..n-1 index.
output2 = pd.DataFrame(
    {'text': output.values, 'business_id': output.index.values},
    columns=['text', 'business_id'],
)

# Number of businesses that ended up with a concatenated review string.
print(len(output))
output2.head(10)


1625
Out[76]:
text business_id
0 Came here for lunch \nOrdered combination bent... -4bPFENRdTqjML8aKEL6ow
1 Place was okay, came here because we couldn't ... -6mzdR0YjOToJ8E04Y9O0Q
2 Prices have gone up, quality has gone down. Su... -AVRReI-nfsa0lKlehEojw
3 After unsuccessfully trying the other location... -BbnAc9YEO6pjvJGEtFbVQ
4 UPDATE: Went back!!! Kuddos to the owners! Had... -DnaKAs2oK3rXfrjSvn9ew
5 New at red Rock casino. A bit overpriced. Slow... -FcZY7a7qgxTUlTvwuyJnQ
6 Maybe I am a sushi snob because we have amazin... -FyvAo_bNe6eXWpEHpSUrQ
7 You should be able to give 0 stars when you ba... -GOsHrWPC0meDRgkCEgC8w
8 First thing I noticed when I walked in was the... -ITj6Pu8Gdw8MmLf0XBEKQ
9 Hands down best sushi I've eaten in Toronto. E... -Ipm_8YXj9UoNNHagCvsNg

5. Output


In [77]:
# Save the per-business review text as UTF-8 CSV. index_label=False omits the
# header field for the index; the index values themselves are still written.
output2.to_csv(
    path_or_buf="./all_jpn_rw.csv",
    encoding='utf-8',
    index_label=False,
)