0.

  • Directory
  • Business_id for other cuisine types

1. Load 'review'


In [1]:
import pandas as pd

# Load the Yelp review dump into the DataFrame `rw`.
#
# The previous code treated the file as newline-delimited JSON (one object per
# line): it read every line, joined them with commas and wrapped the result in
# brackets before parsing. `lines=True` asks pandas to parse that same layout
# directly, which avoids holding the raw lines plus a second, concatenated copy
# of the whole file in memory. It also works on Python 3, where the old
# approach failed because `map` is lazy and a file opened in 'rb' mode yields
# bytes that cannot be joined with a str separator.
rw = pd.read_json('./yelp_academic_dataset_review.json', lines=True)

2. Load 'cuisine' (e.g. 'Chinese Cuisine')


In [72]:
# Business IDs belonging to the cuisine of interest (here: Chinese).
# NOTE(review): the preview below shows each ID both as the row index and in
# the `business_id` column -- presumably the CSV was written without a header
# field for its index column; confirm against the file itself.
all_chinese = pd.read_csv(
    "./yelp-challenge/LDA/ALL_Chinese_Business_ID.csv"
)
all_chinese.head(n=3)


Out[72]:
business_id
q_KQbgnaYDlPx8EHTydcBQ q_KQbgnaYDlPx8EHTydcBQ
2px99IppAcnxR238eq_8_w 2px99IppAcnxR238eq_8_w
TkXbFJtFCdM_WTLkHa6Erw TkXbFJtFCdM_WTLkHa6Erw

3. Merge 1 & 2 (attach the reviews to the cuisine's business IDs)


In [ ]:
# Attach the reviews in `rw` to the businesses listed in `all_chinese`.
# The join key is the *index* of `all_chinese` matched against the
# `business_id` column of `rw`; how='left' keeps every row of `all_chinese`,
# so a business without any review still appears (with missing review fields).
# NOTE(review): both frames also carry a `business_id` column, so the result
# is expected to contain `business_id_x` / `business_id_y` next to the
# `business_id` key column -- the following cell relies on exactly those names.
all_rw_chinese = all_chinese.merge(rw, how='left', left_index=True, right_on='business_id')

# Sanity check: row counts before and after the merge.
# print() with a single argument behaves identically on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(len(rw))
print(len(all_chinese))
print(len(all_rw_chinese))

4. Concatenate all 'reviews' for each 'business_id'


In [24]:
# Collapse the merged frame to one row per business, with all of that
# business's review texts concatenated into a single string.

# Drop the two suffixed duplicates of the key; only `business_id` is needed.
# `drop` returns a new frame, so `all_rw_chinese` itself is left untouched.
test = all_rw_chinese.drop(['business_id_x', 'business_id_y'], axis=1)
# Index by business id as before, so `test` keeps the same layout for any
# other cell that may read it. Not required by the grouping below.
test = test.set_index(keys=test['business_id'].values)

# Join each business's reviews with '. '. The upstream merge is a left join,
# so a business without reviews has a missing (NaN) `text`; NaN is a float and
# would make str.join raise TypeError, so missing texts are dropped first.
# A business with no reviews therefore yields an empty string.
output = test.groupby('business_id')['text'].apply(
    lambda texts: '. '.join(texts.dropna())
)

# Two-column result with a fresh 0..n-1 index: the concatenated text first,
# then the business id it belongs to.
output2 = pd.DataFrame(
    {'text': output.values, 'business_id': output.index.values},
    columns=['text', 'business_id'],
)

# Number of distinct businesses. Single-argument print() works on Python 2 and 3.
print(len(output))
output2.head(10)

5. Output


In [25]:
# Persist the per-business review text as UTF-8 CSV.
# Per the pandas docs, index_label=False omits the header field for the index
# column (the row index itself is still written). Files written this way are
# read back by pd.read_csv with the first column as the index.
output2.to_csv(
    "./all_chinese_rw.csv",
    encoding='utf-8',
    index_label=False,
)