In [1]:
import pandas as pd
# Load the Yelp review dump into the DataFrame `rw`.
#
# The file is newline-delimited JSON (one review object per line).  Instead of
# reading every line into memory, stripping it and gluing the lines into one
# huge "[...]" string by hand, let pandas parse the line-delimited format
# directly.  This avoids holding several copies of the file in memory, works
# on both Python 2 and Python 3 (no bytes/str join), and tolerates blank
# lines in the file.
rw = pd.read_json('./yelp_academic_dataset_review.json', lines=True)
In [72]:
# Read the list of Chinese-restaurant businesses from disk.
# NOTE(review): the merge cell below joins on this frame's *index*
# (left_index=True), so the CSV is presumably laid out such that read_csv
# yields business ids as the index -- confirm against the file.
all_chinese = pd.read_csv(
    filepath_or_buffer="./yelp-challenge/LDA/ALL_Chinese_Business_ID.csv",
)
# Preview the first three rows.
all_chinese.head(n=3)
Out[72]:
In [ ]:
# Attach reviews to the selected businesses.
# The join key is the *index* of `all_chinese` on the left and the
# 'business_id' column of `rw` on the right.  how='left' keeps every row of
# `all_chinese`, so a business without any matching review stays in the
# result with NaN in the review columns.
all_rw_chinese = all_chinese.merge(
    rw,
    how='left',
    left_index=True,
    right_on='business_id',
)

# Row counts of both inputs and of the merged frame, as a quick sanity check.
# print(...) with a single argument prints the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(len(rw))
print(len(all_chinese))
print(len(all_rw_chinese))
In [24]:
# Collapse the merged review frame to one row per business, with all of that
# business's review texts concatenated into a single string.

# Work on a new frame so `all_rw_chinese` is left untouched; drop() returns a
# copy.  The two suffixed columns are discarded and only the plain
# 'business_id' column is used as the key from here on.
test = all_rw_chinese.drop(['business_id_x', 'business_id_y'], axis=1)

# Join the review texts of each business with '. '.
# Because the upstream merge is a left join, a business without reviews has
# NaN in 'text'; str.join would raise TypeError on that float, so missing
# texts are skipped and such a business ends up with an empty string.
output = test.groupby(test['business_id']).apply(
    lambda reviews: '. '.join(reviews.text.dropna())
)

# Build the final two-column frame (text first, then business_id) with a
# fresh 0..n-1 index, directly from the grouped result.
output2 = pd.DataFrame(
    {'text': output.values, 'business_id': output.index.values},
    columns=['text', 'business_id'],
)

# Number of distinct businesses; single-argument print(...) behaves the same
# on Python 2 and Python 3.
print(len(output))
output2.head(10)
In [25]:
# Persist the per-business concatenated review text as UTF-8 CSV.
# index_label=False writes the index column without a header field, so the
# header row has one field fewer than the data rows.
output2.to_csv(
    path_or_buf="./all_chinese_rw.csv",
    encoding='utf-8',
    index_label=False,
)