In [1]:
import csv
import json
import os
import pdb
from datetime import datetime

from pymongo import MongoClient
In [2]:
# Host of the MongoDB server that holds the dataset. The address can be
# overridden with the MONGO_HOST environment variable so the notebook can be
# pointed at a different server without editing it; the previously hard-coded
# address is kept as the default.
ip = os.environ.get('MONGO_HOST', '54.227.180.242')
In [4]:
# Open a client for the MongoDB server at `ip` on port 27017, then display the
# names of the databases that server exposes.
# NOTE(review): database_names() is deprecated in PyMongo 3.6+ in favour of
# list_database_names() and is gone in PyMongo 4 -- confirm the installed
# version before upgrading.
conn = MongoClient(host=ip, port=27017)
conn.database_names()
Out[4]:
In [13]:
# Handle to the 'cleaned_data' database on the connected server.
db = conn['cleaned_data']
In [32]:
# Handles to the three collections used by the rest of the notebook.
biz = db['academic_biz']
users = db['academic_users']
reviews = db['academic_reviews']
The business IDs have already been filtered so that only restaurants remain.
We want to filter the users collection as follows:
1. A user must have at least 20 reviews.
2. For users with at least 20 reviews, identify which of their reviews are for businesses.
3. For each user, keep only the reviews of businesses that appear in
the list of restaurant business IDs.
4. Keep only users who still have at least 20 reviews after step 3.
In [48]:
# Collect the _id of every user document whose review_count is 20 or more.
# Every document in the users collection is fetched and tested client-side.
user_list = [user['_id'] for user in users.find() if user['review_count'] >= 20]
Create a new dictionary with the following structure and then export it as a JSON object:
{user id: [review, review, review], ..., user id: [review, review, review]}
In [96]:
# Group review documents by user, producing
#   {user id: [review, review, ...], ...}
# Every id in user_list starts at the sentinel 0 ("no review seen yet"). The
# first matching review replaces the sentinel with a one-element list and later
# matches are appended. Users that never match keep the value 0.
#
# This restores the grouping logic that had been commented out while the cell
# was being debugged (it previously printed the first match and stopped, so the
# mapping was never filled in).
#
# NOTE(review): the lookup field is '_id', exactly as in the earlier version of
# this cell. MongoDB _id values are unique per document, so if academic_reviews
# stores one document per review this can never gather more than one review
# per user. Presumably the reviews carry a separate field with the author's id
# (e.g. 'user_id') -- confirm against the collection and change
# REVIEW_USER_FIELD accordingly.
REVIEW_USER_FIELD = '_id'

user_reviews = dict.fromkeys(user_list, 0)
for review in reviews.find():
    reviewer_id = review.get(REVIEW_USER_FIELD)
    if reviewer_id not in user_reviews:
        # Not one of the selected users (or the field is absent): skip.
        continue
    if user_reviews[reviewer_id] == 0:
        user_reviews[reviewer_id] = [review]
    else:
        user_reviews[reviewer_id].append(review)
In [93]:
# Spot check: display the value stored under the key at position 23 of
# user_reviews.
sample_user = list(user_reviews)[23]
user_reviews[sample_user]
Out[93]:
In [86]:
# Keep only the users whose entry in user_reviews is no longer the 0 sentinel.
filtered_reviews = {
    user: collected
    for user, collected in user_reviews.items()
    if collected != 0
}
In [87]:
# Number of users remaining after the filtering above.
n_filtered_users = len(filtered_reviews)
n_filtered_users
Out[87]:
In [37]:
# Write the user -> reviews mapping to 'merged_user_reviews.json' in the
# current working directory.
# NOTE(review): this serialises user_reviews (which still contains users whose
# value is the 0 sentinel), not filtered_reviews -- confirm which is intended.
output_path = 'merged_user_reviews.json'
with open(output_path, 'w') as out_file:
    json.dump(user_reviews, out_file)