In [1]:
import csv
import json
import os
import pdb
from datetime import datetime

import tqdm
from pymongo import MongoClient

In [2]:
# Host of the MongoDB server. It can be overridden through the MONGO_HOST
# environment variable so the address does not have to be edited in the
# notebook; when the variable is unset the original server address is used.
ip = os.environ.get('MONGO_HOST', '52.54.120.114')

In [3]:
# Open a client for the MongoDB server at `ip` on port 27017, then display
# the names of the databases it hosts.
# NOTE(review): `database_names()` is deprecated in newer PyMongo releases in
# favour of `list_database_names()` - confirm the installed version before
# switching.
conn = MongoClient(host=ip, port=27017)
conn.database_names()


Out[3]:
[u'local', u'cleaned_data']

In [4]:
# Handle to the database that holds the collections used in this notebook.
db = conn['cleaned_data']

In [5]:
# Display every collection in the database, system collections included.
# NOTE(review): `collection_names()` is deprecated in newer PyMongo releases in
# favour of `list_collection_names()` - confirm the installed version before
# switching.
db.collection_names(include_system_collections=True)


Out[5]:
[u'dc_reviews',
 u'restaurant_reviews',
 u'restaurants',
 u'system.indexes',
 u'users']

In [6]:
# Handles to the three collections referenced by the cells below.
biz = db['restaurants']
users = db['users']
reviews = db['restaurant_reviews']

The business ID field has already been filtered to include only restaurants.

We want to filter the users collection for the following:

1. User must have at least 20 reviews
2. For users with at least 20 reviews, identify the reviews which are for businesses
3. For each user, keep only those reviews which are related to a business in the list of restaurant business IDs
4. Keep only users who have at least 500 reviews after finishing step 3

Note: the next cell only selects users whose `review_count` field is greater than 500; it does not itself perform steps 1-3.

In [7]:
# IDs of the "big" users: every user whose `review_count` is strictly greater
# than 500. Only the `user_id` field is requested (projection), so the server
# does not have to send whole user documents just to read one field.
biguser = [
    obj['user_id']
    for obj in users.find({'review_count': {'$gt': 500}},
                          {'user_id': 1, '_id': 0})
]

Create a new dictionary with the following structure and then export it as a JSON file:

{user id: [review, review, review], ..., user id: [review, review, review]}


In [34]:
# Map each big user's ID (as a `str` key) to the list of that user's documents
# in the `reviews` collection. Only the first 20 big users are processed.
userreview = {}

for uid in tqdm.tqdm(biguser[0:20]):
    # Exclude Mongo's `_id` field in the query itself instead of deleting it
    # from every returned document - presumably it is dropped so the reviews
    # can be written out as JSON afterwards.
    userreview[str(uid)] = list(reviews.find({'user_id': uid}, {'_id': 0}))


  0%|          | 0/20 [00:00<?, ?it/s]
  5%|▌         | 1/20 [00:01<00:19,  1.02s/it]
 10%|█         | 2/20 [00:01<00:17,  1.02it/s]
 15%|█▌        | 3/20 [00:02<00:16,  1.04it/s]
 20%|██        | 4/20 [00:03<00:15,  1.06it/s]
 25%|██▌       | 5/20 [00:04<00:13,  1.08it/s]
 30%|███       | 6/20 [00:05<00:12,  1.09it/s]
 35%|███▌      | 7/20 [00:06<00:11,  1.09it/s]
 40%|████      | 8/20 [00:07<00:11,  1.09it/s]
 45%|████▌     | 9/20 [00:08<00:10,  1.09it/s]

 50%|█████     | 10/20 [00:09<00:09,  1.10it/s]
 55%|█████▌    | 11/20 [00:10<00:08,  1.11it/s]
 60%|██████    | 12/20 [00:10<00:07,  1.10it/s]
 65%|██████▌   | 13/20 [00:11<00:06,  1.11it/s]
 70%|███████   | 14/20 [00:12<00:05,  1.12it/s]
 75%|███████▌  | 15/20 [00:13<00:04,  1.12it/s]
 80%|████████  | 16/20 [00:14<00:03,  1.12it/s]
 85%|████████▌ | 17/20 [00:15<00:02,  1.12it/s]
 90%|█████████ | 18/20 [00:16<00:01,  1.12it/s]
 95%|█████████▌| 19/20 [00:17<00:00,  1.13it/s]
100%|██████████| 20/20 [00:18<00:00,  1.11it/s]

In [23]:
# Persist the user -> reviews mapping to disk as a JSON file.
with open('user_review_dictionary.json', 'w') as fh:
    json.dump(userreview, fh)

Get all the restaurant IDs within our user reviews


In [35]:
# Collect the business ID of every review held in `userreview`, keeping each
# ID once, in first-seen order. Without de-duplication a business reviewed by
# several users would appear several times and be queried repeatedly later.
biznames = []
seen_ids = set()

for user_reviews in userreview.values():
    for review in user_reviews:
        bid = review['business_id']
        if bid not in seen_ids:
            seen_ids.add(bid)
            biznames.append(bid)

For each of the businesses, find all of the reviews for that restaurant


In [36]:
# Map each business ID in `biznames` to the list of all review documents for
# that business.
restreview = {}

for bid in tqdm.tqdm(biznames):
    # A business that was already fetched would return the same reviews again,
    # so skip the redundant round trip to the database.
    if bid in restreview:
        continue
    restreview[bid] = list(reviews.find({'business_id': bid}))


100%|██████████| 996/996 [18:32<00:00,  1.10s/it]

In [ ]:
# Remove the `_id` field from every review that still carries one, then
# persist the business -> reviews mapping to disk as a JSON file.
for review_list in restreview.values():
    for review in review_list:
        review.pop('_id', None)


with open('rest_review_dictionary.json', 'w') as fh:
    json.dump(restreview, fh)