In [1]:
from pymongo import MongoClient
from datetime import datetime
import json
import pdb
import csv

In [2]:
ip = '54.227.180.242'

In [4]:
conn = MongoClient(ip, 27017)
conn.database_names()


Out[4]:
[u'local', u'cleaned_data']

In [13]:
db = conn.get_database('cleaned_data')

In [32]:
biz = db.get_collection('academic_biz')
users = db.get_collection('academic_users')
reviews = db.get_collection('academic_reviews')

The business ID field has already been filtered for only restaurants

We want to filter the users collection for the following:

1. User must have at least 20 reviews
2. For users with 20 reviews, identify the reviews which are for businesses
3. For each user, keep only those reviews which are related to a business in 
the list of restaurant business IDs
4. Keep only users who have at least 20 reviews after finishing step 3

In [48]:
#Find a list of users with at least 20 reviews
user_list = []
for user in users.find():
    if user['review_count'] >= 20:
        user_list.append(user['_id'])
    else:
        pass

Create a new dictionary with the following structure and then export as a json object:

{user id: [review, review, review], ..., user id: [review, review, review]}


In [96]:
user_reviews = dict.fromkeys(user_list, 0)
for review in reviews.find():
    try:
        if user_reviews[review['_id']] == 0:
            print review['_id']
            print review
            break
    except KeyError:
        pass
#             user_reviews[review['_id']] = [review]
#         else:
#             user_reviews[review['_id']].append(review)
        
#     except KeyError:
#         pass


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-96-73748fcac929> in <module>()
      1 user_reviews = dict.fromkeys(user_list, 0)
----> 2 for review in reviews.find():
      3     try:
      4         if user_reviews[review['_id']] == 0:
      5             print review['_id']

/Users/robertsonwang/anaconda2/lib/python2.7/site-packages/pymongo/cursor.pyc in next(self)
   1088             raise StopIteration
   1089         _db = self.__collection.database
-> 1090         if len(self.__data) or self._refresh():
   1091             if self.__manipulate:
   1092                 return _db._fix_outgoing(self.__data.popleft(),

/Users/robertsonwang/anaconda2/lib/python2.7/site-packages/pymongo/cursor.pyc in _refresh(self)
   1030                                              self.__id,
   1031                                              self.__codec_options,
-> 1032                                              self.__max_await_time_ms))
   1033 
   1034         else:  # Cursor id is zero nothing else to return

/Users/robertsonwang/anaconda2/lib/python2.7/site-packages/pymongo/cursor.pyc in __send_message(self, operation)
    901             doc = helpers._unpack_response(response=data,
    902                                            cursor_id=self.__id,
--> 903                                            codec_options=self.__codec_options)
    904             if from_command:
    905                 helpers._check_command_response(doc['data'][0])

/Users/robertsonwang/anaconda2/lib/python2.7/site-packages/pymongo/helpers.pyc in _unpack_response(response, cursor_id, codec_options)
    140               "starting_from": struct.unpack("<i", response[12:16])[0],
    141               "number_returned": struct.unpack("<i", response[16:20])[0],
--> 142               "data": bson.decode_all(response[20:], codec_options)}
    143 
    144     assert len(result["data"]) == result["number_returned"]

/Users/robertsonwang/anaconda2/lib/python2.7/site-packages/bson/objectid.pyc in __init__(self, oid)
     66     _type_marker = 7
     67 
---> 68     def __init__(self, oid=None):
     69         """Initialize a new ObjectId.
     70 

KeyboardInterrupt: 

In [93]:
user_reviews[user_reviews.keys()[23]]


Out[93]:
0

In [86]:
filtered_reviews = {}
for user in user_reviews.keys():
    if user_reviews[user] != 0:
        filtered_reviews[user] = user_reviews[user]

In [87]:
#We have this many users after our filtering
len(filtered_reviews)


Out[87]:
0

In [37]:
#Dump file of cleaned up user data
with open('merged_user_reviews.json', 'w') as fp:
    json.dump(user_reviews, fp)


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-37-d3878a915332> in <module>()
      1 user_reviews = dict()
----> 2 user_reviews['a']

KeyError: 'a'