In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
Python package imports
In [2]:
import numpy as np
import os
import glob
import pandas as pd
import yelp_utils
from yelp_utils import *
import yelp_utils
# Report the pandas version in use. The parenthesised single-argument form
# behaves the same as a Python 2 print statement and also runs on Python 3.
print("Pandas version: " + pd.__version__)
In [3]:
# Let wide frames show up to 200 columns when they are displayed.
pd.set_option('display.max_columns', 200)
In [4]:
# Prepare the CSV output directory before anything is written into it
# (presumably created when missing -- check yelp_utils.make_sure_path_exists to confirm).
yelp_utils.make_sure_path_exists(yelp_utils.YELP_DATA_CSV_DIR)
Convert the JSON files to flattened CSV files for clarity and ease of access.
In [5]:
# http://stackoverflow.com/questions/3207219/how-to-list-all-files-of-a-directory-in-python
for json_file in glob.glob(os.path.join(yelp_utils.YELP_DATA_RAW_DIR, "*.json")):
csv_file = '{0}.csv'.format(os.path.join(yelp_utils.YELP_DATA_CSV_DIR,os.path.basename(json_file).split('.json')[0]))
if not os.path.isfile(csv_file):
%run json_to_csv_converter.py $json_file $YELP_DATA_CSV_DIR
Read the CSV files into pandas DataFrames.
In [6]:
# Load the flattened business table, print its summary and preview the first rows.
business_csv_file = os.path.join(
    yelp_utils.YELP_DATA_CSV_DIR,
    'yelp_academic_dataset_business.csv',
)
df_business = pd.read_csv(
    business_csv_file,
    encoding='utf-8',
    engine='c',
    low_memory=False,
)
yelp_utils.getDfInfo(df_business)
df_business.head()
Out[6]:
In [7]:
# Load the flattened review table, print its summary and preview the first rows.
review_csv_file = os.path.join(yelp_utils.YELP_DATA_CSV_DIR, 'yelp_academic_dataset_review.csv')
# low_memory=False is deliberately left out for this load: the review file is too large.
df_review = pd.read_csv(review_csv_file, encoding='utf-8', engine='c')
# Module-qualified call, so the cell does not rely on `from yelp_utils import *`.
yelp_utils.getDfInfo(df_review)
df_review.head()
Out[7]:
In [8]:
# Load the flattened user table, print its summary and preview the first rows.
user_csv_file = os.path.join(yelp_utils.YELP_DATA_CSV_DIR, 'yelp_academic_dataset_user.csv')
df_user = pd.read_csv(user_csv_file, encoding='utf-8', engine='c', low_memory=False)
# Module-qualified call, so the cell does not rely on `from yelp_utils import *`.
yelp_utils.getDfInfo(df_user)
df_user.head()
Out[8]:
In [9]:
def renameRemoveNACols(df, string):
    '''
    Drop every column that contains at least one NA value and put `string`
    in front of the name of each remaining column.

    The input frame is not modified; a new frame is returned.

    Input:
        df: Pandas data frame
        string: The prefix to prepend to every remaining column name
    Output: Pandas data frame without the NA-containing columns, whose
        column labels are string + original label
    '''
    # add_prefix formats each label into a string, so non-string labels
    # (e.g. integer column names) work as well as plain string labels.
    return df.dropna(axis='columns', how='any').add_prefix(string)
def removeNACols(df):
    '''
    Return a new data frame without the columns that hold any NA value.
    Input:
        df: Pandas data frame
    Output: Pandas data frame restricted to the fully populated columns
    '''
    without_na_columns = df.dropna(axis=1, how='any')
    return without_na_columns
In [10]:
# Drop NA columns and prefix each table's columns with its table name, then
# give the id columns their plain names back (business_id and user_id are the
# keys used for merging).
df_business = renameRemoveNACols(df_business, 'business_').rename(
    columns={'business_business_id': 'business_id'})
df_review = renameRemoveNACols(df_review, 'review_').rename(
    columns={'review_review_id': 'review_id',
             'review_user_id': 'user_id',
             'review_business_id': 'business_id'})
df_user = renameRemoveNACols(df_user, 'user_').rename(
    columns={'user_user_id': 'user_id'})
In [11]:
# Attach each review to its business; the inner join keeps only rows whose
# business_id is present in both frames.
df_merged_business_review = pd.merge(df_business, df_review, how='inner', on='business_id')
# Module-qualified call, so the cell does not rely on `from yelp_utils import *`.
yelp_utils.getDfInfo(df_merged_business_review)
In [12]:
# Attach the reviewing user's columns; the inner join keeps only rows whose
# user_id is present in both frames.
df_merged_business_review_user = pd.merge(df_merged_business_review, df_user, how='inner', on='user_id')
# Module-qualified call, so the cell does not rely on `from yelp_utils import *`.
yelp_utils.getDfInfo(df_merged_business_review_user)
df_merged_business_review_user.head()
Out[12]:
In [13]:
# Sanity check: True when no review_id occurs more than once in the merged
# frame. is_unique avoids building a Python set of every id.
df_merged_business_review_user['review_id'].is_unique
Out[13]:
Consider only reviews of businesses located in the US.
In [14]:
# Keep only rows whose business coordinates fall strictly inside a bounding box
# for the US, with limits taken from
# https://answers.yahoo.com/question/index?qid=20070729220301AA6Ct4s
#   +48.987386  northernmost latitude
#   +18.005611  southernmost latitude
#   -124.626080 westernmost longitude
#   -62.361014  easternmost longitude
# NOTE(review): this is a plain rectangle, so any non-US location that lies
# inside it also passes the filter -- confirm that is acceptable.
condition_latitude = ((df_merged_business_review_user.business_latitude < 48.987386)
                      & (df_merged_business_review_user.business_latitude > 18.005611))
condition_longitude = ((df_merged_business_review_user.business_longitude > -124.626080)
                       & (df_merged_business_review_user.business_longitude < -62.361014))
df_merged_business_review_user_only_us = df_merged_business_review_user.loc[condition_latitude & condition_longitude]
In [15]:
# Row/column counts before and after the US-only filter. The parenthesised
# single-argument form runs on both Python 2 and Python 3.
print(df_merged_business_review_user.shape)
print(df_merged_business_review_user_only_us.shape)
In [16]:
# Write business-review-user file on the first run; when the file is already
# on disk, load it and use its contents in place of the in-memory frame.
write_filename = os.path.join(yelp_utils.YELP_DATA_CSV_DIR, 'business_review_user.csv')
if not os.path.isfile(write_filename):
    # DataFrame.to_csv has no documented `engine` parameter (it is a read_csv
    # option), so it is not passed here.
    df_merged_business_review_user_only_us.to_csv(write_filename, encoding='utf-8', index=False)
else:
    df_merged_business_review_user_only_us = pd.read_csv(write_filename, encoding='utf-8', engine='c')
    print("read file")
In [17]:
# Draw a seeded random subset without replacement (fraction, seed and file-name
# suffix come from yelp_utils) and save it unless the sample file already exists.
sample_file = os.path.join(yelp_utils.YELP_DATA_CSV_DIR, 'business_review_user' + yelp_utils.data_subset + '.csv')
df_sample = df_merged_business_review_user_only_us.sample(frac=yelp_utils.data_frac, replace=False, random_state=yelp_utils.SEED_VAL)
if not os.path.isfile(sample_file):
    # DataFrame.to_csv has no documented `engine` parameter (it is a read_csv
    # option), so it is not passed here.
    df_sample.to_csv(sample_file, encoding='utf-8', index=False)
df_sample.head()
Out[17]:
In [18]:
# IPython magic: tabulate the variables currently defined in the interactive namespace.
%whos
In [ ]: