In [5]:
import ast
import json
import os

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize #package for flattening json in pandas df
from tqdm import tqdm
In [6]:
import mmap
def get_num_lines(file_path):
    """Return the number of lines in the file at ``file_path``.

    The file is memory-mapped read-only and scanned line by line. A final
    line that lacks a trailing newline is still counted, and an empty file
    yields 0.

    Parameters
    ----------
    file_path : str or path-like
        Location of the file to inspect.

    Returns
    -------
    int
        Number of lines found in the file.

    Raises
    ------
    OSError
        If the file cannot be opened for reading.
    """
    # Binary read-only mode: counting lines needs no write permission, and
    # the context managers guarantee both the handle and the mapping close.
    with open(file_path, "rb") as fp:
        # mmap refuses to map a zero-length file, so detect that case first.
        # (The mapping below always starts at offset 0, so consuming a byte
        # here does not affect the count.)
        if not fp.read(1):
            return 0
        with mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ) as buf:
            # readline() returns b"" once the end of the mapping is reached.
            return sum(1 for _ in iter(buf.readline, b""))
In [7]:
# Directory holding the Amazon review dumps. The original location remains the
# default; set the AMAZON_REVIEWS_ROOT environment variable to run the notebook
# against a copy of the data stored elsewhere.
root = os.environ.get("AMAZON_REVIEWS_ROOT", "/media/felipe/SAMSUNG/AmazonReviews/")
In [8]:
# Load the sampled book reviews; the file is read as one JSON record per line.
sample_reviews_path = root + "/sample_reviews_Books_5.json"
reviews_df = pd.read_json(sample_reviews_path, lines=True)
In [9]:
# Peek at a single record to see which columns the raw reviews carry.
reviews_df.head(n=1)
Out[9]:
In [10]:
# Keep only the columns used downstream and index the frame by `asin`
# (set_index moves the column into the index, so it no longer appears as data).
review_columns = ['asin', 'overall', 'reviewText']
reviews_df = reviews_df[review_columns].set_index('asin')
In [11]:
# Add a `categories` column filled with NaN as a placeholder; the metadata
# pass further down writes the real values into it.
reviews_df = reviews_df.assign(categories=np.nan)
In [12]:
# Check the reshaped frame: first five rows, including the new column.
reviews_df.head(n=5)
Out[12]:
In [ ]:
# Copy values from the metadata dump into reviews_df, matching rows on `asin`.
# DataFrame.update only touches columns that exist in both frames.
metadata_path = root + "/metadata.json"

# Labels present in the reviews index. A metadata record whose asin is not in
# this set cannot change reviews_df (update() aligns on the index), so it is
# skipped before the comparatively expensive per-record DataFrame work.
review_asins = set(reviews_df.index)

with open(metadata_path) as f:
    for line in tqdm(f, total=get_num_lines(metadata_path)):
        if not line.strip():
            continue  # a blank line would make ast.literal_eval raise
        # Each record is parsed as a Python literal rather than with a JSON
        # parser -- presumably the dump is not strict JSON; verify.
        # NOTE(review): the early skip below assumes every line is a single
        # dict carrying an 'asin' key -- confirm against the data.
        json_data = ast.literal_eval(line)
        if json_data['asin'] not in review_asins:
            continue
        other_df = json_normalize(json_data)
        other_df['asin'] = other_df['asin'].astype('object')
        other_df = other_df.set_index('asin')
        if 'categories' not in other_df.columns:
            # Records without categories overwrite the NaN placeholder with ''.
            other_df['categories'] = ''
        reviews_df.update(other_df)
In [ ]:
# Display the reviews frame after the metadata pass to inspect the result.
reviews_df
In [ ]:
# Load the sampled metadata file; it is read as one JSON record per line.
sample_metadata_path = root + "/sample_metadata.json"
sample_metadata_df = pd.read_json(sample_metadata_path, lines=True)
In [ ]: