In [5]:
import pandas as pd
import json
import numpy as np
import ast
from pandas.io.json import json_normalize #package for flattening json in pandas df
from tqdm import tqdm

In [6]:
import mmap

def get_num_lines(file_path):
    fp = open(file_path, "r+")
    buf = mmap.mmap(fp.fileno(), 0)
    lines = 0
    while buf.readline():
        lines += 1
    return lines

sample the data so that it looks like this:

TEXT | product categories | rating given


In [7]:
root = "/media/felipe/SAMSUNG/AmazonReviews/"

In [8]:
reviews_df = pd.read_json(root+"/sample_reviews_Books_5.json",lines=True)

In [9]:
reviews_df.head(1)


Out[9]:
asin helpful overall reviewText reviewTime reviewerID reviewerName summary unixReviewTime
0 0441019420 [0, 0] 4 i'm a big Patricia Briggs fan. i love her stor... 11 7, 2010 A18O0N0QI055FU Summer D. Olson "fredomoftruth" great book 1289088000

In [10]:
reviews_df = reviews_df[['asin','overall','reviewText']]
reviews_df.index = reviews_df['asin']
reviews_df.drop(['asin'],axis=1,inplace=True)

In [11]:
reviews_df['categories'] = np.nan

In [12]:
reviews_df.head()


Out[12]:
overall reviewText categories
asin
0441019420 4 i'm a big Patricia Briggs fan. i love her stor... NaN
0316051632 5 The Disappearing Spoon by Sam KeanLittle, Brow... NaN
038573901X 2 I enjoy a great romance novel now and again, a... NaN
006144295X 1 I've read all of Kimberla Lawson-Roby novels. ... NaN
0316097543 3 This book almost made it. The jump from starv... NaN

In [ ]:
with open(root+"/metadata.json") as f:
    for line in tqdm(f, total=get_num_lines(root+"/metadata.json")):
        json_data = ast.literal_eval(line)
        other_df = json_normalize(json_data)
        other_df['asin'] = other_df['asin'].astype('object')
        other_df.index = other_df['asin']
        other_df.drop(['asin'],axis=1,inplace=True)
               
        if not 'categories' in other_df.columns.values:
            other_df['categories'] = ''
           
        reviews_df.update(other_df)


  0%|          | 7736/9430088 [00:39<13:21:05, 196.03it/s]

In [ ]:
reviews_df

In [ ]:
sample_metadata_df = pd.read_json(root+"/sample_metadata.json",lines=True)

In [ ]: