In [5]:

    
import pandas as pd
import json
import numpy as np
import ast
from pandas.io.json import json_normalize #package for flattening json in pandas df
from tqdm import tqdm



In [6]:

    
import mmap

def get_num_lines(file_path):
    fp = open(file_path, "r+")
    buf = mmap.mmap(fp.fileno(), 0)
    lines = 0
    while buf.readline():
        lines += 1
    return lines

sample the data so that it looks like this:

TEXT | product categories | rating given



In [7]:

    
root = "/media/felipe/SAMSUNG/AmazonReviews/"



In [8]:

    
reviews_df = pd.read_json(root+"/sample_reviews_Books_5.json",lines=True)



In [9]:

    
reviews_df.head(1)









    Out[9]:







  
    
      
      asin
      helpful
      overall
      reviewText
      reviewTime
      reviewerID
      reviewerName
      summary
      unixReviewTime
    
  
  
    
      0
      0441019420
      [0, 0]
      4
      i'm a big Patricia Briggs fan. i love her stor...
      11 7, 2010
      A18O0N0QI055FU
      Summer D. Olson "fredomoftruth"
      great book
      1289088000



In [10]:

    
reviews_df = reviews_df[['asin','overall','reviewText']]
reviews_df.index = reviews_df['asin']
reviews_df.drop(['asin'],axis=1,inplace=True)



In [11]:

    
reviews_df['categories'] = np.nan



In [12]:

    
reviews_df.head()









    Out[12]:







  
    
      
      overall
      reviewText
      categories
    
    
      asin
      
      
      
    
  
  
    
      0441019420
      4
      i'm a big Patricia Briggs fan. i love her stor...
      NaN
    
    
      0316051632
      5
      The Disappearing Spoon by Sam KeanLittle, Brow...
      NaN
    
    
      038573901X
      2
      I enjoy a great romance novel now and again, a...
      NaN
    
    
      006144295X
      1
      I've read all of Kimberla Lawson-Roby novels. ...
      NaN
    
    
      0316097543
      3
      This book almost made it.  The jump from starv...
      NaN



In [ ]:

    
with open(root+"/metadata.json") as f:
    for line in tqdm(f, total=get_num_lines(root+"/metadata.json")):
        json_data = ast.literal_eval(line)
        other_df = json_normalize(json_data)
        other_df['asin'] = other_df['asin'].astype('object')
        other_df.index = other_df['asin']
        other_df.drop(['asin'],axis=1,inplace=True)
               
        if not 'categories' in other_df.columns.values:
            other_df['categories'] = ''
           
        reviews_df.update(other_df)









    



  0%|          | 7736/9430088 [00:39<13:21:05, 196.03it/s]



In [ ]:

    
reviews_df



In [ ]:

    
sample_metadata_df = pd.read_json(root+"/sample_metadata.json",lines=True)



In [ ]:

	overall	reviewText	categories
asin
0441019420	4	i'm a big Patricia Briggs fan. i love her stor...	NaN
0316051632	5	The Disappearing Spoon by Sam KeanLittle, Brow...	NaN
038573901X	2	I enjoy a great romance novel now and again, a...	NaN
006144295X	1	I've read all of Kimberla Lawson-Roby novels. ...	NaN
0316097543	3	This book almost made it. The jump from starv...	NaN