In [1]:
import pandas as pd

In [2]:
def parse(path):
    """Lazily yield one decoded record per non-blank line of the file at ``path``.

    The file is opened in binary mode and every line is evaluated as a Python
    expression (for example a dict literal); the resulting object is yielded.
    Blank or whitespace-only lines are skipped, because evaluating an empty
    expression raises ``SyntaxError``.

    SECURITY: the lines are evaluated with ``eval``, which executes arbitrary
    Python code. Only call this on files from a trusted source.

    Parameters
    ----------
    path : str
        Location of the line-delimited data file.

    Yields
    ------
    object
        Whatever each line evaluates to.
    """
    with open(path, 'rb') as f:
        for line in f:
            # A trailing empty line (or any blank line) is not a record.
            if not line.strip():
                continue
            # FIXME(security): eval() runs whatever code the line contains;
            # kept as-is because the expected input is Python-literal syntax.
            yield eval(line)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the stream, and that
    position becomes the row index of the returned frame.
    """
    indexed_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(indexed_records, orient='index')

In [3]:
# Load the Toys & Games reviews file into a DataFrame (one row per line of the file).
reviews = getDF('./data/Toys_and_Games_5.json')

In [10]:
### WARNING: the metadata file is larger than 3 GB, and getDF builds the
### whole table in memory, so this cell is slow and memory-hungry.
metadata = getDF('./data/metadata.json')

In [14]:
# Inspect the size of the loaded metadata frame as (rows, columns).
metadata.shape


Out[14]:
(9430088, 9)

In [15]:
import numpy as np
import urllib
import cv2

def url_to_image(url):
    """Download the image at ``url`` and decode it with OpenCV.

    Parameters
    ----------
    url : str
        Address of the image to fetch.

    Returns
    -------
    numpy.ndarray or None
        The result of ``cv2.imdecode(..., cv2.IMREAD_COLOR)``. OpenCV stores
        colour channels in BGR order and returns ``None`` when the downloaded
        bytes cannot be decoded as an image.

    Raises
    ------
    urllib.error.URLError
        If the URL cannot be opened (raised by ``urlopen``).
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as resp:
        payload = resp.read()

    # View the downloaded bytes as a flat uint8 buffer for the decoder.
    encoded = np.frombuffer(payload, dtype='uint8')
    return cv2.imdecode(encoded, cv2.IMREAD_COLOR)

In [31]:
### Use a pre-trained CNN (VGG19) to extract image features
from keras.applications.vgg19 import VGG19
from keras.preprocessing import image
from keras.applications.vgg19 import preprocess_input
from keras.models import Model
import numpy as np

# Load VGG19 with its ImageNet weights; it is only used to produce features.
base_model = VGG19(weights='imagenet')
# Open question: which layer should the features come from, block5_pool or fc2?
# Per image, block5_pool flattens to 7*7*512 = 25,088 values, whereas fc2
# yields 4,096 values, so fc2 features are roughly six times smaller to store.
# Alternative (block5_pool features):
#model = Model(inputs=base_model.input, outputs=base_model.get_layer('block5_pool').output)
# Currently selected: expose the fc2 layer's activations as the model output.
model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output)

In [35]:
#### TEST: run the feature-extraction pipeline on the first metadata row only
row = metadata.iloc[0]
image_features = {}

asin = row['asin']
url = row['imUrl']

img = url_to_image(url)

# OpenCV decodes images in BGR channel order, but Keras' VGG19
# preprocess_input expects RGB input (it performs the RGB->BGR swap itself).
# Convert first so the network does not see swapped colour channels.
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
# VGG19 with ImageNet weights takes 224x224 inputs.
img = cv2.resize(img, (224, 224))

x = image.img_to_array(img)
# Add the leading batch axis: one image -> batch of size 1.
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)
fc2_features = model.predict(x)

# Flatten the (1, 4096) prediction into a single 4096-long feature vector.
image_features[asin] = fc2_features.reshape(4096,)

# One row per product (indexed by asin), one column per feature value.
img_df = pd.DataFrame.from_dict(image_features, orient = 'index')

In [ ]:
image_features = {}
# asins whose image could not be fetched or decoded; inspect after the run.
failed_asins = []
### WARN it takes very long time to run this
# Iterate over every row. (Looping over ``next(metadata.iterrows())`` would
# walk the single (index, row) tuple of the first row instead of the rows.)
for _, row in metadata.iterrows():
    asin = row['asin']
    url = row['imUrl']

    # Rows without a usable URL (e.g. NaN) cannot be downloaded.
    if not isinstance(url, str) or not url:
        failed_asins.append(asin)
        continue

    try:
        img = url_to_image(url)
    except (OSError, ValueError):
        # OSError covers urllib's URLError/HTTPError and socket failures;
        # ValueError is raised for malformed URLs. Record and move on so one
        # bad link does not abort the whole (very long) run.
        failed_asins.append(asin)
        continue

    # cv2.imdecode returns None when the payload is not a decodable image.
    if img is None:
        failed_asins.append(asin)
        continue

    # OpenCV decodes to BGR, but Keras' VGG19 preprocess_input expects RGB
    # input (it performs the RGB->BGR swap itself), so convert first.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # VGG19 with ImageNet weights takes 224x224 inputs.
    img = cv2.resize(img, (224, 224))

    x = image.img_to_array(img)
    # Add the leading batch axis: one image -> batch of size 1.
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    fc2_features = model.predict(x)
    # Flatten the (1, 4096) prediction into a single 4096-long vector.
    image_features[asin] = fc2_features.reshape(4096,)
    # If the model is switched to block5_pool, flatten with
    # reshape(7*7*512,) instead.

### Convert the dictionary into a pandas DataFrame
img_df = pd.DataFrame.from_dict(image_features, orient = 'index')