In [1]:
import pandas as pd
In [2]:
def parse(path):
    """Lazily yield one parsed record per non-blank line of a data file.

    Each line is expected to hold a single Python/JSON-style literal
    (presumably a dict per record -- confirm against the dataset).

    Parameters
    ----------
    path : str
        Location of the line-delimited file to read.

    Yields
    ------
    object
        The literal value parsed from each non-blank line, in file order.

    Raises
    ------
    ValueError, SyntaxError
        If a line is not a valid Python literal.
    """
    import ast  # local import: only this parser needs it

    with open(path, 'rb') as f:
        for line in f:
            # 'utf-8-sig' decodes plain UTF-8 and also drops a leading BOM.
            text = line.decode('utf-8-sig').strip()
            if not text:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            # NOTE(security): this used to be eval(line), which executes any
            # expression embedded in the data file. literal_eval accepts only
            # literals (dict, list, str, numbers, True/False/None) and raises
            # ValueError for anything else. If the data genuinely relies on
            # non-literal expressions, revert deliberately.
            yield ast.literal_eval(text)
def getDF(path):
    """Build a DataFrame from every record that ``parse(path)`` yields.

    Records are keyed by their position in the file (0, 1, 2, ...), and
    those positions become the row labels of the returned frame.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')
In [3]:
# Load the Toys & Games review file into a DataFrame (one row per parsed record).
reviews = getDF('./data/Toys_and_Games_5.json')
In [10]:
### WARNING: per the author's note the metadata file is larger than 3 GB, and
### getDF collects every record in memory before building the DataFrame.
metadata = getDF('./data/metadata.json')
In [14]:
# Show the (rows, columns) dimensions of the loaded metadata frame.
metadata.shape
Out[14]:
In [15]:
import numpy as np
import urllib
import cv2
def url_to_image(url):
    """Download the resource at ``url`` and decode it as an OpenCV colour image.

    Parameters
    ----------
    url : str
        Address of the image to fetch.

    Returns
    -------
    numpy.ndarray or None
        Whatever ``cv2.imdecode(..., cv2.IMREAD_COLOR)`` returns: a 3-channel
        image in OpenCV's BGR channel order, or ``None`` when the downloaded
        bytes cannot be decoded as an image (per the OpenCV documentation).

    Raises
    ------
    Any exception raised by ``urllib.request.urlopen`` (for example
    ``urllib.error.URLError``) propagates to the caller.
    """
    # A bare `import urllib` does not load the `request` submodule, so import
    # it explicitly rather than relying on another library having done so.
    import urllib.request

    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as resp:
        raw_bytes = resp.read()

    buffer = np.asarray(bytearray(raw_bytes), dtype='uint8')
    return cv2.imdecode(buffer, cv2.IMREAD_COLOR)
In [31]:
### Use a pre-trained CNN (VGG19) to extract image features
from keras.applications.vgg19 import VGG19
from keras.preprocessing import image
from keras.applications.vgg19 import preprocess_input
from keras.models import Model
import numpy as np
# Load VGG19 with its ImageNet-pretrained weights.
base_model = VGG19(weights='imagenet')
# Open question: which layer should supply the feature, block5_pool or fc2?
# Author's note: for a single image the block5_pool output is 24.5KB, whereas the fc2 output is 4KB.
# Alternative (disabled): take the block5_pool output instead.
#model = Model(inputs=base_model.input, outputs=base_model.get_layer('block5_pool').output)
# Feature extractor: same input as VGG19, output taken from the 'fc2' layer.
model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output)
In [35]:
#### TEST: run the feature-extraction pipeline on the first metadata row only
row = metadata.iloc[0]
image_features = {}
asin = row['asin']
url = row['imUrl']
img = url_to_image(url)
# cv2 decodes images in BGR channel order, but VGG19's preprocess_input
# expects RGB input (it performs its own RGB->BGR conversion). Convert first,
# otherwise the red and blue channels reach the network swapped.
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
# 224x224 is the input size of VGG19 when the fully-connected top is included.
img = cv2.resize(img, (224, 224))
x = image.img_to_array(img)
# Add the leading batch axis that model.predict expects.
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)
fc2_features = model.predict(x)
# Flatten the (1, 4096) fc2 output into a single feature vector.
image_features[asin] = fc2_features.reshape(4096,)
# One row per product id (asin), one column per feature dimension.
img_df = pd.DataFrame.from_dict(image_features, orient = 'index')
In [ ]:
image_features = {}
# asins whose image was missing, unreachable, or undecodable.
skipped_asins = []
### WARN it takes very long time to run this
# Iterate over every (index, row) pair. The previous
# `for row in next(metadata.iterrows())` walked the first (index, row) tuple
# itself, so `row['asin']` failed on the index value.
for _, row in metadata.iterrows():
    asin = row['asin']
    url = row['imUrl']
    # Presumably rows without an image hold NaN here -- confirm against the data.
    if not isinstance(url, str):
        skipped_asins.append(asin)
        continue
    try:
        img = url_to_image(url)
    except (OSError, ValueError):
        # OSError covers urllib's URLError/HTTPError and socket failures;
        # ValueError is raised by urlopen for malformed URLs. One bad link
        # should not abort a run that takes this long.
        skipped_asins.append(asin)
        continue
    if img is None:
        # cv2.imdecode returns None when the bytes are not a decodable image.
        skipped_asins.append(asin)
        continue
    # cv2 decodes in BGR order; VGG19's preprocess_input expects RGB input.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    fc2_features = model.predict(x)
    image_features[asin] = fc2_features.reshape(4096,)
    # If `model` is switched to the block5_pool layer, flatten with
    # reshape(7*7*512,) instead of reshape(4096,).
### Convert the dictionary into a pandas DataFrame
img_df = pd.DataFrame.from_dict(image_features, orient = 'index')