In [55]:
import numpy as np
from sklearn.decomposition import PCA
import pickle
In [56]:
all_features = pickle.load(open('master_features_filtered.p', 'rb'))
print(len(all_features))
In [57]:
# Remove the test data that AL created
al_test_features = pickle.load(open('test_features.p', 'rb'))
print(len(al_test_features))
for key in al_test_features.keys():
    if key in all_features:
        del all_features[key]
print(len(all_features))
In [58]:
paul_features = pickle.load(open('paul_features.p', 'rb'))
print(len(paul_features))
In [59]:
set1 = set(paul_features.keys())
set2 = set(all_features.keys())
c = set1.intersection(set2)
print(len(c))
In [60]:
paul_features.update(all_features)
print(len(paul_features))
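Since dict.update lets the argument win, the overlapping ids found above now carry the all_features vectors. A quick check (a sketch, assuming the features are array-like):
In [ ]:
# Sanity check (sketch): overlapping ids should now hold the all_features values
for key in c:
    assert np.array_equal(paul_features[key], all_features[key])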
In [65]:
# Gather all the recipes; on duplicate ids, update keeps the newly hunted version
all_recipes = pickle.load(open('new_master_recipe.p', 'rb'))
new_recipes = pickle.load(open('hunted_recipes2.p', 'rb'))
print(len(all_recipes))
print(len(new_recipes))
all_recipes.update(new_recipes)
print(len(all_recipes))
In [67]:
new_dict = dict()
# Strip the boilerplate line "Add all ingredients to list" from every recipe.
# str.replace returns a new string, so the result must be assigned.
for key in all_recipes.keys():
    value = all_recipes[key].replace("Add all ingredients to list\n", "")
    new_dict[key] = value
print(len(new_dict))
print(len(all_recipes))
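The original cell discarded the result of str.replace, so a quick check that the boilerplate is really gone; a nonzero count would mean some occurrence lacked the trailing newline:
In [ ]:
# Count recipes that still contain the boilerplate phrase (expect 0)
print(sum("Add all ingredients to list" in r for r in new_dict.values()))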
In [72]:
# Keep only the ids that have both an image feature vector and a recipe.
# (Deleting while iterating a live keys view breaks in Python 3, so the
# key sets are materialized first.)
image_keys = set(paul_features.keys())
recipe_keys = set(all_recipes.keys())
for key in image_keys - recipe_keys:
    del paul_features[key]
for key in recipe_keys - image_keys:
    del new_dict[key]
print(len(paul_features))
print(len(new_dict))
In [73]:
# Cross-check that the recipe ids now match the image feature ids exactly (expect 0)
set1 = set(paul_features.keys())
set2 = set(new_dict.keys())
print(len(set1.symmetric_difference(set2)))
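The pruning two cells up can also be written as a single intersection plus two dict comprehensions; a sketch:
In [ ]:
# Equivalent one-pass alignment (sketch): keep only ids present on both sides
common = set(paul_features) & set(new_dict)
paul_features = {k: paul_features[k] for k in common}
new_dict = {k: new_dict[k] for k in common}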
In [83]:
# Split the dataset into training, validation and test sets
items = list(paul_features.items())
train = dict(items[:35000])
d1 = items[35000:]
test = dict(d1[:4399])
val = dict(d1[4399:])
test.update(al_test_features)  # fold AL's held-out examples back into the test set
print(len(train))
print(len(val))
print(len(test))
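Slicing items() splits in whatever order the dict happens to iterate, so the split is not reproducible across runs. A seeded alternative (a sketch; the seed value is an assumption):
In [ ]:
# Reproducible split (sketch): shuffle sorted ids with a seeded RNG, then slice
import random
ids = sorted(paul_features)
random.Random(0).shuffle(ids)
train_ids, rest_ids = ids[:35000], ids[35000:]
test_ids, val_ids = rest_ids[:4399], rest_ids[4399:]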
In [84]:
pickle.dump(train, open("training_set_features.p", "wb"))
pickle.dump(val, open("validation_set_features.p", "wb"))
pickle.dump(test, open("test_set_features.p", "wb"))
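For dicts this large, the default pickle protocol produces larger, slower dumps; passing protocol=pickle.HIGHEST_PROTOCOL is one option (a sketch, reusing the same filename):
In [ ]:
# Optional: highest pickle protocol gives smaller, faster dumps
with open("training_set_features.p", "wb") as f:
    pickle.dump(train, f, protocol=pickle.HIGHEST_PROTOCOL)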
In [78]:
# Split the recipes along the same ids as the image-feature splits
train_recipe = dict()
val_recipe = dict()
test_recipe = dict()
for key in paul_features.keys():
    if key in train:
        train_recipe[key] = new_dict[key]
    elif key in val:
        val_recipe[key] = new_dict[key]
    elif key in test:
        test_recipe[key] = new_dict[key]
print(len(train_recipe))
print(len(val_recipe))
print(len(test_recipe))
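A quick consistency check on the recipe splits (a sketch): train and val should match their feature splits exactly, while the test set can differ by AL's held-out examples, which have no recipe here.
In [ ]:
# Sanity check (sketch): recipe splits should line up with the feature splits
print(set(train_recipe) == set(train))
print(set(val_recipe) == set(val))
print(len(set(test) - set(test_recipe)))  # AL's held-out ids, if any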
In [79]:
pickle.dump(train_recipe, open("training_set_recipes.p", "wb"))
pickle.dump(val_recipe, open("validation_set_recipes.p", "wb"))
pickle.dump(test_recipe, open("test_set_recipes.p", "wb"))
In [80]:
print("The length and shape of training before PCA")
print(len(train))
print(len(train.itervalues().next()))
print("The length and shape of test before PCA")
print(len(test))
print(len(test.itervalues().next()))
print("The length and shape of val before PCA")
print(len(val))
print(len(val.itervalues().next()))
pca = PCA(copy=True, n_components=512, whiten=True)
pca_train_features = pca.fit_transform(np.asarray(train.values()))
pca_test_features = pca.transform(np.asarray(test.values()))
pca_val_features = pca.transform(np.asarray(val.values()))
print(len(pca_train_features))
print(len(pca_test_features))
print(len(pca_val_features))
print(type(pca_train_features))
print(type(pca_test_features))
print(type(pca_val_features))
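With the PCA fitted, explained_variance_ratio_ shows how much of the training-set variance the 512 components retain:
In [ ]:
# Fraction of training-set variance kept by the 512 components
print(pca.explained_variance_ratio_.sum())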
In [81]:
print(np.shape(pca_train_features))
print(np.shape(pca_test_features))
print(np.shape(pca_val_features))
In [82]:
pickle.dump(pca_train_features, open("pca_train_features.p", "wb"))
pickle.dump(pca_test_features, open("pca_test_features.p", "wb"))
pickle.dump(pca_val_features, open("pca_val_features.p", "wb"))
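The PCA arrays are plain matrices with no ids attached; their rows follow the dicts' iteration order, so saving the key lists alongside preserves the row-to-recipe mapping (a sketch; the filenames are assumptions):
In [ ]:
# Save the id order so each PCA row can be mapped back to its recipe id
pickle.dump(list(train.keys()), open("pca_train_keys.p", "wb"))
pickle.dump(list(test.keys()), open("pca_test_keys.p", "wb"))
pickle.dump(list(val.keys()), open("pca_val_keys.p", "wb"))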
In [ ]: