In [55]:
import numpy as np
from sklearn.decomposition import PCA
import pickle

In [56]:
# Load the master image-feature table; later cells use it as a dict keyed by id.
# The `with` block closes the file deterministically instead of relying on
# garbage collection.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('master_features_filtered.p', 'rb') as fh:
    all_features = pickle.load(fh)
print(len(all_features))


36377

In [57]:
# Drop from all_features every id that belongs to the test set AL created.
with open('test_features.p', 'rb') as fh:
    al_test_features = pickle.load(fh)
print(len(al_test_features))

# pop() with a default removes the id when present and is a no-op otherwise,
# which avoids scanning a list of all keys for every test id.
for key in al_test_features:
    all_features.pop(key, None)

print(len(all_features))


136
36282

In [58]:
# Load the second feature table; later cells use it as a dict keyed by id.
# The `with` block guarantees the file handle is closed after loading.
with open('paul_features.p', 'rb') as fh:
    paul_features = pickle.load(fh)
print(len(paul_features))


10493

In [59]:
# Count the ids that appear in both feature tables.
set1 = set(paul_features)
set2 = set(all_features)
c = set1 & set2
print(len(c))


2522

In [60]:
# Merge all_features into paul_features in place. For ids present in both
# tables the entry from all_features replaces the one in paul_features.
paul_features.update(all_features)
merged_count = len(paul_features)
print(merged_count)


44253

In [65]:
# Load both recipe tables; `with` closes each file as soon as it is read.
with open('new_master_recipe.p', 'rb') as fh:
    all_recipes = pickle.load(fh)
with open('hunted_recipes2.p', 'rb') as fh:
    new_recipes = pickle.load(fh)

print(len(all_recipes))
print(len(new_recipes))

# In-place merge: for ids present in both, the entry from new_recipes wins.
all_recipes.update(new_recipes)
print(len(all_recipes))


36377
7490
43867

In [67]:
# Build a cleaned copy of the recipes with the boilerplate line removed.
new_dict = dict()

for key, value in all_recipes.items():
    # str.replace returns a new string and leaves `value` untouched, so the
    # result has to be stored; otherwise the boilerplate is never stripped.
    new_dict[key] = value.replace("Add all ingredients to list\n", "")

# Both counts should match: cleaning changes values, not the set of ids.
print(len(new_dict))
print(len(all_recipes))


43867
43867

In [72]:
# Keep only the ids that have both an image-feature vector and a recipe.
# The key lists are snapshots taken before anything is deleted, so the dicts
# can be mutated safely while looping; the sets give O(1) membership tests.
image_keys = list(paul_features)
recipe_keys = list(all_recipes)
image_ids = set(image_keys)
recipe_ids = set(recipe_keys)

for key in image_keys:
    if key not in recipe_ids:
        del paul_features[key]

for key in recipe_keys:
    if key not in image_ids:
        # pop() instead of del: all_recipes is never pruned, so re-running
        # this cell revisits ids that were already removed from new_dict.
        new_dict.pop(key, None)

print(len(paul_features))
print(len(new_dict))


43798
43798

In [73]:
# Sanity check: count the ids found in only one of the two tables.
# A printed 0 means the feature and recipe tables hold exactly the same ids.
set1 = set(paul_features)
set2 = set(new_dict)
print(len(set1 ^ set2))


0

In [83]:
# Split the dataset into training set, validation set and test set.
TRAIN_SIZE = 35000   # number of items that go to the training set
TEST_SIZE = 4399     # number of held-out items that go to the test set

# Snapshot the items once; list() also makes the slicing work where
# dict.items() returns a view instead of a list.
feature_items = list(paul_features.items())
train = dict(feature_items[:TRAIN_SIZE])
d1 = dict(feature_items[TRAIN_SIZE:])

print(len(train))

# The remainder is divided between test (first TEST_SIZE) and validation.
held_out_items = list(d1.items())
val = dict(held_out_items[TEST_SIZE:])
test = dict(held_out_items[:TEST_SIZE])

# Add AL's test features to our test split.
# NOTE(review): AL's ids were removed from all_features only, not from
# paul_features, so some of them may also sit in train/val -- confirm.
test.update(al_test_features)
print(len(val))
print(len(test))


35000
4399
4529

In [84]:
# Persist the three feature splits. Each file is opened in a `with` block so
# the data is flushed and the handle closed before the next one is written.
for split, path in ((train, "training_set_features.p"),
                    (val, "validation_set_features.p"),
                    (test, "test_set_features.p")):
    with open(path, "wb") as fh:
        pickle.dump(split, fh)

In [78]:
# Route every recipe into the same split as its image features.
train_recipe = dict()
val_recipe = dict()
test_recipe = dict()

# Test membership on the dicts themselves (hash lookup) instead of building
# and scanning a full key list for every id.
# Only ids in paul_features are visited, so ids that exist solely in `test`
# (e.g. added from al_test_features) get no entry in test_recipe.
for key in paul_features:
    if key in train:
        train_recipe[key] = new_dict[key]
    elif key in val:
        val_recipe[key] = new_dict[key]
    elif key in test:
        test_recipe[key] = new_dict[key]

print(len(train_recipe))
print(len(val_recipe))
print(len(test_recipe))


35000
4399
4399

In [79]:
# Persist the three recipe splits, closing each file as soon as it is written.
for split, path in ((train_recipe, "training_set_recipes.p"),
                    (val_recipe, "validation_set_recipes.p"),
                    (test_recipe, "test_set_recipes.p")):
    with open(path, "wb") as fh:
        pickle.dump(split, fh)

In [80]:
# Report the number of samples and the length of one feature vector per split.
# next(iter(d.values())) fetches an arbitrary value on both Python 2 and 3.
print("The length and shape of training before PCA")
print(len(train))
print(len(next(iter(train.values()))))
print("The length and shape of test before PCA")
print(len(test))
print(len(next(iter(test.values()))))

print("The length and shape of val before PCA")
print(len(val))
print(len(next(iter(val.values()))))

# Fit the whitened 512-component PCA on the training split only, then apply
# the same projection to the test and validation splits.
# list(...) materialises the values so np.asarray builds a 2-D array even
# where dict.values() returns a view rather than a list.
# NOTE(review): rows follow each dict's iteration order and the ids are not
# stored with the arrays -- confirm downstream code can re-align them.
pca = PCA(copy=True, n_components=512, whiten=True)
pca_train_features = pca.fit_transform(np.asarray(list(train.values())))
pca_test_features =  pca.transform(np.asarray(list(test.values())))
pca_val_features =  pca.transform(np.asarray(list(val.values())))

print(len(pca_train_features))
print(len(pca_test_features))
print(len(pca_val_features))

print(type(pca_train_features))
print(type(pca_test_features))
print(type(pca_val_features))


The length and shape of training before PCA
35000
2048
The length and shape of test before PCA
4399
2048
The length and shape of val before PCA
4399
2048
35000
4399
4399
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>

In [81]:
# Show the (rows, components) shape of each PCA-reduced split.
for reduced in (pca_train_features, pca_test_features, pca_val_features):
    print(reduced.shape)


(35000L, 512L)
(4399L, 512L)
(4399L, 512L)

In [82]:
# Persist the PCA-reduced arrays, closing each file as soon as it is written.
for reduced, path in ((pca_train_features, "pca_train_features.p"),
                      (pca_test_features, "pca_test_features.p"),
                      (pca_val_features, "pca_val_features.p")):
    with open(path, "wb") as fh:
        pickle.dump(reduced, fh)

In [ ]: