notebook.community

Edit and run



In [55]:

    
import numpy as np
from sklearn.decomposition import PCA
import pickle



In [56]:

    
# Load the pickled master feature mapping and report how many entries it has.
# The context manager closes the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('master_features_filtered.p', 'rb') as features_file:
    all_features = pickle.load(features_file)
print(len(all_features))



In [57]:

    
# Remove the test data that AL created from the master features.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('test_features.p', 'rb') as test_file:
    al_test_features = pickle.load(test_file)
print(len(al_test_features))

# pop() with a default drops the key when it is present and does nothing
# otherwise, so no separate membership scan is needed for every key.
for key in al_test_features:
    all_features.pop(key, None)

print(len(all_features))



In [58]:

    
# Load the second pickled feature mapping and report how many entries it has.
# The context manager closes the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('paul_features.p', 'rb') as paul_file:
    paul_features = pickle.load(paul_file)
print(len(paul_features))



In [59]:

    
# Count the keys that appear in both feature mappings.
# Iterating a dict yields its keys, so set(d) is the set of keys.
set1 = set(paul_features)
set2 = set(all_features)
c = set1 & set2
print(len(c))



In [60]:

    
# Merge the master features into paul_features in place. For a key present in
# both mappings, the value from all_features replaces the existing one.
paul_features.update(all_features)
print(
    len(paul_features)
)



In [65]:

    
# Get all the recipes: load both pickled recipe mappings, closing each file
# as soon as it has been read.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('new_master_recipe.p', 'rb') as master_file:
    all_recipes = pickle.load(master_file)
with open('hunted_recipes2.p', 'rb') as hunted_file:
    new_recipes = pickle.load(hunted_file)

print(len(all_recipes))
print(len(new_recipes))

# Merge in place; for a key present in both, the entry from new_recipes wins.
all_recipes.update(new_recipes)
print(len(all_recipes))



In [67]:

    
new_dict = dict()

# Strip the boilerplate line "Add all ingredients to list" from every recipe.
# str.replace returns a NEW string (strings are immutable), so the result must
# be stored; calling it without using the return value leaves the text as-is.
# all_recipes itself is left unmodified.
for key, value in all_recipes.items():
    new_dict[key] = value.replace("Add all ingredients to list\n", "")

print(len(new_dict))
print(len(all_recipes))



In [72]:

    
# Keep only the ids that have BOTH image features and a recipe.
# Snapshot the keys as lists first: deleting from a dict while iterating over
# a live keys() view raises RuntimeError on Python 3.
image_keys = list(paul_features.keys())
recipe_keys = list(all_recipes.keys())

# A set gives constant-time membership tests instead of scanning a list of
# keys once per lookup.
shared_keys = set(image_keys) & set(recipe_keys)

for key in image_keys:
    if key not in shared_keys:
        del paul_features[key]

# Iterate over new_dict's own current keys so that re-running this cell does
# not fail with KeyError on entries that were already removed.
for key in list(new_dict.keys()):
    if key not in shared_keys:
        del new_dict[key]

print(len(paul_features))
print(len(new_dict))



In [73]:

    
# Sanity check: the feature mapping and the recipe mapping should share exactly
# the same keys, in which case the printed count of mismatched keys is 0.
set1 = set(paul_features)
set2 = set(new_dict)
print(len(set1 ^ set2))



In [83]:

    
# Split the dataset into training set, validation set and test set.
#
# The split follows the dictionaries' iteration order; nothing is shuffled
# here. dict.items() is materialised with list() because item views cannot be
# sliced on Python 3, and it is materialised once instead of once per slice.
TRAIN_SIZE = 35000  # number of entries that go to the training set
TEST_SIZE = 4399    # number of held-out entries that go to the test set

feature_items = list(paul_features.items())
d1 = dict(feature_items[TRAIN_SIZE:])
train = dict(feature_items[:TRAIN_SIZE])

print(len(train))

# Everything after the training slice is divided between test and validation.
holdout_items = list(d1.items())
val = dict(holdout_items[TEST_SIZE:])
test = dict(holdout_items[:TEST_SIZE])
# The features loaded from test_features.p are added to the test split.
test.update(al_test_features)
print(len(val))
print(len(test))



In [84]:

    
# Persist the three feature splits. Each file is opened in a context manager
# so it is flushed and closed as soon as the dump completes.
for path, split in (
    ("training_set_features.p", train),
    ("validation_set_features.p", val),
    ("test_set_features.p", test),
):
    with open(path, "wb") as out_file:
        pickle.dump(split, out_file)



In [78]:

    
# Build the recipe mapping for each split by looking up every feature key in
# the split it was assigned to.
val_recipe = dict()
train_recipe = dict()
test_recipe = dict()

# Membership is tested on the dicts directly (a hash lookup) rather than on
# .keys(), which on Python 2 builds and scans a full list for every key.
# NOTE(review): only keys of paul_features are visited, so any key that is in
# `test` but not in paul_features gets no entry in test_recipe - confirm that
# this is intended.
for key in paul_features:
    if key in train:
        train_recipe[key] = new_dict[key]
    elif key in val:
        val_recipe[key] = new_dict[key]
    elif key in test:
        test_recipe[key] = new_dict[key]

print(len(train_recipe))
print(len(val_recipe))
print(len(test_recipe))



In [79]:

    
# Persist the three recipe splits. Each file is opened in a context manager
# so it is flushed and closed as soon as the dump completes.
for path, recipes in (
    ("training_set_recipes.p", train_recipe),
    ("validation_set_recipes.p", val_recipe),
    ("test_set_recipes.p", test_recipe),
):
    with open(path, "wb") as out_file:
        pickle.dump(recipes, out_file)



In [80]:

    
# Report the size of each split and the length of one of its feature vectors.
# next(iter(d.values())) works on Python 2 and 3, unlike d.itervalues().next().
for label, split in (("training", train), ("test", test), ("val", val)):
    print("The length and shape of %s before PCA" % label)
    print(len(split))
    print(len(next(iter(split.values()))))

# Fit the projection on the training features only, then apply that same
# fitted transform to the test and validation features.
# list(...) is required because np.asarray on a Python 3 dict view produces a
# 0-d object array instead of a 2-D matrix.
# NOTE(review): if the installed scikit-learn selects a randomized SVD solver
# for this input, pass random_state to make the projection reproducible.
pca = PCA(copy=True, n_components=512, whiten=True)
pca_train_features = pca.fit_transform(np.asarray(list(train.values())))
pca_test_features = pca.transform(np.asarray(list(test.values())))
pca_val_features = pca.transform(np.asarray(list(val.values())))

# Rows follow each dict's iteration order; the keys are not kept with the arrays.
pca_outputs = (pca_train_features, pca_test_features, pca_val_features)

for reduced in pca_outputs:
    print(len(reduced))

for reduced in pca_outputs:
    print(type(reduced))









    



The length and shape of training before PCA
35000
2048
The length and shape of test before PCA
4399
2048
The length and shape of val before PCA
4399
2048
35000
4399
4399
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>



In [81]:

    
# Show the (rows, components) shape of each PCA-reduced array.
for reduced in (pca_train_features, pca_test_features, pca_val_features):
    print(reduced.shape)









    



(35000L, 512L)
(4399L, 512L)
(4399L, 512L)



In [82]:

    
# Persist the PCA-reduced arrays. Each file is opened in a context manager so
# it is flushed and closed as soon as the dump completes.
for path, reduced in (
    ("pca_train_features.p", pca_train_features),
    ("pca_test_features.p", pca_test_features),
    ("pca_val_features.p", pca_val_features),
):
    with open(path, "wb") as out_file:
        pickle.dump(reduced, out_file)



In [ ]: