In [1]:
import numpy as np
import pandas as pd
In [44]:
# list the CSV files in the working directory (IPython shell magic)
files = !ls *.csv
# load each file into its own DataFrame
df_list = [pd.read_csv(file) for file in files]
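A portable alternative to the `!ls` shell magic, for anyone running this outside IPython; a minimal sketch using the standard-library glob module:
In [ ]:
import glob

# glob works on any platform and does not require the IPython shell
files = sorted(glob.glob('*.csv'))
df_list = [pd.read_csv(f) for f in files]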
In [45]:
files
Out[45]:
In [279]:
# stack the DataFrames; ignore_index avoids duplicate row labels across the source files
data = pd.concat(df_list, axis=0, ignore_index=True)
In [280]:
print(data.shape)
data.head(2)
Out[280]:
In [281]:
# label-encode the course column into an integer target
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
data['course_num'] = le.fit_transform(data.course)
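To check which integer was assigned to which course, `le.classes_` is ordered by encoded value and `inverse_transform` recovers the original labels:
In [ ]:
# le.classes_[i] is the course that was encoded as integer i
for i, course in enumerate(le.classes_):
    print(i, course)

# recover the original labels for the first few rows
print(le.inverse_transform(data['course_num'].values[:5]))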
In [282]:
# lowercase all column headers
data.columns = [col.lower() for col in data.columns]
In [283]:
data.isnull().sum()
Out[283]:
In [284]:
# drop the sparsely populated time columns and the now-encoded course column
data = data.drop(['cooktimeinseconds', 'preptimeinseconds', 'course'], axis=1)
In [285]:
# # exploratory check for rows missing both time columns (would need to run before the drop above);
# # pd.isnull() is used because the row values here are scalars
# for index, row in data.iterrows():
#     if pd.isnull(row['cooktimeinseconds']) and pd.isnull(row['preptimeinseconds']):
#         print(row)
In [286]:
# clean the ingredient column
import string
# strip non-ASCII characters from each ingredient string
printable = set(string.printable)
ingredient = [''.join(ch for ch in z if ch in printable) for z in data['ingredient_list']]
In [287]:
ingredient[0]
Out[287]:
In [288]:
# remove the enclosing brackets
ingredient = [z[1:-1] for z in ingredient]
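Slicing off the first and last character drops the outer brackets but leaves any quotes and commas in place. If ingredient_list is a stringified Python list (an assumption about the data format), the standard-library ast module gives a more robust parse; a sketch:
In [ ]:
import ast

# parse the string back into a real list, then rejoin into clean text;
# assumes entries look like "['salt', 'pepper']"
def parse_ingredients(s):
    try:
        return ' '.join(ast.literal_eval(s))
    except (ValueError, SyntaxError):
        return s  # fall back to the raw string if it is not a valid literal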
In [289]:
ingredient[0]
Out[289]:
In [290]:
# tokenize each ingredient string on whitespace
ingredient = [z.split(" ") for z in ingredient]
In [291]:
ingredient[0]
Out[291]:
In [292]:
# perform word lemmatization (requires the WordNet corpus: nltk.download('wordnet'))
from nltk import stem
wordnet_lemm = stem.WordNetLemmatizer()
lem_ingredient = []  # lemmatized token list for each recipe
for ing in ingredient:
    lem_ingredient.append([wordnet_lemm.lemmatize(w) for w in ing])
# convert back to a single string per recipe for vectorization
lem_ingredient = [' '.join(z) for z in lem_ingredient]
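A quick sanity check of what the lemmatizer does to plural ingredient words (illustrative inputs, not drawn from the dataset):
In [ ]:
# WordNetLemmatizer treats words as nouns by default
print(wordnet_lemm.lemmatize('tomatoes'))  # tomato
print(wordnet_lemm.lemmatize('cloves'))    # clove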
In [293]:
data['ingredient_lem'] = lem_ingredient
data = data.drop('ingredient_list', axis = 1)
data.head(2)
Out[293]:
In [294]:
# vectorize the lemmatized ingredient text
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
# fit/transform left commented out; the dense matrix is not used below
# corpus = data['ingredient_lem']
# dense = vect.fit_transform(corpus).todense()
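The split below keeps ingredient_lem as a raw string column, which most sklearn estimators cannot consume directly. A minimal sketch, not part of the original pipeline, of one way to combine the TF-IDF features with the numeric columns via scipy.sparse.hstack:
In [ ]:
from scipy.sparse import hstack

# sparse TF-IDF matrix for the ingredient text
text_features = vect.fit_transform(data['ingredient_lem'])

# numeric feature columns, matching the split below
numeric_cols = ['rating', 'bitter', 'meaty', 'piquant', 'salty', 'sour',
                'sweet', 'ingredientcount', 'numberofservings', 'totaltimeinseconds']
numeric_features = data[numeric_cols].values

# stack side by side into one sparse feature matrix
X_combined = hstack([text_features, numeric_features])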
In [301]:
# create a validation set
from sklearn import model_selection  # sklearn.cross_validation was renamed to model_selection in 0.18
validation_size = 0.20
X = data[['rating', 'bitter', 'meaty', 'piquant', 'salty', 'sour', 'sweet', 'ingredientcount', 'numberofservings',
          'totaltimeinseconds', 'ingredient_lem']]
print(X.shape)
Y = data['course_num']
print(Y.shape)
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=10)
print(X_train.shape)
print(Y_train.shape)
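With several course classes, a stratified split keeps the class proportions similar in train and validation; a minimal variant of the call above:
In [ ]:
# stratify on the target so rare courses appear in both sets
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=10, stratify=Y)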
In [ ]: