In [368]:
import numpy as np
import pandas as pd
import csv
In [369]:
# list the csv files in the working directory (IPython shell magic)
files = !ls *.csv
# load each file into its own DataFrame
df_list = [pd.read_csv(file) for file in files]
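Outside IPython the !ls shell magic is unavailable; a portable sketch using the standard glob module would do the same job:

import glob

# plain-Python equivalent of the shell magic above
files = sorted(glob.glob('*.csv'))
df_list = [pd.read_csv(f) for f in files]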
In [370]:
files
Out[370]:
In [371]:
# stack the individual DataFrames; ignore_index avoids duplicate row labels
data = pd.concat(df_list, axis=0, ignore_index=True)
In [372]:
print(data.shape)
data.head(2)
Out[372]:
In [373]:
# label-encode the course column as integers
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
data['course_num'] = le.fit_transform(data.course)
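To interpret the encoded values later, the fitted encoder exposes the label-to-integer mapping; a quick sketch:

# map each course label to the integer LabelEncoder assigned it
course_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(course_mapping)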
In [374]:
# lowercase all column headers
data.columns = [col.lower() for col in data.columns]
In [375]:
data.isnull().sum()
Out[375]:
In [376]:
# drop the mostly-null time columns and the original course column (now encoded)
data = data.drop(['cooktimeinseconds', 'preptimeinseconds', 'course'], axis=1)
In [378]:
# exploratory check, kept for reference (must run before the drop above):
# for index, row in data.iterrows():
#     if pd.isnull(row['cooktimeinseconds']) and pd.isnull(row['preptimeinseconds']):
#         print(row)
In [379]:
# clean the ingredient column
import string
# strip non-ASCII characters from each ingredient string
printable = set(string.printable)
ingredient = [''.join(c for c in z if c in printable) for z in data['ingredient_list']]
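In newer pandas the same cleanup can be done in one vectorized pass (a sketch, assuming ingredient_list holds plain strings):

# drop any character outside the ASCII range
ingredient = data['ingredient_list'].str.replace(r'[^\x00-\x7F]', '', regex=True).tolist()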
In [380]:
ingredient[0]
Out[380]:
In [381]:
# remove the surrounding brackets
ingredient = [z[1:-1] for z in ingredient]
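Since ingredient_list looks like the string form of a Python list, ast.literal_eval is a more robust way to recover the items than stripping brackets and splitting on spaces; a sketch, assuming the cleaned strings are still valid list literals:

import ast

# parse each string back into an actual Python list of ingredients
parsed = [ast.literal_eval(z) for z in data['ingredient_list']]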
In [382]:
ingredient[0]
Out[382]:
In [383]:
# tokenize each ingredient string on whitespace
ingredient = [z.split(" ") for z in ingredient]
In [384]:
ingredient[0]
Out[384]:
In [385]:
# perform word lemmatization
from nltk import stem
wordnet_lemm = stem.WordNetLemmatizer()
lem_ingredient = []  # lemmatized ingredient tokens for each recipe
for ing in ingredient:
    lem_ingredient.append([wordnet_lemm.lemmatize(w) for w in ing])
# join back into a single string per recipe for vectorization
lem_ingredient = [' '.join(z) for z in lem_ingredient]
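WordNetLemmatizer needs the WordNet corpus on disk; if it raises a LookupError, a one-time download fixes it:

import nltk

# fetch the WordNet data the lemmatizer depends on
nltk.download('wordnet')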
In [386]:
# attach the lemmatized text and drop the raw column
data['ingredient_lem'] = lem_ingredient
data = data.drop('ingredient_list', axis=1)
data.head(2)
Out[386]:
In [387]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
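The vectorizer is left at its defaults here; common tweaks (a hypothetical alternative, not what this notebook fits below) include stop-word removal, a minimum document frequency, and bigrams:

# alternative configuration for comparison, not the vect used below
vect_alt = TfidfVectorizer(stop_words='english', min_df=2, ngram_range=(1, 2))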
In [388]:
from sklearn import cross_validation  # renamed sklearn.model_selection in newer versions
X = data['ingredient_lem']
print(X.shape)
Y = data['course_num']
print(Y.shape)
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, random_state=10)
print(X_train.shape)
print(Y_train.shape)
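Under sklearn 0.18+ the same split would come from model_selection; a sketch of the equivalent call:

from sklearn.model_selection import train_test_split

# identical split under the newer API; stratify=Y would additionally
# preserve the course proportions in both halves
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=10)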
In [389]:
# vectorize: fit the vocabulary on the training text only, then
# apply the same transform to the test text
X_train_vect = vect.fit_transform(X_train)
X_train_vect = X_train_vect.toarray()
X_test_vect = vect.transform(X_test)
X_test_vect = X_test_vect.toarray()
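.toarray() materializes a dense matrix, which can be costly for large vocabularies; most sklearn estimators accept the sparse output directly, so a leaner sketch keeps it sparse:

# keep the TF-IDF matrices sparse; densify only where a DataFrame is needed
X_train_sparse = vect.fit_transform(X_train)
X_test_sparse = vect.transform(X_test)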
In [390]:
# create a list of the feature (vocabulary) names
features = vect.get_feature_names()  # get_feature_names_out() in newer sklearn
len(features)
Out[390]:
In [402]:
# create DataFrames from the vectorized matrices; .values strips the original
# index so the labels align positionally with the new rows
train_vect_df = pd.DataFrame(data=X_train_vect, columns=features)
train_vect_df['course_num'] = Y_train.values
test_vect_df = pd.DataFrame(data=X_test_vect, columns=features)
test_vect_df['course_num'] = Y_test.values
print(train_vect_df.shape)
print(test_vect_df.shape)
In [406]:
# train/test split the other columns from the main dataframe; reusing
# random_state=10 makes the rows pair up with the vectorized split above
array = data.values
_X = array[:, 1:11]
print(_X.shape)
Y = data['course_num']
print(Y.shape)
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(_X, Y, random_state=10)
print(X_train.shape)
print(Y_train.shape)
In [407]:
# wrap the split arrays in labelled DataFrames for the training and test sets
train_df = pd.DataFrame(data=X_train, columns=list(data.columns[1:11]))
test_df = pd.DataFrame(data=X_test, columns=list(data.columns[1:11]))
print(train_df.shape)
In [408]:
# combine the DataFrames for the training set and for the test set, so each
# new DataFrame holds the vectorized columns and the selected features together
data_train = pd.concat([train_df, train_vect_df], axis=1)
data_test = pd.concat([test_df, test_vect_df], axis=1)
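Because both splits used random_state=10 on the same row order, the positional concat should line up; a quick sanity check makes that explicit:

# row counts must agree between the pieces of each split
assert data_train.shape[0] == train_df.shape[0] == train_vect_df.shape[0]
assert data_test.shape[0] == test_df.shape[0] == test_vect_df.shape[0]
print(data_train.shape)
print(data_test.shape)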