In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
In [178]:
# Load the pre-built train/test splits from the working directory.
# NOTE(review): relative paths — assumes the notebook is launched next to the CSVs.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [179]:
# peek at the first training rows to confirm the load
train.head(3)
Out[179]:
In [180]:
# peek at the first test rows to confirm the load
test.head(3)
Out[180]:
In [181]:
train = train.drop('Unnamed: 0', 1 )
test = test.drop('Unnamed: 0', 1 )
print train.shape
print test.shape
In [182]:
# Replace every missing value with zero in both splits
# (treats an absent ingredient/taste score as 0).
train = train.fillna(0)
test = test.fillna(0)
In [222]:
# define X and Y
#training set
array = train.values
X_train = array[:, 0:1838]
print X_train.shape
Y_train = array[:, 1838]
print Y_train.shape
#testing set
_array = test.values
X_test = _array[:, 0:1838]
print X_test.shape
Y_test = _array[:, 1838]
print Y_test.shape
In [207]:
# Cast features and labels to int for the classifiers below.
# Fixes: the stray `len(X_train)` expression had no effect and was removed;
# X_train/Y_train already come from .values as ndarrays, so the redundant
# np.array(...) wrapper is dropped.
X_train = X_train.astype(int)
Y_train = Y_train.astype(int)
In [223]:
# Candidate classifiers to compare on this dataset.
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# (name, estimator) pairs; CART (DecisionTreeClassifier) is deliberately
# left out of the comparison for now.
models = [
    ('LOG', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC()),
    ('NB', GaussianNB()),
]
In [224]:
from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
results=[]
names=[]
for name, model in models:
kfold = KFold(n = 7497, n_folds = 10, random_state = 10)
_results = cross_validation.cross_val_score(model, X_train, Y_train, cv = kfold, scoring = 'accuracy')
results.append(_results)
names.append(name)
scores="%s: %f (%f)" % (name, _results.mean(), _results.std())
print scores
In [225]:
# Logistic regression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
print len(Y_pred)
#print accuracy_score(Y_test, Y_pred )
print confusion_matrix(Y_test,Y_pred )
print classification_report(Y_test, Y_pred)
In [226]:
# Tunning model
#pre-processing
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
logreg.fit(X_train_scaled, Y_train)
Y_pred = logreg.predict(X_test_scaled)
print classification_report(Y_test, Y_pred)
In [300]:
from sklearn.ensemble import BaggingClassifier
# Apply ensemble method- Bagging
def build_bagging_model(x,y):
bagging = BaggingClassifier(LogisticRegression(),
n_estimators = 100,
random_state = 10,
max_samples = 1.0,
max_features = 0.7,
bootstrap = True,
bootstrap_features = True)
bagging.fit(x,y)
return bagging
# Build a bag of Logreg models
bagging = build_bagging_model(X_train, Y_train)
predicted_y = bagging.predict(X_train)
print "\n Bagging Model Accuracy on training data\n"
print classification_report(Y_train, predicted_y)
In [256]:
# Create a feature union: PCA components alongside the k best univariate
# features, feeding a logistic regression inside one pipeline.
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_selection import SelectKBest
features = []
features.append(('pca', PCA(n_components=3)))
features.append(('select_best', SelectKBest(k=6)))
feature_union = FeatureUnion(features)
# create pipeline
estimators = []
estimators.append(('feature_union', feature_union))
estimators.append(('logistic', LogisticRegression()))
model = Pipeline(estimators)
# Evaluate the pipeline with 10-fold CV.
# Fixes: removed the duplicated `kfold = kfold = ...` assignment typo and
# replaced the hard-coded sample count 7497 with len(X_train).
kfold = KFold(n=len(X_train), n_folds=10, random_state=10)
results = cross_val_score(model, X_train, Y_train, cv=kfold)
print(results.mean())
In [271]:
# Keep the first 10 columns (taste/metadata features) plus the course label.
# Fixes: .ix is deprecated/removed — use positional .iloc; copy the slice so
# the column assignment below does not hit SettingWithCopyWarning on a view.
train_main = train.iloc[:, 0:10].copy()
train_main['course_num'] = train['course_num']
train_main.head()
Out[271]:
In [274]:
%matplotlib inline
import seaborn as sns
# correlation heatmap of the retained features + course label
sns.heatmap(train_main.corr())
Out[274]:
In [275]:
# check for normal distribution
for i in train_main:
rand_sample = train_main[i].sample(50, random_state=6)
print i,':\n', scipy.stats.mstats.normaltest(rand_sample)
sns.distplot(train_main[i])
plt.xlabel(i)
plt.show()
print
In [276]:
# recheck the columns before dropping the taste/metadata features below
train.head(2)
Out[276]:
In [278]:
# Remove the taste/metadata columns so only the remaining features + label stay.
drop_cols = ['rating', 'bitter', 'meaty', 'piquant', 'salty', 'sour',
             'sweet', 'numberofservings', 'totaltimeinseconds']
df_train = train.drop(drop_cols, axis=1)
In [291]:
# Remove the same taste/metadata columns from the test split.
drop_cols = ['rating', 'bitter', 'meaty', 'piquant', 'salty', 'sour',
             'sweet', 'numberofservings', 'totaltimeinseconds']
df_test = test.drop(drop_cols, axis=1)
In [279]:
# confirm the reduced training frame
df_train.head(2)
Out[279]:
In [294]:
# confirm the reduced test frame
df_test.head(2)
Out[294]:
In [298]:
# Split the reduced frames into features (all but the last column) and the
# label (last column). Derived from the shape instead of the hard-coded
# width 1829 (assumes course_num is the final column of the reduced
# frames — TODO confirm).
array = df_train.values
n_features = array.shape[1] - 1
x = array[:, 0:n_features]
y = array[:, n_features]
_array = df_test.values
x_t = _array[:, 0:n_features]
y_t = _array[:, n_features]
In [288]:
# Grid search over C, the inverse regularization strength of
# LogisticRegression (larger C = weaker regularization).
from sklearn.grid_search import GridSearchCV

c_grid = np.array([0.1, 1.0, 10.0, 100.0, 1000.0])
param_grid = {'C': c_grid}
model = LogisticRegression()
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid.fit(x, y)
print(grid.best_score_)
print(grid.best_estimator_)
In [299]:
# score the tuned model (best C from the grid search) on the reduced test set
y_pred = grid.predict(x_t)
print y_pred.shape
print classification_report(y_t, y_pred)
In [313]:
# Add cuisine as a feature: load the per-course cuisine tables.
# NOTE(review): hardcoded home-relative paths — consider a configurable data dir.
cuis_dir = '~/Desktop/Yummly'
BB_cuis = pd.read_csv(cuis_dir + '/BB/BB_cuis.csv')
APP_cuis = pd.read_csv(cuis_dir + '/APPETIZER/APP_cuis.csv')
DS_cuis = pd.read_csv(cuis_dir + '/DESSERT/DS_cuis.csv')
SP_cuis = pd.read_csv(cuis_dir + '/SOUP/SP_cuis.csv')
SLD_cuis = pd.read_csv(cuis_dir + '/SALAD/SLD_cuis.csv')
In [314]:
# row/column counts of each per-course cuisine table
print BB_cuis.shape
print APP_cuis.shape
print DS_cuis.shape
print SP_cuis.shape
print SLD_cuis.shape
In [316]:
# peek at one cuisine table to confirm its schema
BB_cuis.head(2)
Out[316]:
In [317]:
# Attach the integer course label to each per-course table
# (APP=0, BB=1, DS=2, SLD=3, SP=4 — same codes as in the main frames).
for frame, code in ((APP_cuis, 0), (BB_cuis, 1), (DS_cuis, 2),
                    (SLD_cuis, 3), (SP_cuis, 4)):
    frame['course_num'] = code
In [319]:
# Stack the five course tables into one frame.
# NOTE(review): row indexes are concatenated as-is (duplicate labels);
# harmless here since only .values and .corr() are used downstream.
cuis = pd.concat([APP_cuis, BB_cuis, DS_cuis, SLD_cuis, SP_cuis], axis = 0)
cuis.shape
Out[319]:
In [336]:
from sklearn import cross_validation
array = cuis.values
X = array[:, 1:27].astype(int)
print X.shape
Y = array[:, 27].astype(int)
print Y.shape
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, random_state=10)
print X_train.shape
print X_test.shape
In [337]:
# Grid search over C on the cuisine features, fit on the training split only.
# (Same search as the earlier cell — a shared helper would remove the
# duplication; kept inline to preserve the notebook's flow.)
from sklearn.grid_search import GridSearchCV

c_grid = np.array([0.1, 1.0, 10.0, 100.0, 1000.0])
param_grid = {'C': c_grid}
model = LogisticRegression()
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid.fit(X_train, Y_train)
print(grid.best_score_)
print(grid.best_estimator_)
In [339]:
# correlation heatmap of the cuisine features + course label
sns.heatmap(cuis.corr())
Out[339]:
In [ ]: