In [9]:
import pandas as pd
import numpy as np
In [5]:
df = pd.read_clipboard()
In [6]:
df.head()
Out[6]:
In [7]:
df
Out[7]:
In [8]:
!pwd
In [16]:
df.to_csv("/home/topo/repos/ghub/mml_sklearn/datasets/animal_food_train.csv", header=True, index=False)
In [15]:
#pd.read_csv("/home/topo/repos/ghub/mml_sklearn/datasets/animal_food_train.csv")
In [17]:
test = pd.read_clipboard()
In [18]:
test
Out[18]:
In [19]:
test.to_csv("/home/topo/repos/ghub/mml_sklearn/datasets/animal_food_test.csv", header=True, index=False)
In [21]:
#pd.read_csv("/home/topo/repos/ghub/mml_sklearn/datasets/animal_food_test.csv")
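The clipboard contents pasted above are not preserved in this transcript. A quick sanity check (a minimal sketch, using the same paths written above) is to read both files back and inspect their shapes and dtypes rather than relying on the commented-out reads:
In [ ]:
# Sanity check: reload the saved train/test CSVs and inspect them
train_check = pd.read_csv("/home/topo/repos/ghub/mml_sklearn/datasets/animal_food_train.csv")
test_check = pd.read_csv("/home/topo/repos/ghub/mml_sklearn/datasets/animal_food_test.csv")
print train_check.shape, test_check.shape
print train_check.dtypes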
In [22]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
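Note that sklearn.cross_validation and sklearn.grid_search are the older module locations; they were deprecated in scikit-learn 0.18 and removed in 0.20. On a newer install the equivalent imports would be the following (kept commented out here so the notebook continues to run against the older scikit-learn used above):
In [ ]:
# Equivalent imports for scikit-learn >= 0.20, where the old cross_validation
# and grid_search modules were consolidated into model_selection:
# from sklearn.model_selection import train_test_split, GridSearchCV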
In [54]:
if __name__ == "__main__":
    df = pd.read_csv("/home/topo/repos/ghub/mml_sklearn/datasets/ad-data/ad.data", header=None)
    explanatory_variable_columns = set(df.columns.values)
    # The last column describes the targets
    response_variable_column = df[len(df.columns.values) - 1]
    explanatory_variable_columns.remove(len(df.columns.values) - 1)
    y = [1 if e == 'ad.' else 0 for e in response_variable_column]
    X = df[list(explanatory_variable_columns)]
    # print X.head(5)
    # Missing values are encoded as '?' (possibly with leading spaces); replace them with -1.
    # Assigning the result avoids the chained-assignment warning that inplace=True raises on a column slice.
    X = X.replace(to_replace=r' *\?', value=-1, regex=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipeline = Pipeline([
        ('clf', DecisionTreeClassifier(criterion='gini'))
    ])
    parameters = {
        'clf__max_depth': (50, 80, 100, 120, 150, 155, 160),
        'clf__min_samples_split': (1, 2, 3, 4, 5),
        'clf__min_samples_leaf': (1, 2, 3, 4, 5)
    }
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])
    predictions = grid_search.predict(X_test)
    print classification_report(y_test, predictions)
    print best_parameters
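Since grid_search.best_estimator_ is the refitted Pipeline, the winning tree can be pulled out of it to see which columns it leaned on most. A minimal sketch follows; the indices it prints are positions in X, whose column order comes from a Python set above and therefore need not match the file's original column order:
In [ ]:
# Inspect the best decision tree found by the grid search
best_tree = grid_search.best_estimator_.named_steps['clf']
importances = best_tree.feature_importances_
# Show the ten most important feature positions and their importance scores
for idx in np.argsort(importances)[::-1][:10]:
    print idx, importances[idx]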
In [55]:
# Using RandomForestClassifier for the same task
from sklearn.ensemble import RandomForestClassifier
if __name__ == "__main__":
    df = pd.read_csv("/home/topo/repos/ghub/mml_sklearn/datasets/ad-data/ad.data", header=None)
    explanatory_variable_columns = set(df.columns.values)
    # The last column describes the targets
    response_variable_column = df[len(df.columns.values) - 1]
    explanatory_variable_columns.remove(len(df.columns.values) - 1)
    y = [1 if e == 'ad.' else 0 for e in response_variable_column]
    X = df[list(explanatory_variable_columns)]
    # print X.head(5)
    # Missing values are encoded as '?' (possibly with leading spaces); replace them with -1
    X = X.replace(to_replace=r' *\?', value=-1, regex=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipeline = Pipeline([
        ('clf', RandomForestClassifier(criterion='gini'))
    ])
    parameters = {
        'clf__n_estimators': (5, 10, 20, 50),
        'clf__max_depth': (50, 150, 250),
        'clf__min_samples_split': (1, 2, 3),
        'clf__min_samples_leaf': (1, 2, 3)
    }
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])
    predictions = grid_search.predict(X_test)
    print classification_report(y_test, predictions)
    print best_parameters
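Either fitted search can be reused later without rerunning the grid search by persisting the refitted pipeline. A minimal sketch using joblib, which ships as sklearn.externals.joblib in this era of scikit-learn; the output path is hypothetical:
In [ ]:
# Persist the best pipeline so it can be reloaded without retraining
from sklearn.externals import joblib
joblib.dump(grid_search.best_estimator_, "/home/topo/repos/ghub/mml_sklearn/datasets/ad_rf_best.pkl")  # hypothetical path
# loaded = joblib.load("/home/topo/repos/ghub/mml_sklearn/datasets/ad_rf_best.pkl")
# print loaded.score(X_test, y_test)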
In [ ]: