In [156]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, tree, metrics
from sklearn.ensemble import RandomForestClassifier
try:
    # Current home of train_test_split (scikit-learn >= 0.18).
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the original location.
    from sklearn.cross_validation import train_test_split
%matplotlib inline
In [157]:
# Report which scikit-learn release this notebook is running against.
import sklearn
print(sklearn.__version__)
In [158]:
# Load the iris dataset that ships with scikit-learn's `datasets` module.
iris = datasets.load_iris()
In [159]:
# Show which fields the loaded dataset object exposes.
iris.keys()
Out[159]:
In [160]:
# Features: every row, but only the columns from index 2 onward.
# NOTE(review): for iris these are presumably the petal measurements -- confirm with iris.feature_names.
X = iris.data[:, 2:]
# Targets that the classifiers below are trained to predict.
y = iris.target
In [161]:
# Hold out a quarter of the samples for testing; the rest is used for training.
# What is random_state?
#   -> It seeds the shuffling, so the same rows land in train/test on every run.
# What is stratify?
#   -> Passing stratify=y keeps the class proportions of y the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    train_size=0.75,
    random_state=42,
    stratify=y,
)
In [162]:
#What is this doing in the moon example exactly?
#X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
In [163]:
# Small random forest: 5 estimators (trees), with a fixed seed so repeated runs build the same forest.
forest = RandomForestClassifier(
    random_state=100,
    n_estimators=5,
)
# Kept as the cell's last expression so the fitted estimator is displayed as the cell output.
forest.fit(X_train, y_train)
Out[163]:
In [164]:
# Score the forest on the data it was trained on ...
train_accuracy = forest.score(X_train, y_train)
print("accuracy on training set: %f" % train_accuracy)
# ... and on the held-out data, to see how well it generalises.
test_accuracy = forest.score(X_test, y_test)
print("accuracy on test set: %f" % test_accuracy)
In [165]:
# Re-split the data for the decision tree model; this overwrites the split used for the forest.
# Fixed: the feature matrix is named `X` (upper case). The lower-case `x` used before is not
# defined anywhere in the visible notebook, so the call raised a NameError.
# NOTE(review): no random_state is passed here, so this split changes on every run and is not
# the same split the forest was scored on -- keep that in mind when comparing the two models.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75)
In [166]:
# Create a decision tree classifier with scikit-learn's default settings (no arguments passed).
dt = tree.DecisionTreeClassifier()
In [167]:
# Train the decision tree on the training split; the value returned by fit() is stored back in `dt`.
dt = dt.fit(X_train,y_train)
In [168]:
# Predict labels for the held-out test features with the trained decision tree.
y_pred=dt.predict(X_test)
In [145]:
# Compare the decision tree's predictions with the true test labels using metrics.accuracy_score.
Accuracy_score = metrics.accuracy_score(y_test, y_pred)
In [146]:
# Display the decision tree's accuracy on the test split as the cell output.
Accuracy_score
Out[146]:
In [ ]:
#Comments on RandomForestClassifier & original Decision Tree model
#While the random forest result is consistent, varying only with how you choose random_state or n_estimators,
#the result of the original decision tree model varies a lot.
#The random_state seeds the random sampling that produces the versions of the data the modelling takes into
#consideration, and n_estimators regulates how many "random" datasets (one per tree) are used. It's fascinating to
#see how this makes the result so much more consistent than the original decision tree model.
In [147]:
#General comments on the homework
#I really enjoyed this homework, and it really helped me understand what is going on under the hood.
#I found this reading while I was doing the homework. It looks like a nice way to go deeper. Do you know the
#guy? https://github.com/amueller/introduction_to_ml_with_python
#I feel I now need practice on real-life dirty data sets to fully understand how prediction models
#can work. I take back my comment that I can't see how I can implement this in my reporting. I can. But how
#can I do this technically, e.g. with the data on PERM visas? Say, input nationality, wage, lawyer, job title, and get a reply on what the chances could be of
#getting a work visa? I also feel a little shaky on how I need to prep my data to feed it into the predictor
#correctly.
In [ ]:
#Comments on classifier
#Questions:
#Not sure why it's called 10-fold cross validation when cv is set to 5?
#Why are we predicting the
In [ ]: