In [2]:
import numpy as np
import pandas as pd
import pydotplus
from IPython.display import Image
from sklearn import tree

# sklearn.externals.six is not shipped with newer scikit-learn releases;
# fall back to the standard-library StringIO so this cell keeps working.
try:
    from sklearn.externals.six import StringIO
except ImportError:
    from io import StringIO
In [14]:
# Load the homes dataset into a DataFrame (path is relative to this notebook).
df = pd.read_csv(filepath_or_buffer="../../data/homes_sf_ny/data.csv")
In [15]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(5)
Out[15]:
In [16]:
# Median of the elevation column.
df.loc[:, "elevation"].median()
Out[16]:
In [ ]:
# Total number of rows in the dataset.
df.shape[0]
In [ ]:
# Row counts per class, as (in_sf == 0, in_sf == 1).
tuple(len(df[df["in_sf"] == label]) for label in (0, 1))
In [ ]:
# Partition the rows by elevation (at or below 30 vs. above 30) and by the
# in_sf flag, then report the size of each of the four groups.
df1 = df.loc[(df["in_sf"] == 0) & (df["elevation"] <= 30)]  # low elevation, in_sf == 0
df2 = df.loc[(df["in_sf"] == 1) & (df["elevation"] <= 30)]  # low elevation, in_sf == 1
df3 = df.loc[(df["in_sf"] == 0) & (df["elevation"] > 30)]   # high elevation, in_sf == 0
df4 = df.loc[(df["in_sf"] == 1) & (df["elevation"] > 30)]   # high elevation, in_sf == 1
tuple(len(group) for group in (df1, df2, df3, df4))
In [ ]:
In [3]:
# Fit a decision tree on the homes data, read here as a plain numeric array.
# The context manager guarantees the file handle is closed once the array has
# been parsed (the handle was previously left open for the kernel's lifetime).
with open("../../data/homes_sf_ny/data.csv") as f:
    f.readline()  # consume the first line (header) so loadtxt only sees numbers
    data = np.loadtxt(f, delimiter=",")

Y = data[:, 0]   # target: first column of the file
X = data[:, 1:]  # features: every remaining column

# A fixed random_state makes the fitted tree reproducible across runs;
# scikit-learn permutes candidate features at each split, so without a seed
# the resulting tree is not guaranteed to be the same on every re-run.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X, Y)
In [18]:
# Render the fitted tree as a PNG inside the notebook.
# out_file=None makes export_graphviz return the DOT source as a string, so
# no StringIO buffer is needed; dot_data is therefore a str holding the DOT text.
# Image is already imported in the notebook's first cell.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    # One name per feature column of the training matrix.
    # NOTE(review): assumed to match the CSV column order after the target - confirm.
    feature_names=["beds", "bath", "price", "year_built", "sqft", "price_per_sqft", "elevation"],
    # Class labels are listed in ascending order of the target value.
    class_names=["NY", "SF"],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
Out[18]:
In [22]:
# Show the classifier's hyperparameters. get_params is a method and has to be
# called; the bare attribute only displays the bound-method repr.
clf.get_params()
Out[22]: