In [1]:
import pandas as pd
import numpy as np
In [2]:
data_path = "C:/Users/Rishu/Desktop/dATA/boston/"
boston_data=pd.read_csv(data_path+'train.csv')
boston_data.info()
In [3]:
boston_data.head()
Out[3]:
In [4]:
boston_data_test=pd.read_csv(data_path+'test.csv')
boston_data_test.head()
Out[4]:
In [5]:
boston_data.describe()
Out[5]:
In [6]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
sns.distplot(boston_data['medv'], rug=True, color="b")
plt.axvline(boston_data['medv'].mean(), color="b", linestyle='solid', linewidth=2)
plt.axvline(boston_data['medv'].median(), color="b", linestyle='dashed', linewidth=2)
plt.show()
print ("Mean Price value :",boston_data['medv'].mean())
print ("Standard Deviation:",boston_data['medv'].std())
From the above plot we can see that the house prices are centred around a mean of about 22.76 (in $1000s). Any poorly performing ML algorithm would therefore end up predicting little more than this mean value.
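As a quick sanity check, a minimal sketch with scikit-learn's DummyRegressor shows what such a mean-only baseline scores (the feature column passed in is ignored by a mean-only model):
In [ ]:
# Minimal baseline sketch: always predict the mean of 'medv'
from sklearn.dummy import DummyRegressor
baseline = DummyRegressor(strategy='mean')
baseline.fit(boston_data[['rm']], boston_data['medv'])  # the feature column is ignored by a mean-only model
print("Baseline R^2 (always predicting the mean):", baseline.score(boston_data[['rm']], boston_data['medv']))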
From the data set, let us consider only the prime contenders among the available features:
In [7]:
fig, ax = plt.subplots(figsize=(14, 7))
sns.heatmap(boston_data[['crim','indus','nox','rm','dis','rad','tax','ptratio','black','lstat','medv']].corr(),
linecolor = 'white', square=True, annot=True)
plt.show()
In [8]:
sns.jointplot(x='lstat', y='medv', data=boston_data, color="r", kind="reg")
plt.show()
Based on the above correlation chart, we consider the features that are most strongly correlated with the target value. The features under consideration are: indus, rm, ptratio and lstat.
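To make that selection explicit, a small sketch prints the correlation of each candidate column with medv, sorted by strength:
In [ ]:
# Correlation of each candidate feature with the target 'medv'
cols = ['crim','indus','nox','rm','dis','rad','tax','ptratio','black','lstat','medv']
corr_with_medv = boston_data[cols].corr()['medv'].drop('medv')
print(corr_with_medv.sort_values())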
Now let us visualize the distribution of the four selected features in a pair plot.
In [9]:
# Pair plot of the features
sns.pairplot(boston_data[['indus','rm','ptratio','lstat','medv']])
plt.show()
Now let us plot a distribution chart of the selected features. This would help us understand the distribution of the data a little better.
In [10]:
fig = plt.figure(figsize=(14,7))
plt.subplot(2,2,1)
sns.distplot(boston_data['indus'], rug=True, color="b")
plt.axvline(boston_data['indus'].mean(), color="b", linestyle='solid', linewidth=2)
plt.axvline(boston_data['indus'].median(), color="b", linestyle='dashed', linewidth=2)
plt.subplot(2,2,2)
sns.distplot(boston_data['rm'], rug=True, color="r")
plt.axvline(boston_data['rm'].mean(), color="r", linestyle='solid', linewidth=2)
plt.axvline(boston_data['rm'].median(), color="r", linestyle='dashed', linewidth=2)
plt.subplot(2,2,3)
sns.distplot(boston_data['ptratio'], rug=True, color="g")
plt.axvline(boston_data['ptratio'].mean(), color="g", linestyle='solid', linewidth=2)
plt.axvline(boston_data['ptratio'].median(), color="g", linestyle='dashed', linewidth=2)
plt.subplot(2,2,4)
sns.distplot(boston_data['lstat'], rug=True, color="y")
plt.axvline(boston_data['lstat'].mean(), color="y", linestyle='solid', linewidth=2)
plt.axvline(boston_data['lstat'].median(), color="y", linestyle='dashed', linewidth=2)
plt.show()
From the above distribution plots we can conclude that 3 of the 4 features have a skewed distribution; only rm is distributed in an approximately normal fashion.
ptratio is negatively skewed, while lstat and indus are positively skewed.
We now apply a log transform to these skewed features to bring them closer to a normal distribution.
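Before transforming, the skew can be quantified with pandas' skew() (a minimal check; values far from 0 indicate a skewed distribution):
In [ ]:
# Skewness of the selected features before any transformation
print(boston_data[['indus','rm','ptratio','lstat']].skew())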
In [27]:
fig = plt.figure(figsize=(14,7))
plt.subplot(2,2,1)
sns.distplot(np.log(boston_data['indus']), rug=True, color="b")
plt.axvline(np.log(boston_data['indus']).mean(), color="b", linestyle='solid', linewidth=2)
plt.axvline(np.log(boston_data['indus']).median(), color="b", linestyle='dashed', linewidth=2)
plt.subplot(2,2,2)
sns.distplot(boston_data['rm'], rug=True, color="r")
plt.axvline(boston_data['rm'].mean(), color="r", linestyle='solid', linewidth=2)
plt.axvline(boston_data['rm'].median(), color="r", linestyle='dashed', linewidth=2)
plt.subplot(2,2,3)
sns.distplot(np.log(boston_data['ptratio']), rug=True, color="g")
plt.axvline(np.log(boston_data['ptratio']).mean(), color="g", linestyle='solid', linewidth=2)
plt.axvline(np.log(boston_data['ptratio']).median(), color="g", linestyle='dashed', linewidth=2)
plt.subplot(2,2,4)
sns.distplot(np.log(boston_data['lstat']), rug=True, color="y")
plt.axvline(np.log(boston_data['lstat']).mean(), color="y", linestyle='solid', linewidth=2)
plt.axvline(np.log(boston_data['lstat']).median(), color="y", linestyle='dashed', linewidth=2)
plt.show()
After applying the log transform to the above features, only lstat responds well and becomes approximately normally distributed; the transform has no significant effect on ptratio and indus.
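Repeating the same skewness check after the log transform (a minimal sketch) makes this comparison concrete:
In [ ]:
# Skewness of the log-transformed features, for comparison with the values above
print(np.log(boston_data[['indus','ptratio','lstat']]).skew())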
Now let us examine the correlation between each of these features and the target medv:
In [51]:
fig = plt.figure(figsize=(14,7))
plt.subplot(2,2,1)
x = np.log(boston_data['indus'])
sns.regplot(x=x, y="medv", data=boston_data, color="b")
plt.subplot(2,2,2)
x2 = boston_data['rm']
sns.regplot(x=x2, y="medv", data=boston_data, color="r")
plt.subplot(2,2,3)
x3 = np.log(boston_data['ptratio'])
sns.regplot(x=x3, y="medv", data=boston_data, color="g")
plt.subplot(2,2,4)
x4 = np.log(boston_data['lstat'])
sns.regplot(x=x4, y="medv", data=boston_data, color="y")
plt.show()
In [92]:
boston_data['lstat_log']=np.log(boston_data['lstat'])
boston_data_test['lstat_log_test']=np.log(boston_data_test['lstat'])
#boston_data['ptratio_log']=np.log(boston_data['ptratio'])
#boston_data_test['ptratio_log_test']=np.log(boston_data_test['ptratio'])
#boston_data['indus_log']=np.log(boston_data['indus'])
#boston_data_test['indus_log_test']=np.log(boston_data_test['indus'])
X = boston_data[['rm','lstat_log']]
X_bd_test=boston_data_test[['rm','lstat_log_test']]
In [93]:
y = boston_data[['medv']]
In [94]:
from sklearn.model_selection import train_test_split
X_train, X_cv, y_train, y_cv = train_test_split(X, y, random_state=0)
print(len(X_train), len(y_train), len(X_cv), len(y_cv))
In [95]:
from sklearn.tree import DecisionTreeRegressor
max_score = 0
max_depth = 0
def decision_tree(j):
    # Fit a decision tree regressor with the given maximum depth
    dtr = DecisionTreeRegressor(random_state=0, max_depth=j)
    return dtr.fit(X_train, y_train)

for i in range(1, 11):
    _dtr = decision_tree(i)
    clf_score = _dtr.score(X_cv, y_cv)
    print("Decision Tree Regressor at max_depth:", i, " scored: ", clf_score)
    if clf_score > max_score:
        max_score = clf_score
        max_depth = i
In [96]:
print("The maximum score is achieved at a depth of : ",max_depth," with score of :",max_score)
In [97]:
dtr_clf = decision_tree(max_depth)
In [98]:
sns.barplot(x=X_train.columns, y=dtr_clf.feature_importances_)
plt.show()
We can conclude that rm and lstat are the two most important factors in house prices in the Boston area.
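For reference, the numeric importances behind the bar chart can be printed directly (a small sketch):
In [ ]:
# Numeric feature importances from the fitted tree
for name, importance in zip(X_train.columns, dtr_clf.feature_importances_):
    print(name, ":", round(importance, 3))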
In [99]:
from IPython.display import Image
import pydotplus
from io import StringIO
from sklearn import tree
dot_data = StringIO()
tree.export_graphviz(dtr_clf, out_file=dot_data,
                     feature_names=X_train.columns,
                     filled=True, rounded=True,
                     special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
Out[99]:
In [100]:
bd_predict = dtr_clf.predict(X_bd_test)
plt.scatter(boston_data_test['ID'],bd_predict)
plt.show()
In [101]:
print ("Mean Price value before modelling:",boston_data['medv'].mean())
print ("Mean Price value after modelling :",bd_predict.mean())
In [102]:
submission = pd.DataFrame({
    "ID": boston_data_test['ID'],
    "medv": bd_predict
})
submission.to_csv(data_path+'output.csv', index=False)