In [ ]:
tree1 = tree.DecisionTreeClassifier( criterion ='entropy',random_state = 0)
fittedtree=tree1.fit( X_train, y_train)
#print metrics.confusion_matrix(y_train, fittedtree.predict(X_train))
#print metrics.accuracy_score(y_train, fittedtree.predict(X_train))
cross_validation.cross_val_score(tree1, X_train, y_train, cv = 10).mean()
#tree.export_graphviz(fittedtree, out_file =' tree.dot', feature_names =['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])
In [1]:
import os
import math
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import csv
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import feature_selection, linear_model
In [2]:
df = pd.read_csv('C:/Users/Collin/Documents/Python_Projects/TxDOT/data/Bid_Data.csv')
len(df)
Out[2]:
In [3]:
df = df[df['Rank'] == 1]
df = df[df['Type'] == 'Construction']
df = df.drop(' From', 1)
df = df.drop('To', 1)
df = df.drop('Contract Description', 1)
df = df.drop('Contractor', 1)
df = df.drop('Contract Category', 1)
df = df.drop('LNG MON', 1)
df = df.drop('MONTH', 1)
df['Award Amount'] = df['Award Amount'].str.lstrip('$')
df['Engineers Estimate'] = df['Engineers Estimate'].str.lstrip('$')
df['Award Amount'] = df['Award Amount'].str.replace(',','').astype(float)
df['Engineers Estimate'] = df['Engineers Estimate'].str.replace(',','').astype(float)
#Renaming Variables
df['EngEst'] = df['Engineers Estimate']
df['NBidders'] = df['Number of Bidders']
df['Date'] = pd.to_datetime(df['Letting Date'])
df.set_index('Date' , inplace=True)
df['Year'] = df.index.year
df['Month'] = df.index.month
df['WinBid'] = df['Award Amount']
# Creating New Varialbes
df['Diff'] = df['EngEst'] - df['WinBid']
df['lnWinBid'] = np.log(df['WinBid'])
df['lnEngEst'] = np.log(df['EngEst'])
df['DiffLn'] = df['lnWinBid'] - df['lnEngEst']
df['Within10Percent'] = 1
df['PercentOff'] = df['Diff'] / df['EngEst']
df['MoreOrLessThan10'] = 0
df['LessThan10'] = 0
df['MoreThan10'] = 0
df.loc[(df.PercentOff > .10) , 'Within10Percent'] = 0
df.loc[(df.PercentOff < -.10) , 'Within10Percent'] = 0
df.loc[(df.PercentOff > .10) , 'MoreOrLessThan10'] = 1
df.loc[(df.PercentOff < -.10) , 'MoreOrLessThan10'] = 2
df.loc[(df.PercentOff > .10) , 'MoreThan10'] = 1
df.loc[(df.PercentOff < -.10) , 'LessThan10'] = 1
print len(df)
In [4]:
sns.jointplot(x="EngEst", y="WinBid", data=df, kind="reg"); sns.jointplot(x="lnEngEst", y="lnWinBid", data=df, kind="reg");
Because the TxDot Estimates are so highly correlated with the winning bid, we can use it as a baseline for the model predictions
My idea was to classify the TxDot Estimates according to their scoring method of estimating within 10% of the winning bid and then having 3 separate regressive models for the 3 classes (within 10%, more than 10%, less than 10%).
TxDot's goal for their estimates is to have 55% of their estimates within 10% of the winning bid.
TxDot considers bids over or under the estimate by more than 10% as bad
We can then use this score to judge our model predictions
In [6]:
#Using ALL the Data
Percent = float(df.Within10Percent.sum()) / len(df)
print (Percent)*100 , '% of All the TxDOT estimates were within 10% of actual bid'
Percent_April_2016 = float(df[(df.Year == 2016) & (df.Month == 4)].Within10Percent.sum()) / len(df_test)
print (Percent_April_2016)*100 , '% of the April 2016 TxDOT estimates were within 10% of actual bid'
In [7]:
cmap = {'0': 'g', '1': 'r', '2': 'b' }
df['cMoreOrLessThan10'] = df.MoreOrLessThan10.apply(lambda x: cmap[str(x)])
print df.plot('EngEst', 'WinBid', kind='scatter', c=df.cMoreOrLessThan10)
In [8]:
cmap = {'0': 'g', '1': 'r', '2': 'b' }
df['cMoreOrLessThan10'] = df.MoreOrLessThan10.apply(lambda x: cmap[str(x)])
print df.plot('lnEngEst', 'lnWinBid', kind='scatter', c=df.cMoreOrLessThan10)
In [5]:
df_test = df[(df.Year == 2016) & (df.Month == 4)]
print len(df_test) , 'projects in April 2016'
df_train = df[(df.Year != 2016) | (df.Month != 4)]
print len(df_train) ,'projects from Jan 2010 to April 2016'
#df_train[['Year','Month']].tail()
In [242]:
#df_test.columns
In [9]:
#names_x = ['Length','Year','Month','lnEngEst','Time', 'Highway', 'District', 'County' ]
names_x = ['Length','Year','Month','lnEngEst','Time', 'NBidders']
names_y = ['MoreOrLessThan10']
df_Train_x = df_train[ names_x ]
df_Train_y = df_train[ names_y ]
df_Test_x = df_test[ names_x ]
df_Test_y = df_test[ names_y ]
df_Train_x.head()
Out[9]:
In [260]:
test_X.head()
Out[260]:
In [10]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(df_Train_x, df_Train_y)
In [12]:
clf.predict(df_Test_x)
Out[12]:
In [13]:
clf.score(df_Test_x, df_Test_y)
Out[13]:
In [274]:
#subsets the training data to just those who were within 10% of the TxDot Estimate
df_train_within = df[df.Within10Percent == 1]
model_3 = smf.ols(formula = 'lnWinBid ~ lnEngEst+Year+Month+Year*Month+NBidders+NBidders*Year+Time', data = df_train_within).fit()
model_3.summary()
Out[274]:
In [275]:
df_test.loc[:,'Lnprediction_within'] = model_3.predict(test_X)
df_test.Lnprediction_within.head()
Out[275]:
In [276]:
#subsets the training data to just those who were more than 10% of the TxDot Estimate
df_train_more = df[df.MoreThan10 == 1]
model_4 = smf.ols(formula = 'lnWinBid ~ lnEngEst+Year+Month+Year*Month+NBidders+NBidders*Year+Time', data = df_train_more).fit()
model_4.summary()
Out[276]:
In [277]:
df_test.loc[:,'Lnprediction_more'] = model_4.predict(test_X)
In [278]:
#subsets the training data to just those who were less than 10% of the TxDot Estimate
df_train_less = df[df.LessThan10 == 1]
model_5 = smf.ols(formula = 'lnWinBid ~ lnEngEst+Year+Month+Year*Month+NBidders+NBidders*Year+Time', data = df_train_less).fit()
model_5.summary()
Out[278]:
In [279]:
df_test.loc[:,'Lnprediction_less'] = model_5.predict(test_X)
In [280]:
#df_test.columns
df_test['Hyp_More'] = 1 df_test['Hyp_Less'] = 1 df_test['Hyp_Within'] = 1
In [283]:
df_test[['p_more', 'p_less', 'p_within', 'Lnprediction_within', 'Lnprediction_more', 'Lnprediction_less']].head()
Out[283]:
In [284]:
df_test.loc[:,'lnpred'] = df_test.p_within*df_test.Lnprediction_within + df_test.p_more*df_test.Lnprediction_more + df_test.p_less*df_test.Lnprediction_less
In [285]:
df_test.loc[:,'BidPrediction'] = np.exp(df_test.loc[:,'lnpred'])
In [286]:
df_test.loc[:,'PredDiff'] = df_test.loc[:,'BidPrediction'] - df_test.loc[:,'WinBid']
df_test.loc[:,'PredPercentOff'] = df_test.loc[:,'PredDiff'] / df_test.loc[:,'BidPrediction']
df_test.loc[:,'PredWithin10Percent'] = 1
df_test.loc[(df_test.PredPercentOff > .10) , 'PredWithin10Percent'] = 0
df_test.loc[(df_test.PredPercentOff < -.10) , 'PredWithin10Percent'] = 0
In [287]:
ModelPercent = float(df_test.PredWithin10Percent.sum()) / len(df_test)
PercentIncrease = (ModelPercent)*100 - (Percent_April_2016)*100
NumberCorrectIncrease = (PercentIncrease/100)*len(df_test)
print (Percent_April_2016)*100 , '% of the TxDOT estimates were within 10% of actual bid'
print (ModelPercent)*100 , '% of the Model predictions were within 10% of actual bid'
print
print 'this is a increase of :', PercentIncrease, '%'
print 'or', NumberCorrectIncrease, 'more estimates within the 10% threshhold'
In [288]:
print 'In April 2016 TxDOT under estimated bids by: ' , df_test.Diff.sum()
print
print 'In April 2016 the Model under estimated bids by: ' ,df_test.PredDiff.sum()
print
print 'In April 2016 the model was ' , df_test.Diff.sum() - df_test.PredDiff.sum() , 'closer to the winning bids than TxDOT'
print
print 'The model predicted a sum of' ,df_test.BidPrediction.sum() ,'for all the projects in April 2016'
print
print 'TxDOT predicted a sum of' ,df_test.EngEst.sum() ,'for all the projects in April 2016'
In [289]:
df_test[['Diff','PredDiff']].std()
Out[289]:
In [290]:
df_test[['Diff','PredDiff']].describe()
Out[290]:
In [291]:
cmap = {'0': 'r', '1': 'g' }
df_test.loc[:,'cWithin10Percent'] = df_test.Within10Percent.apply(lambda x: cmap[str(x)])
print df_test.plot('lnEngEst', 'lnWinBid', kind='scatter', c=df_test.cWithin10Percent)
In [292]:
predcmap = {'0': 'r', '1': 'g' }
df_test.loc[:,'cPredWithin10Percent'] = df_test.PredWithin10Percent.apply(lambda x: predcmap[str(x)])
print df_test.plot('lnpred', 'lnWinBid', kind='scatter', c=df_test.cPredWithin10Percent)
In [ ]: