In [1]:
import os
import math
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import csv
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import feature_selection, linear_model
In [16]:
# Read the bid records from the project-relative CSV and show how many
# rows were loaded (before any filtering).
df = pd.read_csv('data/Bid_Data.csv')
df.shape[0]
Out[16]:
In [18]:
# --- Filter -----------------------------------------------------------------
# Keep only rank-1 rows (presumably the winning bid -- confirm against the
# data dictionary) for construction contracts. `.copy()` gives us an
# independent frame so the column assignments below do not write into a view
# of the raw data (avoids SettingWithCopyWarning).
df = df[(df['Rank'] == 1) & (df['Type'] == 'Construction')].copy()

# --- Drop unused columns ----------------------------------------------------
# `columns=` is used instead of the positional axis argument (`drop(name, 1)`),
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
# NOTE: ' From' intentionally keeps its leading space; it must match the
# column label exactly as the original cell referenced it.
df = df.drop(columns=[
    ' From',
    'To',
    'Contract Description',
    'Contractor',
    'Contract Category',
    'LNG MON',
    'MONTH',
])

# --- Parse currency strings such as "$1,234.50" into floats -------------------
for money_col in ['Award Amount', 'Engineers Estimate']:
    df[money_col] = (
        df[money_col]
        .str.lstrip('$')
        .str.replace(',', '')
        .astype(float)
    )

# --- Renaming variables (short aliases; the original columns are kept) -------
df['EngEst'] = df['Engineers Estimate']
df['NBidders'] = df['Number of Bidders']
# Index the frame by letting date so Year/Month can be read off the index.
df['Date'] = pd.to_datetime(df['Letting Date'])
df = df.set_index('Date')
df['Year'] = df.index.year
df['Month'] = df.index.month
df['WinBid'] = df['Award Amount']

# --- Creating new variables ---------------------------------------------------
# Diff > 0 means the engineer's estimate was higher than the winning bid.
df['Diff'] = df['EngEst'] - df['WinBid']
# NOTE(review): np.log yields -inf / NaN for zero or negative amounts;
# this assumes all dollar amounts are strictly positive -- TODO confirm.
df['lnWinBid'] = np.log(df['WinBid'])
df['lnEngEst'] = np.log(df['EngEst'])
df['DiffLn'] = df['lnWinBid'] - df['lnEngEst']

# Indicator columns are created with their "inside the band" defaults first
# (same column order as before) and then overwritten where the estimate
# misses the winning bid by more than 10% of the estimate.
df['Within10Percent'] = 1
df['PercentOff'] = df['Diff'] / df['EngEst']
df['MoreOrLessThan10'] = 0   # 0 = within band, 1 = estimate > bid, 2 = estimate < bid
df['LessThan10'] = 0
df['MoreThan10'] = 0

# Rows with a NaN PercentOff match neither mask and stay "within 10%".
over_by_10 = df['PercentOff'] > .10     # estimate exceeds bid by > 10% of estimate
under_by_10 = df['PercentOff'] < -.10   # estimate is below bid by > 10% of estimate

df.loc[over_by_10 | under_by_10, 'Within10Percent'] = 0
df.loc[over_by_10, 'MoreOrLessThan10'] = 1
df.loc[under_by_10, 'MoreOrLessThan10'] = 2
df.loc[over_by_10, 'MoreThan10'] = 1
df.loc[under_by_10, 'LessThan10'] = 1

# Number of rows remaining after filtering.
print(len(df))
In [15]:
# Preview the cleaned frame; showing only the first rows keeps the saved
# notebook output small instead of dumping the whole DataFrame.
df.head()
Out[15]:
In [26]:
# Regression joint plots of engineer's estimate vs. winning bid:
# first in raw dollars, then on the natural-log scale.
for x_col, y_col in [("EngEst", "WinBid"), ("lnEngEst", "lnWinBid")]:
    sns.jointplot(x=x_col, y=y_col, data=df, kind="reg")
In [20]:
# Colour for each MoreOrLessThan10 code (keys are the codes as strings):
# 0 -> green, 1 -> red, 2 -> blue.
cmap = {'0': 'g', '1': 'r', '2': 'b' }
# Vectorised lookup instead of a per-row lambda.
df['cMoreOrLessThan10'] = df['MoreOrLessThan10'].astype(str).map(cmap)
# Log estimate vs. log winning bid, coloured by band. The trailing ';'
# suppresses the Axes repr that print(...) used to write to the output.
df.plot('lnEngEst', 'lnWinBid', kind='scatter', c=df.cMoreOrLessThan10);
In [24]:
# Hold out the April 2016 rows as the test set; every other row is
# training data. One boolean mask and its negation define both halves.
is_april_2016 = (df.Year == 2016) & (df.Month == 4)

df_test = df[is_april_2016]
print(len(df_test) , 'projects in April 2016')

df_train = df[~is_april_2016]
print(len(df_train) ,'projects from Jan 2010 to April 2016')
In [28]:
# Share of estimates inside the +/-10% band, using ALL the data.
# Within10Percent is a 0/1 flag, so its mean is the fraction of 1s.
Percent = float(df['Within10Percent'].mean())
print(round((Percent)*100,2) , '% of All the TxDOT estimates were within 10% of actual bid')

# Same share for the April 2016 hold-out. Both numerator and denominator now
# come from df_test, so this figure cannot drift from the split definition;
# an empty hold-out yields NaN instead of raising ZeroDivisionError.
Percent_April_2016 = float(df_test['Within10Percent'].mean())
print (round((Percent_April_2016)*100,2) , '% of the April 2016 TxDOT estimates were within 10% of actual bid')
In [31]:
# Columns used as model features by X_y().
names_X = [
    'Length',
    'NBidders',
    'Year',
    'Month',
    'lnEngEst',
    'Time',
]
def X_y(df, feature_names=None):
    """Split a bid frame into a feature matrix and the two binary targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns plus the 'MoreThan10' and
        'LessThan10' indicator columns.
    feature_names : list of str, optional
        Columns to use as features. Defaults to the notebook-level
        ``names_X`` list, which preserves the original behaviour.

    Returns
    -------
    X : pandas.DataFrame
        ``df`` restricted to the feature columns.
    y_more : pandas.Series
        The 'MoreThan10' column.
    y_less : pandas.Series
        The 'LessThan10' column.

    Raises
    ------
    KeyError
        If any requested feature or target column is missing from ``df``.
    """
    if feature_names is None:
        # Resolved at call time so later edits to names_X are picked up.
        feature_names = names_X
    X = df[list(feature_names)]
    y_more = df['MoreThan10']
    y_less = df['LessThan10']
    return X, y_more, y_less
# Build feature matrices and targets for both halves of the split.
train_X, train_y_more, train_y_less = X_y(df_train)
test_X, test_y_more, test_y_less = X_y(df_test)

# Sanity check: print the row count of each target (train first, then test).
for target in (train_y_more, train_y_less, test_y_more, test_y_less):
    print(len(target))
In [32]:
# Peek at the first five rows of the hold-out feature matrix.
test_X.head(n=5)
Out[32]:
In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
def RFC_model(X, y, n_splits=30, n_estimators_grid=None, random_state=0):
    """Grid-search the number of trees of a random forest classifier.

    Fits a ``RandomForestClassifier`` for every candidate ``n_estimators``
    value, scores each with K-fold cross-validation on ``[X, y]``, and
    returns the best estimator. With GridSearchCV's default ``refit=True``
    that estimator is refit on all of ``X, y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (``X.columns`` is used to build the default grid).
    y : array-like
        Class labels aligned with the rows of ``X``.
    n_splits : int, default 30
        Number of (unshuffled) K-fold cross-validation splits.
    n_estimators_grid : iterable of int, optional
        Candidate values for ``n_estimators``. Defaults to
        ``range(1, len(X.columns))``, i.e. the original behaviour.
        NOTE(review): tying the number of trees to the number of feature
        columns looks accidental -- confirm the intended search range.
    random_state : int or None, default 0
        Seed for the forest so repeated runs give the same model. Pass
        ``None`` to restore the original unseeded behaviour.

    Returns
    -------
    sklearn.ensemble.RandomForestClassifier
        The best estimator found by the grid search.

    Raises
    ------
    ValueError
        If the grid of ``n_estimators`` candidates is empty (for example
        when ``X`` has a single column and no explicit grid is given).
    """
    if n_estimators_grid is None:
        n_estimators_grid = range(1, len(X.columns))
    n_estimators_grid = list(n_estimators_grid)
    if not n_estimators_grid:
        raise ValueError(
            "n_estimators grid is empty; pass n_estimators_grid explicitly "
            "or supply at least two feature columns"
        )

    # Cross-validation splitter. GridSearchCV calls .split() itself, so the
    # splitter object is passed as-is.
    cv_sets = KFold(n_splits=n_splits)

    # Random forest classifier whose 'n_estimators' is being tuned.
    clf = RandomForestClassifier(random_state=random_state)
    params = {'n_estimators': n_estimators_grid}

    # No `scoring` argument is given, so the estimator's own score is used.
    grid = GridSearchCV(clf, params, cv=cv_sets)
    grid.fit(X, y)

    return grid.best_estimator_
In [46]:
# Fit the classifier that predicts the 'MoreThan10' flag from the training features.
model_1 = RFC_model(X=train_X, y=train_y_more)
In [47]:
# Score the fitted model on the training data and on the April 2016 hold-out
# for the 'MoreThan10' target, then report both.
train_score = model_1.score(train_X, train_y_more)
test_score = model_1.score(test_X, test_y_more)
print('correct training classification = ', train_score)
print ('correct testing classification = ', test_score)
In [ ]: