An exploratory data analysis

Connection to MongoDB


In [1]:
import pymongo
from pymongo import MongoClient
import time
Client = MongoClient("mongodb://nbi-mongo.admin/")
db = Client.bridge
collection = db["SampleNbi2"]

Bridge Records in Database


In [2]:
startTime = time.time()
print("Bridges Records in DB: ", collection.count())
print("Seconds : ", (time.time() - startTime))


Bridges Records in DB:  17509885
Seconds :  0.008536100387573242

Total Number of Bridges (Years)


In [3]:
import pandas as pd

listStates = []
startTime = time.time()
for i in collection.distinct("stateCode"):
    listStates.append(i)
print("Time to create list of distinct states in Seconds: ", (time.time() - startTime))

countPerYear = {}
startTime = time.time()
for i in collection.distinct("year"):
    countPerYear[i] =  collection.find({"year":i}).count()
print("Seconds: ",(time.time()- startTime))
count_per_year = pd.DataFrame(list(countPerYear.items()), columns = ['year', 'Distinct count of Bridges'])
count_per_year.set_index("year", inplace = True)
count_per_year


Time to create list of distinct states in Seconds:  0.014206171035766602
Seconds:  5.496885061264038
Out[3]:
Distinct count of Bridges
year
1992 707509
1993 700993
1994 703606
1995 713410
1996 711792
1997 715853
1998 716703
1999 721247
2000 724037
2001 727990
2002 730034
2003 732942
2004 736600
2005 740010
2006 740177
2007 748661
2008 751134
2009 746542
2010 632426
2011 632405
2012 634324
2013 634668
2014 637622
2015 638836
2016 642566

Trend of the count of records in the bridge


In [4]:
%matplotlib inline
print("Trend of the count of records in the bridge:")
count_per_year.plot(figsize=(10,8))
count_per_year.plot(kind = "bar", figsize=(10,8))


Trend of the count of records in the bridge:
Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f09bcd4ee48>

States with most bridges


In [5]:
countPerState = {}
startTime = time.time()
for i in listStates:
    countPerState[i] = collection.find({"year":2016, "stateCode":i}).count()
print("Seconds: ", (time.time() - startTime))
count_per_state = pd.DataFrame(list(countPerState.items()), columns = ['State Code', 'Count of Records'])
#count_per_state = count_per_state.set_index('State Code')


Seconds:  36.40173649787903

In [7]:
stateName = {'25':'MASSACHUSETTS',
             '04':'ARIZONA', 
             '08':'COLORADO',
             '38':'NORTH DAKOTA', 
             '09':'CONNECTICUT', 
             '19':'IOWA', 
             '26':'MICHIGAN', 
             '48':'TEXAS',
             '35':'NEW MEXICO',
             '17':'ILLINOIS', 
             '51':'VIRGINIA',
             '23':'MAINE',
             '16':'IDAHO',
             '36':'NEW YORK',
             '56':'WYOMING',
             '29':'MISSOURI',
             '39':'OHIO',
             '28':'MISSISSIPI', 
             '11':'DISTRICT OF COLOMBIA',
             '21':'KENTUCKY', 
             '18':'INDIANA',
             '06':'CALIFORNIA',
             '47':'TENNESSEE', 
             '12':'FLORIDA',
             '24':'MARYLAND',
             '34':'NEW JERSEY', 
             '46':'SOUTH DAKOTA',
             '13':'GEORGIA',
             '55':'WISCONSIN',
             '30':'MONTANA',
             '54':'WEST VIGINIA',
             '15':'HAWAII', 
             '32':'NEVADA', 
             '37':'NORTH CAROLINA',
             '10':'DELAWARE', 
             '33':'NEW HAMPSHIRE', 
             '44':'RHODE ISLAND',
             '50':'VERMONT', 
             '42':'PENNSYLVANIA', 
             '05':'ARKANSAS', 
             '20':'KANSAS', 
             '45':'SOUTH CAROLINA',
             '22':'LOUISIANA',
             '40':'OKLAHOMA', 
             '72':'PUERTO RICO', 
             '41':'OREGON',
             '21':'MINNESOTA', 
             '53':'WASHINGTON', 
             '01':'ALABAMA', 
             '31':'NEBRASKA',
             '02':'ALASKA', 
             '49':'UTAH'
               }

count_per_state['State Name'] =  count_per_state['State Code'].map(stateName)
count_per_state = count_per_state.set_index('State Name')
asc_c=count_per_state.sort_values(['Count of Records'],ascending = 0)
asc_c.head(n=10).sort_values(['Count of Records'], ascending = 0).plot(kind = 'bar', figsize=(10,8))


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f09bcdbd898>

States with least bridges


In [8]:
asc_c.tail(n=10).sort_values(['Count of Records'], ascending = 1).plot(kind = 'bar', figsize=(10,8))


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f09bcd3c160>

Year Built


In [9]:
pipeline = [{"$match":{"year":2016}},
            {'$group':{"_id":"$yearBuilt", "count":{"$sum":1}}}]
startTime = time.time()
yearBuiltResult = collection.aggregate(pipeline)
print("Seconds: ",(time.time() - startTime))
yearBuiltResult_df = pd.DataFrame(list(yearBuiltResult))


Seconds:  1.5601255893707275

Early bridges


In [10]:
yearBuiltResult_df = yearBuiltResult_df.rename(columns = {'_id': 'Year'})
sort_yearBuiltResult_df=yearBuiltResult_df.sort_values('count', ascending = 0)
sort_yearBuiltResult_df.set_index('Year').head(n=10).plot(kind = 'bar', figsize=(10,8))


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f09bcb45c18>

Recently built bridges


In [11]:
sort_yearBuiltResult_rdf=yearBuiltResult_df.sort_values('Year', ascending = 0)
sort_yearBuiltResult_rdf.set_index('Year').head(n=10).plot(kind = "bar", figsize = (10,8))


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f09b912c1d0>

Summary of NBI data


In [12]:
pipeline = [{"$match": {"year":2016}},
            {"$project":{"_id":0, "stateCode":1,"deckWidthOutToOut":1, "structureLength":1, "averageDailyTraffic":1 }}]
startTime = time.time()
pdc = collection.aggregate(pipeline)
print("Seconds : ", (time.time() - startTime))
nbi = pd.DataFrame(list(pdc))


Seconds :  0.07834887504577637

In [13]:
stateName = {'25':'MASSACHUSETTS',
             '04':'ARIZONA', 
             '08':'COLORADO',
             '38':'NORTH DAKOTA', 
             '09':'CONNECTICUT', 
             '19':'IOWA', 
             '26':'MICHIGAN', 
             '48':'TEXAS',
             '35':'NEW MEXICO',
             '17':'ILLINOIS', 
             '51':'VIRGINIA',
             '23':'MAINE',
             '16':'IDAHO',
             '36':'NEW YORK',
             '56':'WYOMING',
             '29':'MISSOURI',
             '39':'OHIO',
             '28':'MISSISSIPI', 
             '11':'DISTRICT OF COLOMBIA',
             '21':'KENTUCKY', 
             '18':'INDIANA',
             '06':'CALIFORNIA',
             '47':'TENNESSEE', 
             '12':'FLORIDA',
             '24':'MARYLAND',
             '34':'NEW JERSEY', 
             '46':'SOUTH DAKOTA',
             '13':'GEORGIA',
             '55':'WISCONSIN',
             '30':'MONTANA',
             '54':'WEST VIGINIA',
             '15':'HAWAII', 
             '32':'NEVADA', 
             '37':'NORTH CAROLINA',
             '10':'DELAWARE', 
             '33':'NEW HAMPSHIRE', 
             '44':'RHODE ISLAND',
             '50':'VERMONT', 
             '42':'PENNSYLVANIA', 
             '05':'ARKANSAS', 
             '20':'KANSAS', 
             '45':'SOUTH CAROLINA',
             '22':'LOUISIANA',
             '40':'OKLAHOMA', 
             '72':'PUERTO RICO', 
             '41':'OREGON',
             '21':'MINNESOTA', 
             '53':'WASHINGTON', 
             '01':'ALABAMA', 
             '31':'NEBRASKA',
             '02':'ALASKA', 
             '49':'UTAH'
               }

nbi['State Name'] =  nbi['stateCode'].map(stateName)

In [14]:
nbi['deckArea']= nbi['deckWidthOutToOut'] * nbi['structureLength']
nbi_summary = nbi[['State Name','deckWidthOutToOut','structureLength','deckArea','averageDailyTraffic']]
nbi_summary = nbi_summary.groupby(['State Name']).agg({'State Name':'count',
                                'deckArea':'sum',
                                'averageDailyTraffic':'sum'})
nbi_summary = nbi_summary.rename(columns={'State Name': 'Count', 'deckArea':'Sum of Deck Area','averageDailyTraffic':
                                         'Sum of ADT'})

In [15]:
nbi_summary


Out[15]:
Count Sum of Deck Area Sum of ADT
State Name
ALABAMA 16098 9238040.53 76613580
ALASKA 1488 710703.90 3357512
ARIZONA 8154 5055845.20 97256999
ARKANSAS 12871 6381829.40 49636105
CALIFORNIA 25431 29478067.98 667205896
COLORADO 8680 4926475.88 68595652
CONNECTICUT 4214 3265916.52 78275293
DELAWARE 877 978596.38 11361551
DISTRICT OF COLOMBIA 245 568827.33 7660611
FLORIDA 12313 16759415.81 213337215
GEORGIA 14835 9404058.27 135068507
HAWAII 1132 1319916.61 27400179
IDAHO 4445 1723198.53 11566802
ILLINOIS 26704 13004416.75 133709726
INDIANA 19245 7997458.47 97813523
IOWA 24184 8248470.40 34260303
KANSAS 25013 8192548.56 46401837
LOUISIANA 12915 16387705.86 80282307
MAINE 2450 1217855.97 11420274
MARYLAND 5321 5155225.32 116163051
MASSACHUSETTS 5171 4055694.07 114458271
MICHIGAN 11156 6357402.03 93784086
MINNESOTA 14265 6053639.35 64332373
MISSISSIPI 17068 9093623.83 43837769
MISSOURI 24468 10361960.26 86300007
MONTANA 5276 2000703.82 10505792
NEBRASKA 15334 3969666.40 22562432
NEVADA 1933 1595543.65 32174713
NEW HAMPSHIRE 2486 1120425.98 16910083
NEW JERSEY 6730 6878062.72 160237725
NEW MEXICO 3973 1728319.62 26782628
NEW YORK 17462 12869629.22 169682377
NORTH CAROLINA 18099 9221997.64 112602177
NORTH DAKOTA 4400 1232040.30 4423736
OHIO 56568 26635142.36 354235716
OKLAHOMA 23053 8350748.75 70967585
OREGON 8118 5017158.18 56756580
PENNSYLVANIA 22791 12377122.98 163719521
PUERTO RICO 2205 1973541.98 38053215
RHODE ISLAND 772 786182.40 15845827
SOUTH CAROLINA 9358 6800337.44 46154561
SOUTH DAKOTA 5849 1664936.80 7112750
TENNESSEE 20123 9659962.24 156431834
TEXAS 53488 47253738.73 543042520
UTAH 3039 1922455.87 63527662
VERMONT 2766 853119.53 6841770
VIRGINIA 13892 9620708.82 122500324
WASHINGTON 8178 6890269.73 66350712
WEST VIGINIA 7217 3805059.64 24843795
WISCONSIN 14230 6706707.15 78666403
WYOMING 3128 1269968.77 7350946

Bridge Counts across states


In [16]:
nbi_summary['Count'].plot(kind = 'bar', figsize = (11,9))


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f09b90175f8>

Deck Area Summary across states


In [17]:
nbi_summary['Sum of Deck Area'].plot(kind = 'bar', figsize = (11,9))


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f09a0e249e8>

Summary of Average Daily Traffic across states


In [18]:
nbi_summary['Sum of ADT'].plot(kind = 'bar', figsize = (11,9))


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f09b8be2160>

Breakdown of Agencies Responsible for the maintenance of U.S. Highway Bridges


In [33]:
pipeline = [{"$match": {"year":2016}},
            {"$project":{"_id":0, "maintenanceReponsibility":1,"deckWidthOutToOut":1, "structureLength":1, "averageDailyTraffic":1 }}]
startTime = time.time()
pdc1 = collection.aggregate(pipeline)
print("Seconds : ", (time.time() - startTime))
nbi_main = pd.DataFrame(list(pdc1))


Seconds :  0.07691693305969238

In [34]:
maintenanceReponsibility ={ -1 : 'NA',
                      1 : 'State Highway Agency',
                      21: 'Other State Agency',
                      4 : 'City or Municipal Highway Agency',
                      80: 'Unknown',
                      66: 'National Park Service',
                      2 : 'County Highway Agency',
                      60: 'Other Federal Agencies (not listest below)',
                      64: 'U.S forest Services' ,
                      68: 'Bureau of Land Management',
                      26: 'Private (other than railroad)',
                      62: 'Bureau of Indian Affairs', 
                      3 : 'Town or Township Highway Agency',
                      25: 'Other Local Agencies',
                      11: 'State Park, Forest or Reservation Agency',
                      63: 'Bureau of Fish and Wildlife',
                      27: 'Railroad',
                      74: 'Army',
                      70: 'Corps of Engineers (Civil)',
                      72: 'Air Force',
                      61: 'Indian Tribal Agency',
                      71: 'Corps of Engineers (Military)',
                      69: 'Bureau of Reclamation',
                      67: 'Tennesssee Valley Authority',
                      32: 'Local Toll Authority',
                      12: 'Local Park, Forest or Reservation Agency',
                      31: 'State Toll Authority',
                      73: 'Navy / Marines',
                      75: 'NASA',
                      76: 'Metropolitian Washington Airports Service'
}
  
nbi_main['Maintenance Reponsibility'] =  nbi_main['maintenanceReponsibility'].map(maintenanceReponsibility)
nbi_main['deckArea']= nbi_main['deckWidthOutToOut'] * nbi_main['structureLength']
nbi_main_maintenance=nbi_main.groupby(['Maintenance Reponsibility']).agg({'Maintenance Reponsibility':'count',
                                'deckArea':'sum',
                                'averageDailyTraffic':'sum'})
mant_valid_bridges = nbi_main_maintenance['Maintenance Reponsibility'].sum()
mant_sum_deck = nbi_main_maintenance['deckArea'].sum()
mant_sum_adt = nbi_main_maintenance['averageDailyTraffic'].sum()
mant_valid_bridges = nbi_main_maintenance['Maintenance Reponsibility'].sum()
mant_sum_deck = nbi_main_maintenance['deckArea'].sum()
mant_sum_adt = nbi_main_maintenance['averageDailyTraffic'].sum()

In [35]:
nbi_main_maintenance['Percent Valid Bridge']= (nbi_main_maintenance['Maintenance Reponsibility']/mant_valid_bridges)*100
nbi_main_maintenance['Percent Deck Area']= (nbi_main_maintenance['deckArea'] / mant_sum_deck)*100
nbi_main_maintenance['Percent ADT']= (nbi_main_maintenance['averageDailyTraffic'] / mant_sum_adt)*100
nbi_main_maintenance['deckArea'] = nbi_main_maintenance['deckArea'].apply(lambda x: '{:.2f}'.format(x))

nbi_main_maintenance = nbi_main_maintenance.rename(columns = {"Maintenance Reponsibilty":"Count",
                                                              "deckArea":"Sum of Deck Area",
                                                               "averageDailyTraffic":"Sum of ADT"})

In [36]:
nbi_main_maintenance


Out[36]:
Maintenance Reponsibility Sum of Deck Area Sum of ADT Percent Valid Bridge Percent Deck Area Percent ADT
Maintenance Reponsibility
Air Force 287 121469.57 449042 0.044665 0.031584 0.009352
Army 985 219463.57 711242 0.153292 0.057063 0.014812
Bureau of Fish and Wildlife 304 41538.49 25695 0.047310 0.010801 0.000535
Bureau of Indian Affairs 944 248088.63 552992 0.146911 0.064506 0.011517
Bureau of Land Management 481 79536.58 20686 0.074856 0.020681 0.000431
Bureau of Reclamation 146 40043.25 61985 0.022721 0.010412 0.001291
City or Municipal Highway Agency 46975 26482462.24 310286133 7.310533 6.885786 6.462026
Corps of Engineers (Civil) 245 366675.07 428039 0.038128 0.095340 0.008914
County Highway Agency 244552 49576983.27 240400073 38.058659 12.890663 5.006577
Indian Tribal Agency 39 3607.57 3880 0.006069 0.000938 0.000081
Local Park, Forest or Reservation Agency 81 18803.75 187565 0.012606 0.004889 0.003906
Local Toll Authority 1158 4880722.87 26862081 0.180215 1.269052 0.559430
Metropolitian Washington Airports Service 51 46870.62 776200 0.007937 0.012187 0.016165
NA 23 13544.39 91002 0.003579 0.003522 0.001895
NASA 6 20869.13 20428 0.000934 0.005426 0.000425
National Park Service 1313 593206.15 4156536 0.204337 0.154241 0.086564
Navy / Marines 214 147731.32 309184 0.033304 0.038412 0.006439
Other Federal Agencies (not listest below) 14 14362.89 63484 0.002179 0.003735 0.001322
Other Local Agencies 1548 1519572.29 4464528 0.240909 0.395109 0.092978
Other State Agency 778 662541.42 4277990 0.121077 0.172269 0.089094
Private (other than railroad) 776 1076508.62 11389681 0.120766 0.279906 0.237202
Railroad 530 251812.41 2086452 0.082482 0.065475 0.043452
State Highway Agency 296467 278558303.91 3962699890 46.137984 72.428798 82.527277
State Park, Forest or Reservation Agency 1180 228839.46 658890 0.183639 0.059501 0.013722
State Toll Authority 8187 14377100.30 211637887 1.274110 3.738234 4.407575
Tennesssee Valley Authority 36 100292.60 114529 0.005603 0.026077 0.002385
Town or Township Highway Agency 30032 4291058.24 18653453 4.673761 1.115731 0.388477
U.S forest Services 5194 597138.73 267702 0.808322 0.155264 0.005575
Unknown 20 16913.49 27774 0.003113 0.004398 0.000578

In [22]:
nbi_main_maintenance['Percent Valid Bridge'].plot.bar(figsize = (10,8))


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f09b8ab2898>

In [23]:
nbi_main_maintenance['Percent ADT'].plot.bar(figsize = (10,8))


Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f09b830ec88>

In [24]:
nbi_main_maintenance['Percent Deck Area'].plot.bar(figsize = (10,8))


Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f09b8612048>

Structure Material and Type of Construction


In [25]:
pipelineSM = [{"$match":{"year":2016}},
               {"$group":{"_id":"$structureTypeMain", "count":{"$sum":1}}}]
startTime = time.time()
SM=collection.aggregate(pipelineSM)
print("Seconds: ",(time.time() - startTime))
SM_df = pd.DataFrame(list(SM))
structMat_Type= SM_df.sort_values('count',ascending = 0).head(20)
structMat_Type['kindOfMaterialDesign'] = structMat_Type._id.apply(lambda x: x.get('kindOfMaterialDesign'))
structMat_Type['typeOfDesignConstruction'] = structMat_Type._id.apply(lambda x: x.get('typeOfDesignConstruction'))


Seconds:  2.0784292221069336

In [26]:
Material ={ -1 : 'NA',
                    1 : 'Concrete',
                    2 : 'Concrete Continuous',
                    3 : 'Steel',
                    4 : 'Steel Continuous',
                    5 : 'Prestressed concrete',
                    6 : 'Prestressed concrete continuous',
                    7 : 'Wood or Timber',
                    8 : 'Masonry',
                    9 : 'Aluminium, Wrought, Iron, Cast Iron',
                    0 : 'Other'
}
typeOfDesign  = { -1 : 'NA',
                    1 : 'Slab',
                    2 : 'Stringer / Multi-beam or Girder',
                    3 : 'Gider and Floorbeam System',
                    4 : 'Tee Beam',
                    5 : 'Box Beam or Girder - Multiple',
                    6 : 'Box Beam or Girder - Single or Spread ',
                    7 : 'Frame',
                    8 : 'Orthotropic',
                    9 : 'Truss - Deck',
                   10 : 'Truss - Thru',
                   11 : 'Arch - Deck', 
                   13 : 'Suspension',
                   14 : 'Stayed Girder',
                   15 : 'Movable - Lift',
                   16 : 'Movable - Bascule',
                   17 : 'Movabale - swing',
                   18 : 'Tunnel',
                   19 : 'Culvert',
                   20 : 'Mixed Type',
                   21 : 'Segmental Box Girder',
                   22 : 'Channel Beam',
                   0 : 'Other'
                }
structMat_Type['Material'] = structMat_Type['kindOfMaterialDesign'].map(Material)
structMat_Type['Type of Construction'] = structMat_Type['typeOfDesignConstruction'].map(typeOfDesign)
cols = ['Material','Type of Construction','count']
structMat_Type[cols]


Out[26]:
Material Type of Construction count
58 Steel Stringer / Multi-beam or Girder 102750
54 Concrete Culvert 93248
73 Prestressed concrete Stringer / Multi-beam or Girder 57734
52 Steel Continuous Stringer / Multi-beam or Girder 53597
82 Prestressed concrete Box Beam or Girder - Multiple 49386
67 Concrete Continuous Slab 35238
100 Concrete Slab 33991
106 Concrete Continuous Culvert 29097
97 Concrete Tee Beam 20432
47 Prestressed concrete continuous Stringer / Multi-beam or Girder 17030
88 Wood or Timber Stringer / Multi-beam or Girder 15330
87 Steel Culvert 15325
89 Concrete Channel Beam 12958
98 Prestressed concrete Slab 12217
80 Concrete Stringer / Multi-beam or Girder 9567
66 Steel Truss - Thru 8993
65 Prestressed concrete Tee Beam 8436
70 Concrete Continuous Tee Beam 6501
109 Prestressed concrete Box Beam or Girder - Single or Spread 5888
122 Concrete Frame 5610

In [27]:
cols = ['Material','Type of Construction','count']
structMat_Type[cols]
structMat_Type['Material / Type of Construction'] = structMat_Type['Material'].astype(str)+" - "+structMat_Type['Type of Construction']

In [28]:
cols = ['Material / Type of Construction', 'count']
structMat_Type = structMat_Type[cols]
structMat_Type = structMat_Type.set_index('Material / Type of Construction')
structMat_Type.plot(kind = 'bar', figsize=(11,9))


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f09b88bee80>

Scour Critical Bridges


In [29]:
pipeline = [{"$match": {"year":2016}},
            {"$project":{"_id":0, "stateCode":1,"scourCriticalBridges":1}}]
startTime = time.time()
scour = collection.aggregate(pipeline)
print("Seconds : ", (time.time() - startTime))
scour_df = pd.DataFrame(list(scour))


Seconds :  0.15120792388916016

In [30]:
scour_df['scourCriticalStatus'] = None
for index, row in scour_df.iterrows():
    if row.scourCriticalBridges in ["0","1","2","3"]:
        scour_df.loc[index, 'scourCriticalStatus'] = "True"
    else:
        scour_df.loc[index, 'scourCriticalStatus'] = "False"

In [37]:
scour_df.groupby(['scourCriticalStatus'])['scourCriticalStatus'].count().plot.bar()


Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f09b5e9dfd0>

In [ ]: