In [4]:
import pandas as pd
import json
import numpy as np

In [5]:
# Load the London Fire Brigade incident records (Jan 2013 - Mar 2016).
data = pd.read_csv('../python/data/lfb_jan2013-mar2016 original.csv', sep=',')
# Peek at the first two rows to sanity-check the columns.
data.head(2)


Out[5]:
IncidentNumber DateOfCall TimeOfCall IncidentGroup StopCodeDescription SpecialServiceType PropertyCategory PropertyType AddressQualifier Postcode_full ... Easting_rounded Northing_rounded FRS IncidentStationGround FirstPumpArriving_AttendanceTime FirstPumpArriving_DeployedFromStation SecondPumpArriving_AttendanceTime SecondPumpArriving_DeployedFromStation NumStationsWithPumpsAttending NumPumpsAttending
0 1131 1.I.13 0:02:06 False Alarm AFA NaN Other Residential Boarding House/B&B for homeless/asylum seekers Correct incident address NW6 1PG ... 525450 184850 London West Hampstead 167 West Hampstead NaN NaN 1 1
1 4131 1.I.13 0:02:09 False Alarm AFA NaN Non Residential Single shop Correct incident address UB6 0HY ... 515450 185450 London Northolt 236 Northolt NaN NaN 1 1

2 rows × 27 columns


In [6]:
## Formatting data for machine learning

# Keep only rows that have coordinates and a first-pump attendance time,
# then select the columns the models below need. .copy() makes this an
# independent frame so the column assignments further down do not raise
# SettingWithCopyWarning (the original chained-indexing assignment wrote
# into a view of `data`).
rowMask = data["Easting_m"].notnull() & data["FirstPumpArriving_AttendanceTime"].notnull()
cleanedData = data[rowMask][["Easting_m", "Northing_m", "IncidentGroup", "TimeOfCall",
                             "DateOfCall", "FirstPumpArriving_AttendanceTime"]].copy()

# Convert British National Grid easting/northing to WGS84 lat/lon.
# Requires pyproj (pip install pyproj).
import pyproj

bng = pyproj.Proj(init='epsg:27700')   # British National Grid
wgs84 = pyproj.Proj(init='epsg:4326')  # WGS84 lat/lon

# pyproj.transform(from_proj, to_proj, x, y) is vectorized over arrays.
lon, lat = pyproj.transform(bng, wgs84,
                            cleanedData["Easting_m"].values,
                            cleanedData["Northing_m"].values)
cleanedData["lat"] = lat
cleanedData["lon"] = lon

# Hour of the call, parsed from the "H:MM:SS" TimeOfCall strings.
cleanedData["HourOfCall"] = cleanedData["TimeOfCall"].str.split(':').str[0].astype(int)

# Binary target: Fire = '1', everything else (False Alarm, Special
# Service, missing) = '0'. Kept as strings because the classifier cells
# below consume the labels in that form.
incidentDic = {"Fire": 1, "False Alarm": 0, "Special Service": 0, np.nan: 0}
cleanedData["fireFlag"] = [str(incidentDic[item]) for item in cleanedData["IncidentGroup"]]

from datetime import date
import roman

# Day of week when the incident happened.
# DateOfCall is "D.M.YY" with the month as a Roman numeral (e.g. "1.I.13").
# date.weekday(): Monday = 0 ... Sunday = 6.
dateParts = cleanedData["DateOfCall"].str.split('.').tolist()
cleanedData["DayOfWeek"] = [date(2000 + int(y), roman.fromRoman(m), int(d)).weekday()
                            for d, m, y in dateParts]
cleanedData[:10]


Out[6]:
Easting_m Northing_m IncidentGroup TimeOfCall DateOfCall FirstPumpArriving_AttendanceTime lat lon HourOfCall fireFlag DayOfWeek
0 525424 184894 False Alarm 0:02:06 1.I.13 167 51.549006 -0.192508 0 0 1
1 515405 185445 False Alarm 0:02:09 1.I.13 236 51.556094 -0.336755 0 0 1
2 522456 178647 False Alarm 0:02:54 1.I.13 218 51.493515 -0.237460 0 0 1
3 527814 181016 Fire 0:03:02 1.I.13 426 51.513620 -0.159464 0 1 1
4 533338 180736 False Alarm 0:03:03 1.I.13 346 51.509828 -0.080011 0 0 1
5 520789 186645 Fire 0:05:16 1.I.13 323 51.565753 -0.258720 0 1 1
6 525045 184031 False Alarm 0:05:24 1.I.13 448 51.541334 -0.198278 0 0 1
9 508652 176984 False Alarm 0:16:39 1.I.13 274 51.481386 -0.436728 0 0 1
15 532802 181138 False Alarm 0:38:07 1.I.13 235 51.513567 -0.087579 0 0 1
16 534273 165917 Fire 0:50:18 1.I.13 287 51.376437 -0.072162 0 1 1

In [49]:
from sklearn.cluster import MiniBatchKMeans

# Cluster incident locations for k = 2..6 so the D3 front end can switch
# between clustering granularities.
coords = pd.DataFrame(data=cleanedData, columns=["lat", "lon"]).values.tolist()

cluster_centers = {}

for k in range(2, 7):
    print("k-" + str(k))
    # random_state pins the otherwise stochastic mini-batch sampling and
    # k-means++ seeding so the exported clusters are reproducible across
    # Restart-and-Run-All.
    mbk = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=45,
                          n_init=10, max_no_improvement=10, verbose=0,
                          random_state=0)
    mbk.fit(coords)

    # One label column per k; for every point the label is the id of the
    # cluster it belongs to, used for coloring in D3.
    cleanedData["kmeans" + str(k)] = mbk.labels_

    # cluster_centers_ comes back as [lat, lon] pairs; D3 expects
    # [lon, lat], so reverse each center.
    cluster_centers["kmeans" + str(k)] = [c[::-1] for c in mbk.cluster_centers_.tolist()]

# Export every 40th point (with its per-k cluster labels) for the D3
# visualization.
df = pd.DataFrame(data=cleanedData,
                  columns=["lat", "lon", "kmeans2", "kmeans3", "kmeans4", "kmeans5", "kmeans6"])
df[::40].reset_index().to_json("cluster.json", orient='records')

print(cluster_centers)


k-2
k-3
k-4
k-5
k-6
{'kmeans6': [[-0.056069828823752554, 51.40334877748558], [-0.11272951467098788, 51.56242165052308], [-0.3554630790209041, 51.50983152054485], [-0.16985632915537652, 51.48408794482532], [0.1332605412195496, 51.51602156175918], [-0.01522703888203801, 51.529517316731244]], 'kmeans4': [[-0.3338654314747769, 51.517091639694655], [0.0894639642360957, 51.51366938493721], [-0.10814862420794083, 51.53900849303768], [-0.13019760646462827, 51.430245134008835]], 'kmeans5': [[0.10306148210516339, 51.50583788619866], [-0.09041817046309002, 51.538700185839446], [-0.4017705175862271, 51.51003635133509], [-0.2327686189695014, 51.506221785133704], [-0.0961201260647206, 51.409385222061054]], 'kmeans2': [[-0.21196502847330773, 51.50016899082028], [0.012658447132919583, 51.515914969265054]], 'kmeans3': [[-0.09920349744736034, 51.50938925989318], [-0.31281720306217314, 51.496047862541744], [0.09297534844983701, 51.51086459586324]]}

In [ ]:


In [ ]:


In [14]:
# Build train/test sets for the KNN fire/no-fire classifier.
#
# NOTE(review): this is a sequential 80/20 split (no shuffling), so the
# test set is the most recent ~20% of incidents — confirm that is intended.

# Distinct names so the raw `data` frame loaded at the top of the notebook
# is not clobbered by this cell (the original reused the name `data`).
knnFeatures = cleanedData[["lat", "lon", "DayOfWeek"]]
knnLabels = cleanedData["fireFlag"]

dataSize = len(knnFeatures)
learnSizeKoef = np.int_(0.8 * dataSize)  # number of training rows

# np.ravel flattens the label lists to 1-D, as sklearn expects.
knnLearnData = knnFeatures[:learnSizeKoef].values.tolist()
knnLearnLabels = np.ravel(knnLabels[:learnSizeKoef].values.tolist())

knnTestData = knnFeatures[learnSizeKoef:dataSize].values.tolist()
knnTestLabels = np.ravel(knnLabels[learnSizeKoef:dataSize].values.tolist())

In [15]:
# KNN classification: predict fire ('1') vs non-fire ('0') from
# (lat, lon, DayOfWeek).

from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

X = knnLearnData    # training features
y = knnLearnLabels  # training labels ('0' / '1' strings)

neigh = KNeighborsClassifier(n_neighbors=10)
neigh.fit(X, y)

# Mean accuracy (prumerna presnost) on the held-out 20%.
print(neigh.score(knnTestData, knnTestLabels))

expected = knnTestLabels
predicted = neigh.predict(knnTestData)

# Summarize the fit of the model: per-class precision/recall/F1 and the
# confusion matrix.
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))


0.747603833866
             precision    recall  f1-score   support

          0       0.78      0.93      0.85     24639
          1       0.45      0.18      0.26      7913

avg / total       0.70      0.75      0.70     32552

[[22907  1732]
 [ 6484  1429]]
0.747603833866

In [52]:
## Predict the place of the next fire depending on hour and day of week.
# Still work in progress.

origData = cleanedData[["HourOfCall", "DayOfWeek", "lat", "lon"]]
origDataLabel = cleanedData["fireFlag"]

# Sequential 80/20 split, same scheme as the lat/lon classifier above.
dataSize = len(origData)
learnSizeKoef = np.int_(0.8 * dataSize)  # number of training rows

knnLearnData = origData[:learnSizeKoef].values.tolist()
knnLearnLabels = origDataLabel[:learnSizeKoef].values.tolist()

knnTestData = origData[learnSizeKoef:dataSize].values.tolist()
# NOTE(review): only the test labels are flattened here; the learn labels
# are left as a plain list (matches the original cell) — confirm before
# feeding them to a classifier.
knnTestLabels = np.ravel(origDataLabel[learnSizeKoef:dataSize].values.tolist())

# Sanity check: features and labels line up.
print(len(knnLearnLabels))
print(len(knnLearnData))
knnLearnLabels[:10]


130205
130205
Out[52]:
['0', '0', '0', '1', '0', '1', '0', '0', '0', '1']

In [ ]: