In [4]:
import pandas as pd
import json
import numpy as np

In [5]:
# Load the London Fire Brigade incident records (Jan 2013 - Mar 2016).
data = pd.read_csv('../python/data/lfb_jan2013-mar2016 original.csv', sep=',')
# Peek at the first two rows to sanity-check the columns.
data.head(2)


Out[5]:
IncidentNumber DateOfCall TimeOfCall IncidentGroup StopCodeDescription SpecialServiceType PropertyCategory PropertyType AddressQualifier Postcode_full ... Easting_rounded Northing_rounded FRS IncidentStationGround FirstPumpArriving_AttendanceTime FirstPumpArriving_DeployedFromStation SecondPumpArriving_AttendanceTime SecondPumpArriving_DeployedFromStation NumStationsWithPumpsAttending NumPumpsAttending
0 1131 1.I.13 0:02:06 False Alarm AFA NaN Other Residential Boarding House/B&B for homeless/asylum seekers Correct incident address NW6 1PG ... 525450 184850 London West Hampstead 167 West Hampstead NaN NaN 1 1
1 4131 1.I.13 0:02:09 False Alarm AFA NaN Non Residential Single shop Correct incident address UB6 0HY ... 515450 185450 London Northolt 236 Northolt NaN NaN 1 1

2 rows × 27 columns


In [6]:
## Formatting data for machine learning

# Keep only rows that have coordinates and a first-pump attendance time,
# then select the columns the models below need. .copy() makes this an
# independent frame so the column assignments further down do not raise
# SettingWithCopyWarning (the original chained-indexing assignment wrote
# into a view of `data`).
rowMask = data["Easting_m"].notnull() & data["FirstPumpArriving_AttendanceTime"].notnull()
cleanedData = data[rowMask][["Easting_m", "Northing_m", "IncidentGroup", "TimeOfCall",
                             "DateOfCall", "FirstPumpArriving_AttendanceTime"]].copy()

# Convert British National Grid easting/northing to WGS84 lat/lon.
# Requires pyproj (pip install pyproj).
import pyproj

bng = pyproj.Proj(init='epsg:27700')   # British National Grid
wgs84 = pyproj.Proj(init='epsg:4326')  # WGS84 lat/lon

# pyproj.transform(from_proj, to_proj, x, y) is vectorized over arrays.
lon, lat = pyproj.transform(bng, wgs84,
                            cleanedData["Easting_m"].values,
                            cleanedData["Northing_m"].values)
cleanedData["lat"] = lat
cleanedData["lon"] = lon

# Hour of the call, parsed from the "H:MM:SS" TimeOfCall strings.
cleanedData["HourOfCall"] = cleanedData["TimeOfCall"].str.split(':').str[0].astype(int)

# Binary target: Fire = '1', everything else (False Alarm, Special
# Service, missing) = '0'. Kept as strings because the classifier cells
# below consume the labels in that form.
incidentDic = {"Fire": 1, "False Alarm": 0, "Special Service": 0, np.nan: 0}
cleanedData["fireFlag"] = [str(incidentDic[item]) for item in cleanedData["IncidentGroup"]]

from datetime import date
import roman

# Day of week when the incident happened.
# DateOfCall is "D.M.YY" with the month as a Roman numeral (e.g. "1.I.13").
# date.weekday(): Monday = 0 ... Sunday = 6.
dateParts = cleanedData["DateOfCall"].str.split('.').tolist()
cleanedData["DayOfWeek"] = [date(2000 + int(y), roman.fromRoman(m), int(d)).weekday()
                            for d, m, y in dateParts]
cleanedData[:10]


Out[6]:
Easting_m Northing_m IncidentGroup TimeOfCall DateOfCall FirstPumpArriving_AttendanceTime lat lon HourOfCall fireFlag DayOfWeek
0 525424 184894 False Alarm 0:02:06 1.I.13 167 51.549006 -0.192508 0 0 1
1 515405 185445 False Alarm 0:02:09 1.I.13 236 51.556094 -0.336755 0 0 1
2 522456 178647 False Alarm 0:02:54 1.I.13 218 51.493515 -0.237460 0 0 1
3 527814 181016 Fire 0:03:02 1.I.13 426 51.513620 -0.159464 0 1 1
4 533338 180736 False Alarm 0:03:03 1.I.13 346 51.509828 -0.080011 0 0 1
5 520789 186645 Fire 0:05:16 1.I.13 323 51.565753 -0.258720 0 1 1
6 525045 184031 False Alarm 0:05:24 1.I.13 448 51.541334 -0.198278 0 0 1
9 508652 176984 False Alarm 0:16:39 1.I.13 274 51.481386 -0.436728 0 0 1
15 532802 181138 False Alarm 0:38:07 1.I.13 235 51.513567 -0.087579 0 0 1
16 534273 165917 Fire 0:50:18 1.I.13 287 51.376437 -0.072162 0 1 1

In [49]:
from sklearn.cluster import MiniBatchKMeans

# Cluster incident locations for k = 2..6 so the D3 front end can switch
# between clustering granularities.
coords = pd.DataFrame(data=cleanedData, columns=["lat", "lon"]).values.tolist()

cluster_centers = {}

for k in range(2, 7):
    print("k-" + str(k))
    # random_state pins the otherwise stochastic mini-batch sampling and
    # k-means++ seeding so the exported clusters are reproducible across
    # Restart-and-Run-All.
    mbk = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=45,
                          n_init=10, max_no_improvement=10, verbose=0,
                          random_state=0)
    mbk.fit(coords)

    # One label column per k; for every point the label is the id of the
    # cluster it belongs to, used for coloring in D3.
    cleanedData["kmeans" + str(k)] = mbk.labels_

    # cluster_centers_ comes back as [lat, lon] pairs; D3 expects
    # [lon, lat], so reverse each center.
    cluster_centers["kmeans" + str(k)] = [c[::-1] for c in mbk.cluster_centers_.tolist()]

# Export every 40th point (with its per-k cluster labels) for the D3
# visualization.
df = pd.DataFrame(data=cleanedData,
                  columns=["lat", "lon", "kmeans2", "kmeans3", "kmeans4", "kmeans5", "kmeans6"])
df[::40].reset_index().to_json("cluster.json", orient='records')

print(cluster_centers)


k-2
k-3
k-4
k-5
k-6
{'kmeans6': [[-0.056069828823752554, 51.40334877748558], [-0.11272951467098788, 51.56242165052308], [-0.3554630790209041, 51.50983152054485], [-0.16985632915537652, 51.48408794482532], [0.1332605412195496, 51.51602156175918], [-0.01522703888203801, 51.529517316731244]], 'kmeans4': [[-0.3338654314747769, 51.517091639694655], [0.0894639642360957, 51.51366938493721], [-0.10814862420794083, 51.53900849303768], [-0.13019760646462827, 51.430245134008835]], 'kmeans5': [[0.10306148210516339, 51.50583788619866], [-0.09041817046309002, 51.538700185839446], [-0.4017705175862271, 51.51003635133509], [-0.2327686189695014, 51.506221785133704], [-0.0961201260647206, 51.409385222061054]], 'kmeans2': [[-0.21196502847330773, 51.50016899082028], [0.012658447132919583, 51.515914969265054]], 'kmeans3': [[-0.09920349744736034, 51.50938925989318], [-0.31281720306217314, 51.496047862541744], [0.09297534844983701, 51.51086459586324]]}

In [ ]:


In [ ]:


In [14]:
# Build train/test sets for the KNN fire/no-fire classifier.
#
# NOTE(review): this is a sequential 80/20 split (no shuffling), so the
# test set is the most recent ~20% of incidents — confirm that is intended.

# Distinct names so the raw `data` frame loaded at the top of the notebook
# is not clobbered by this cell (the original reused the name `data`).
knnFeatures = cleanedData[["lat", "lon", "DayOfWeek"]]
knnLabels = cleanedData["fireFlag"]

dataSize = len(knnFeatures)
learnSizeKoef = np.int_(0.8 * dataSize)  # number of training rows

# np.ravel flattens the label lists to 1-D, as sklearn expects.
knnLearnData = knnFeatures[:learnSizeKoef].values.tolist()
knnLearnLabels = np.ravel(knnLabels[:learnSizeKoef].values.tolist())

knnTestData = knnFeatures[learnSizeKoef:dataSize].values.tolist()
knnTestLabels = np.ravel(knnLabels[learnSizeKoef:dataSize].values.tolist())

In [15]:
# KNN classification: predict fire ('1') vs non-fire ('0') from
# (lat, lon, DayOfWeek).

from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

X = knnLearnData    # training features
y = knnLearnLabels  # training labels ('0' / '1' strings)

neigh = KNeighborsClassifier(n_neighbors=10)
neigh.fit(X, y)

# Mean accuracy (prumerna presnost) on the held-out 20%.
print(neigh.score(knnTestData, knnTestLabels))

expected = knnTestLabels
predicted = neigh.predict(knnTestData)

# Summarize the fit of the model: per-class precision/recall/F1 and the
# confusion matrix.
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))


0.747603833866
             precision    recall  f1-score   support

          0       0.78      0.93      0.85     24639
          1       0.45      0.18      0.26      7913

avg / total       0.70      0.75      0.70     32552

[[22907  1732]
 [ 6484  1429]]
0.747603833866

In [52]:
## Predict the place of the next fire depending on hour and day of week.
# Still work in progress.

origData = cleanedData[["HourOfCall", "DayOfWeek", "lat", "lon"]]
origDataLabel = cleanedData["fireFlag"]

# Sequential 80/20 split, same scheme as the lat/lon classifier above.
dataSize = len(origData)
learnSizeKoef = np.int_(0.8 * dataSize)  # number of training rows

knnLearnData = origData[:learnSizeKoef].values.tolist()
knnLearnLabels = origDataLabel[:learnSizeKoef].values.tolist()

knnTestData = origData[learnSizeKoef:dataSize].values.tolist()
# NOTE(review): only the test labels are flattened here; the learn labels
# are left as a plain list (matches the original cell) — confirm before
# feeding them to a classifier.
knnTestLabels = np.ravel(origDataLabel[learnSizeKoef:dataSize].values.tolist())

# Sanity check: features and labels line up.
print(len(knnLearnLabels))
print(len(knnLearnData))
knnLearnLabels[:10]


130205
130205
Out[52]:
['0', '0', '0', '1', '0', '1', '0', '0', '0', '1']

In [ ]: