In [4]:
import pandas as pd
import json
import numpy as np
In [5]:
data = pd.read_csv('../python/data/lfb_jan2013-mar2016 original.csv',sep=",")
data[:2]
Out[5]:
In [6]:
##Formatting the data for machine learning
#keep only rows with non-null easting/northing and first-pump attendance time
cleanedData = data[data["Easting_m"].notnull() & data["FirstPumpArriving_AttendanceTime"].notnull()][["Easting_m","Northing_m","IncidentGroup","TimeOfCall","DateOfCall","FirstPumpArriving_AttendanceTime"]]
#convert easting northing to lat/lon
#pyproj handles the British National Grid (BNG) projection -> pip install pyproj
import pyproj
bng = pyproj.Proj(init='epsg:27700')
wgs84 = pyproj.Proj(init='epsg:4326')
#pyproj.transform(from_proj, to_proj, easting, northing)
lon,lat = pyproj.transform(bng,wgs84, cleanedData["Easting_m"].values, cleanedData["Northing_m"].values)
cleanedData["lat"] = lat;
cleanedData["lon"] = lon;
#cleanedData
#adding the hour of call to cleanedData
lst = cleanedData["TimeOfCall"].str.split(':').tolist()
timeCall = [np.int_(item[0]) for item in lst]
cleanedData["HourOfCall"] = timeCall
#converting IncidentGroup to either Fire = 1 or nonFire = 0
incidentDic = {"Fire": 1, "False Alarm" : 0, "Special Service" : 0, np.nan : 0}
cleanedData["fireFlag"] = [str(incidentDic[item]) for item in cleanedData["IncidentGroup"]]
#incidentDic[cleanedData["IncidentGroup"][0]]
#pd.isnan(cleanedData["IncidentGroup"])
from datetime import date
import roman
#Getting the day of the week when the incident happened
#dates in the csv are day.month.year with the month as a Roman numeral (hence the roman package)
#monday = 0, sunday = 6
lst = cleanedData["DateOfCall"].str.split('.').tolist()
cleanedData["DayOfWeek"] = [date(2000+int(i[2]),roman.fromRoman(i[1]),int(i[0])).weekday() for i in lst]
cleanedData[:10]
Out[6]:
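The pyproj.Proj(init=...) / pyproj.transform style used above is deprecated in pyproj 2.x; a minimal sketch of the same BNG-to-WGS84 conversion with the newer Transformer API (assuming pyproj 2.x or later is installed):
In [ ]:
from pyproj import Transformer
#same conversion as above; always_xy=True keeps the (easting, northing) -> (lon, lat) axis order
bng_to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
lon2, lat2 = bng_to_wgs84.transform(cleanedData["Easting_m"].values,
                                    cleanedData["Northing_m"].values)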
In [49]:
from sklearn.cluster import MiniBatchKMeans
#create new dataframe for output dataset
df = pd.DataFrame(data = cleanedData, columns=["lat","lon"])
input = df.values.tolist()
output = input
cluster_centers = {}
for i in range(2,7):
    print("k-" + str(i))
    mbk = MiniBatchKMeans(init='k-means++', n_clusters=i, batch_size=45,
                          n_init=10, max_no_improvement=10, verbose=0)
    mbk.fit(input)
    mbk_means_labels = mbk.labels_
    mbk_means_cluster_centers = mbk.cluster_centers_
    mbk_means_labels_unique = np.unique(mbk_means_labels)
    #print(mbk_means_cluster_centers)
    cleanedData["kmeans"+str(i)] = mbk_means_labels
    center_list = mbk_means_cluster_centers.tolist()
    #reverse each centre from [lat, lon] to [lon, lat] for the D3 map
    clist = [item[::-1] for item in center_list]
    cluster_centers["kmeans"+str(i)] = clist
#for every k, each point gets a label (code) indicating which cluster it belongs to
#the code is used for visualization in D3
df = pd.DataFrame(data = cleanedData, columns=["lat","lon","kmeans2","kmeans3","kmeans4","kmeans5","kmeans6"])
#export every 40th row to json (subsampled for the D3 visualization)
df[::40].reset_index().to_json("cluster.json",orient='records')
print(cluster_centers)
#print(json.dumps(cluster_centers))
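As a quick sanity check of the fitted model (a sketch only; after the loop, mbk holds the last fit, k=6, and the coordinate below is an arbitrary example point, not taken from the dataset):
In [ ]:
#MiniBatchKMeans.predict assigns new [lat, lon] points to the nearest learned centre
sample_point = [[51.51, -0.12]]   #roughly central London
print(mbk.predict(sample_point))  #index of the closest k=6 cluster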
In [ ]:
In [ ]:
In [14]:
#Training vars
#splitting data into learning and testing sets
data = cleanedData[["lat","lon","DayOfWeek"]]
dataLabels = cleanedData["fireFlag"]
dataSize = len(data)
learnSizeKoef = np.int_(0.8 * dataSize)
knnLearnData = data[:learnSizeKoef].values.tolist()
knnLearnLabels = dataLabels[:learnSizeKoef].values.tolist()
knnTestData = data[learnSizeKoef:dataSize].values.tolist()
knnTestLabels = dataLabels[learnSizeKoef:dataSize].values.tolist()
#flatten the nested label lists to 1-D arrays
knnLearnLabels = np.ravel(knnLearnLabels)
knnTestLabels = np.ravel(knnTestLabels)
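The slice above keeps the rows in their original order, so the test set is not a random sample. A minimal alternative sketch using a shuffled split (assuming a scikit-learn version that provides sklearn.model_selection, i.e. 0.18+; random_state is an arbitrary choice for reproducibility):
In [ ]:
from sklearn.model_selection import train_test_split
#shuffled 80/20 split of the same features and labels
X_train, X_test, y_train, y_test = train_test_split(
    data.values, dataLabels.values, test_size=0.2, random_state=0)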
In [15]:
#KNN machine learning part
from sklearn import datasets
from sklearn import metrics
#Training parameters
X = knnLearnData
#labels
y = knnLearnLabels
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=10)
neigh.fit(X, y)
#KNN test predictions
#Mean accuracy
print(neigh.score(knnTestData,knnTestLabels))
expected = knnTestLabels
predicted = neigh.predict(knnTestData)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
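n_neighbors=10 above is an arbitrary choice; a rough sketch for comparing a few values with cross-validation on the training data (again assuming sklearn.model_selection is available; the candidate values are arbitrary):
In [ ]:
from sklearn.model_selection import cross_val_score
#3-fold cross-validated mean accuracy for a few neighbourhood sizes
for k in (5, 10, 20):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=3)
    print(k, scores.mean())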
In [52]:
## Let's predict the location of the next fire depending on the time of day and the day of the week
#still a work in progress
origData = cleanedData[["HourOfCall","DayOfWeek","lat","lon"]]
origDataLabel = cleanedData["fireFlag"]
#Training vars
dataSize = len(origData)
learnSizeKoef = np.int_(0.8 * dataSize)
#splitting data into learning and testing sets
knnLearnData = origData[:learnSizeKoef].values.tolist()
knnLearnLabels = origDataLabel[:learnSizeKoef].values.tolist()
knnTestData = origData[learnSizeKoef:dataSize].values.tolist()
knnTestLabels = origDataLabel[learnSizeKoef:dataSize].values.tolist()
#flatten the nested label lists to 1-D arrays
#knnLearnLabels = np.ravel(knnLearnLabels)
knnTestLabels = np.ravel(knnTestLabels)
print(len(knnLearnLabels))
print(len(knnLearnData))
knnLearnLabels[:10]
Out[52]:
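One possible direction for this unfinished cell (a sketch only, not a validated model: it simply regresses the coordinates of past fires on the time features, and fire locations are unlikely to be well predicted from time alone):
In [ ]:
from sklearn.neighbors import KNeighborsRegressor
#fit on fire incidents only, predicting [lat, lon] from [HourOfCall, DayOfWeek]
fires = cleanedData[cleanedData["fireFlag"] == "1"]
reg = KNeighborsRegressor(n_neighbors=10)
reg.fit(fires[["HourOfCall", "DayOfWeek"]].values, fires[["lat", "lon"]].values)
print(reg.predict([[14, 2]]))  #e.g. 14:00 on a Wednesday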
In [ ]: