In [1]:
import pandas as pd
from os import listdir
from os.path import isfile, join
In [2]:
# Load one month of City of London street-level data as a sample to inspect.
sample_csv_path = "data/2015-04/2015-04-city-of-london-street.csv"
data_example = pd.read_csv(sample_csv_path)
In [3]:
# Preview the first five rows of the sample file.
data_example.head(5)
Out[3]:
In [4]:
# List the distinct values found in the "Crime type" column of the sample.
data_example["Crime type"].unique()
Out[4]:
In [5]:
# Gather every 2015 street-level CSV found under data/<folder>/ into one frame.
# Listings are sorted so the row order is the same on every run and machine
# (os.listdir returns entries in arbitrary order).
dirs = sorted(listdir("data"))

frames = []
for month_dir in dirs:
    # Only folders whose name starts with "2015" are wanted; this test also
    # rules out stray entries such as ".DS_Store".
    if not month_dir.startswith("2015"):
        continue
    for file_name in sorted(listdir(join("data", month_dir))):
        # Keep the "...street.csv" files only; other files in the folder are ignored.
        if file_name.endswith("street.csv"):
            frames.append(pd.read_csv(join("data", month_dir, file_name)))

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0, and
# growing a frame inside the loop re-copies all accumulated rows each time.
# Each file keeps its own row index, exactly as repeated appends did.
data = pd.concat(frames) if frames else pd.DataFrame()
In [6]:
# Show (rows, columns) of the combined frame as a quick sanity check on the load.
data.shape
Out[6]:
In [7]:
# Keep only the two coordinate columns from the combined frame.
coord = data.loc[:, ["Longitude", "Latitude"]]
In [8]:
from math import radians, cos, sin, asin, sqrt
# Reference point (decimal degrees) used as the default origin for distances.
london_geo = {"Latitude": 51.507, "Longitude": -0.127}


def haversine(coord, origin=None, radius_km=6367):
    """
    Return the great-circle distance in kilometres from ``origin`` to ``coord``.

    Parameters
    ----------
    coord : mapping
        Anything indexable by ``"Longitude"`` and ``"Latitude"`` (a dict or a
        DataFrame row), with both values in decimal degrees.
    origin : mapping, optional
        Start point with the same two keys. Defaults to ``london_geo``.
    radius_km : float, optional
        Radius of the sphere in kilometres. Defaults to 6367, the value this
        function has always used.

    Returns
    -------
    float
        Distance in kilometres. NaN coordinates propagate to a NaN result.
    """
    if origin is None:
        origin = london_geo

    # Convert decimal degrees to radians.
    lon1, lat1, lon2, lat2 = map(
        radians,
        [origin["Longitude"], origin["Latitude"], coord["Longitude"], coord["Latitude"]],
    )

    # Haversine formula.
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # Rounding can push `a` a hair above 1 for near-antipodal points, which
    # would make asin() raise a domain error. A plain comparison is used
    # (instead of min/max) so that NaN is left untouched and still propagates.
    if a > 1.0:
        a = 1.0

    c = 2 * asin(sqrt(a))
    return radius_km * c
In [9]:
# Keep only the rows that actually have a latitude value.
has_latitude = coord["Latitude"].notna()
new_coord = coord[has_latitude]
In [10]:
# Give every row a weight of 1 so that summing per location later yields a
# count. assign() appends the column at the end, like the positional insert
# did, but does not raise "already exists" when this cell is run a second time.
new_coord = new_coord.assign(Weight=1)
new_coord.head()
Out[10]:
In [11]:
# Collapse duplicate locations: one row per (Latitude, Longitude) pair, with
# Weight holding the sum of the weights of the rows that shared that pair.
new_coord = (
    new_coord
    .groupby(["Latitude", "Longitude"], as_index=False)["Weight"]
    .sum()
)
In [12]:
# Print the column dtypes, non-null counts and memory use of the aggregated frame.
new_coord.info()
In [13]:
# Add each location's distance as computed row by row with haversine().
# Plain column assignment puts a new column last, like the positional insert
# did, and simply overwrites it on a re-run instead of raising "already exists".
new_coord["Distance"] = new_coord.apply(haversine, axis=1)
In [14]:
# How many locations have a Distance of at most 30?
new_coord.loc[new_coord["Distance"] <= 30].shape
Out[14]:
In [15]:
# Write the locations whose Distance is at most 30 to coord.json,
# one JSON object per row.
nearby_mask = new_coord["Distance"] <= 30
new_coord[nearby_mask].to_json("coord.json", orient="records")
In [ ]: