In [1]:
import pandas as pd
from os import listdir
from os.path import isfile, join

In [2]:
# Load a single monthly street-level CSV (City of London, 2015-04) as a sample
# to inspect before reading the rest of the files.
data_example = pd.read_csv("data/2015-04/2015-04-city-of-london-street.csv")

In [3]:
# Preview the first five rows of the sample file to see which columns it has.
data_example.head(5)


Out[3]:
Crime ID Month Reported by Falls within Longitude Latitude Location LSOA code LSOA name Crime type Last outcome category Context
0 9d28c69cc3695dc7972f8a990b8f2224d4aa10dcf87f8e... 2015-04 City of London Police City of London Police -0.111497 51.518226 On or near Pedestrian Subway E01000914 Camden 028B Other theft Investigation complete; no suspect identified NaN
1 b8571f69a113df635ce0911394273418481246a7536de8... 2015-04 City of London Police City of London Police -0.113767 51.517372 On or near Stone Buildings E01000914 Camden 028B Theft from the person Investigation complete; no suspect identified NaN
2 NaN 2015-04 City of London Police City of London Police -0.097736 51.520206 On or near Conference/Exhibition Centre E01000001 City of London 001A Anti-social behaviour NaN NaN
3 NaN 2015-04 City of London Police City of London Police -0.097601 51.520699 On or near Carthusian Street E01000001 City of London 001A Anti-social behaviour NaN NaN
4 05f9ae9d8f9c2d06457fa33060ceabb7fc6108d2ba8f37... 2015-04 City of London Police City of London Police -0.098572 51.516767 On or near King Edward Street E01000001 City of London 001A Bicycle theft Investigation complete; no suspect identified NaN

In [4]:
# List the distinct values found in the "Crime type" column of the sample file.
data_example["Crime type"].unique()


Out[4]:
array(['Other theft', 'Theft from the person', 'Anti-social behaviour',
       'Bicycle theft', 'Drugs', 'Public order', 'Shoplifting',
       'Violence and sexual offences', 'Other crime',
       'Criminal damage and arson', 'Burglary', 'Robbery', 'Vehicle crime',
       'Possession of weapons'], dtype=object)

In [5]:
# Build one DataFrame from every "*street.csv" file found inside the
# 2015-prefixed sub-folders of data/.
dirs = listdir("data")

frames = []
for i in dirs:
    # Only folders whose name starts with "2015" are wanted; this also
    # excludes stray entries such as ".DS_Store".
    if not i.startswith("2015"):
        continue
    for l in listdir(join("data", i)):
        # Keep street-level files only (names ending in "street.csv").
        if l.endswith("street.csv"):
            frames.append(pd.read_csv(join("data", i, l)))

# Concatenate once at the end. Calling DataFrame.append inside the loop copied
# every previously loaded row on each iteration, and that method no longer
# exists in pandas >= 2.0. Row indexes are kept as read from each file, exactly
# as append did. pd.concat rejects an empty list, so fall back to the empty
# frame the original code would have produced when nothing matches.
data = pd.concat(frames) if frames else pd.DataFrame()

In [6]:
# (rows, columns) of the combined frame.
data.shape


Out[6]:
(3827573, 12)

In [7]:
# Keep just the two coordinate columns of the combined frame.
coordinate_columns = ["Longitude", "Latitude"]
coord = data[coordinate_columns]

In [8]:
from math import radians, cos, sin, asin, sqrt
london_geo = {"Latitude": 51.507, "Longitude": -0.127}
def haversine(coord, origin=None, radius_km=6367):
    """
    Great-circle distance, in kilometres, from a reference point to ``coord``.

    Parameters
    ----------
    coord : mapping-like (dict, DataFrame row, ...)
        Must provide "Longitude" and "Latitude" in decimal degrees.
    origin : mapping-like, optional
        Reference point with the same two keys. Defaults to ``london_geo``,
        looked up at call time.
    radius_km : float, optional
        Sphere radius used to scale the central angle (default 6367).

    Returns
    -------
    float
        Distance in kilometres; NaN if either coordinate of ``coord`` is NaN.
    """
    if origin is None:
        origin = london_geo
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(
        radians,
        [origin["Longitude"], origin["Latitude"], coord["Longitude"], coord["Latitude"]],
    )
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Rounding can push sqrt(a) marginally above 1 for near-antipodal points,
    # which would make asin raise a domain error. Argument order matters:
    # min(nan, 1.0) stays nan, so missing coordinates still propagate as NaN.
    c = 2 * asin(min(sqrt(a), 1.0))
    km = radius_km * c
    return km

In [9]:
# Discard rows that have no latitude recorded.
has_latitude = coord["Latitude"].notnull()
new_coord = coord[has_latitude]

In [10]:
# Add a constant "Weight" of 1 per row as the last column, then preview the frame.
new_coord.insert(loc=new_coord.shape[1], column="Weight", value=1)
new_coord.head()


Out[10]:
Longitude Latitude Weight
0 -2.509930 51.410873 1
1 -2.512153 51.412941 1
2 -2.515816 51.408717 1
3 -2.511761 51.409966 1
4 -2.509126 51.416137 1

In [11]:
# Collapse repeated locations: one row per (Latitude, Longitude) pair whose
# "Weight" is the sum of the weights of the rows sharing that pair.
by_location = new_coord.groupby(["Latitude", "Longitude"], as_index=False)
new_coord = by_location["Weight"].sum()

In [12]:
# Print the column dtypes, non-null counts and memory usage of the aggregated frame.
new_coord.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 585740 entries, 0 to 585739
Data columns (total 3 columns):
Latitude     585740 non-null float64
Longitude    585740 non-null float64
Weight       585740 non-null int64
dtypes: float64(2), int64(1)
memory usage: 17.9 MB

In [13]:
# Add a "Distance" column: haversine() evaluated once per row.
# Plain column assignment is used instead of DataFrame.insert so that
# re-running this cell overwrites the column rather than raising
# "cannot insert Distance, already exists"; on a first run the column still
# lands last. Only the two coordinate columns haversine reads are passed to
# the row-wise apply.
new_coord["Distance"] = new_coord[["Longitude", "Latitude"]].apply(haversine, axis=1)

In [14]:
# Shape of the subset whose "Distance" is at most 30.
within_range = new_coord["Distance"] <= 30
new_coord[within_range].shape


Out[14]:
(68963, 4)

In [15]:
# Write the rows whose "Distance" is at most 30 to coord.json, one JSON object per row.
nearby = new_coord[new_coord["Distance"] <= 30]
nearby.to_json("coord.json", orient="records")

In [ ]: