In [1]:
import csv 

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

import datetime
import pandas.io.data
%matplotlib inline


C:\Anaconda2\lib\site-packages\pandas\io\data.py:33: FutureWarning: 
The pandas.io.data module is moved to a separate package (pandas-datareader) and will be removed from pandas in a future version.
After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.
  FutureWarning)

In [2]:
#Loads the data
dateparse = lambda x: pd.datetime.strptime(x, '%m/%d/%Y %H:%M')

df = pd.read_csv("C:\Users\Thinkpad\Documents\DTU\Python\week 3\SFCrime_from_1_Jan_2003.csv", 
                 parse_dates={'datetime': ['Date', 'Time']}, date_parser=dateparse)

In [3]:
df.columns


Out[3]:
Index([u'datetime', u'IncidntNum', u'Category', u'Descript', u'DayOfWeek',
       u'PdDistrict', u'Resolution', u'Address', u'X', u'Y', u'Location',
       u'PdId'],
      dtype='object')

In [4]:
cc_df = DataFrame(df,columns = ['Category','X','Y'])

cc_df.head()


Out[4]:
Category X Y
0 OTHER OFFENSES -122.413791 37.783837
1 DRUG/NARCOTIC -122.413791 37.783837
2 WARRANTS -122.413791 37.783837
3 NON-CRIMINAL -122.401206 37.760355
4 SUSPICIOUS OCC -122.411615 37.783161

In [5]:
# and now we sort out all other crimes than Prostitution
cc_df = cc_df[cc_df['Category'] == 'PROSTITUTION']

cc_df.head() # ~the index from old is still kept


Out[5]:
Category X Y
184 PROSTITUTION -122.427966 37.711823
185 PROSTITUTION -122.427966 37.711823
186 PROSTITUTION -122.427966 37.711823
769 PROSTITUTION -122.422063 37.789920
771 PROSTITUTION -122.422063 37.789920

In [6]:
# ~just for the visual I reset the index...
cc_df = cc_df.reset_index(drop=True)
# and rename X and Y to longtitude and lattitude respectively,
cc_df.rename(columns={'X': 'lon', 'Y': 'lat'}, inplace=True)

cc_df.head()


Out[6]:
Category lon lat
0 PROSTITUTION -122.427966 37.711823
1 PROSTITUTION -122.427966 37.711823
2 PROSTITUTION -122.427966 37.711823
3 PROSTITUTION -122.422063 37.789920
4 PROSTITUTION -122.422063 37.789920

In [7]:
# For plotting. Cant find the pip folder to install !             
# import geoplotlib
# from geoplotlib.utils import BoundingBox

In [8]:
"""
def plotCrime(crime):
    geo_data_for_plotting = crimes_dict[crime] #Since our lat/long is allready stored in correct format, we simpy insert
    
    #bbox defines the boundaries of the map used for plotting
    bbox = BoundingBox(north=max(crimes_dict[crime]['lat']), south=min(crimes_dict[crime]['lat']), 
                   west=min(crimes_dict[crime]['lon']), east=max(crimes_dict[crime]['lon'])) 
    
    print bbox 
    geoplotlib.set_bbox(bbox)
    geoplotlib.kde(geo_data_for_plotting, bw = 5, cut_below = 1e-4)
    geoplotlib.inline()
    
#Plots the focus crimes with reference to our focus crime num/string dict
plotCrime(focus_crimes[0])
"""


Out[8]:
"\ndef plotCrime(crime):\n    geo_data_for_plotting = crimes_dict[crime] #Since our lat/long is allready stored in correct format, we simpy insert\n    \n    #bbox defines the boundaries of the map used for plotting\n    bbox = BoundingBox(north=max(crimes_dict[crime]['lat']), south=min(crimes_dict[crime]['lat']), \n                   west=min(crimes_dict[crime]['lon']), east=max(crimes_dict[crime]['lon'])) \n    \n    print bbox \n    geoplotlib.set_bbox(bbox)\n    geoplotlib.kde(geo_data_for_plotting, bw = 5, cut_below = 1e-4)\n    geoplotlib.inline()\n    \n#Plots the focus crimes with reference to our focus crime num/string dict\nplotCrime(focus_crimes[0])\n"

In [9]:
#I will now plot my coordinates to inspect the data visually
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot") #easy style importing

import matplotlib.pylab #For resizing figure
matplotlib.pylab.rcParams['figure.figsize'] = (20.0, 10.0) #Change figure size, laaarge

X = DataFrame(cc_df,columns = ['lon','lat'])
X.plot(kind='scatter', x='lon', y='lat')


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x9bb84a8>

As seen above we have a clear outsider that lies way outside SF, probably a typo. Hence we sort this datapoint out,


In [10]:
X = X[X['lon'] < -122]
X.plot(kind='scatter', x='lon', y='lat')


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x209cbcf8>

The points now all seem to be within SF borders


In [11]:
from sklearn.cluster import KMeans

#To work with out cluster we have to turn our panda dataframe into a numpy array,
np_X = np.array(X)

kmeans = KMeans(n_clusters=2)
kmeans.fit(np_X)

centroid = kmeans.cluster_centers_
labels = kmeans.labels_

print "The %s cluster centers are located at %s " %(len(centroid),centroid)

colors = ["g.","r.","c."]

for i in range(len(np_X)):
   plt.plot(np_X[i][0],np_X[i][1],colors[labels[i]],markersize=10)

plt.scatter(centroid[:,0],centroid[:,1], marker = "x", s=150, linewidths = 5, zorder =10)

plt.show()


The 2 cluster centers are located at [[-122.4178032    37.78740516]
 [-122.41826714   37.7605898 ]] 

I will now look at the total squared error in relation to the number of clusters, to find the ideal knee bend,


In [12]:
from sklearn.cluster import KMeans

#To work with out cluster we have to turn our panda dataframe into a numpy array,
np_X = X

kmeans = KMeans(n_clusters=2)
kmeans.fit(np_X)

centroid = kmeans.cluster_centers_
classified_data = kmeans.labels_
labels = kmeans.labels_

print "The %s cluster centers are located at %s " %(len(centroid),centroid)

classified_data

#copy dataframe (may be memory intensive but just for illustration)
df_processed = X.copy()
df_processed['Cluster Class'] = pd.Series(classified_data, index=df_processed.index)


The 2 cluster centers are located at [[-122.41826714   37.7605898 ]
 [-122.4178032    37.78740516]] 

In [13]:
df_processed.head()


Out[13]:
lon lat Cluster Class
0 -122.427966 37.711823 0
1 -122.427966 37.711823 0
2 -122.427966 37.711823 0
3 -122.422063 37.789920 1
4 -122.422063 37.789920 1

In [14]:



Out[14]:
array([[-122.42796629,   37.71182294,    0.        ],
       [-122.42796629,   37.71182294,    0.        ],
       [-122.42796629,   37.71182294,    0.        ],
       ..., 
       [-122.41607529,   37.78444966,    1.        ],
       [-122.41607529,   37.78444966,    1.        ],
       [-122.41607529,   37.78444966,    1.        ]])

In [15]:


In [16]:
centroid_df = DataFrame(centroid)

centroid_df.head()


Out[16]:
0 1
0 -122.418267 37.760590
1 -122.417803 37.787405

In [17]:
df_processed.plot(kind='scatter', x='lon', y='lat',
                  c = 'Cluster Class', label='datapoints');



In [18]:
"""
import numpy 
import pandas
from  matplotlib import pyplot
import seaborn
seaborn.set(style='ticks')

numpy.random.seed(0)
N = 37
_genders= ['Female', 'Male', 'Non-binary', 'No Response']
df = pandas.DataFrame({
    'Height (cm)': numpy.random.uniform(low=130, high=200, size=N),
    'Weight (kg)': numpy.random.uniform(low=30, high=100, size=N),
    'Gender': numpy.random.choice(_genders, size=N)
})

fg = seaborn.FacetGrid(data=df, hue='Gender', hue_order=_genders, aspect=1.61)
fg.map(pyplot.scatter, 'Weight (kg)', 'Height (cm)').add_legend()

########################################

import seaborn
seaborn.set(style='ticks')

fg = seaborn.FacetGrid(data=df_processed, hue='Cluster Class', hue_order=_classes, aspect=1.61)
fg.map(pyplot.scatter, 'Lat', 'Lon').add_legend()

"""


Out[18]:
"\nimport numpy \nimport pandas\nfrom  matplotlib import pyplot\nimport seaborn\nseaborn.set(style='ticks')\n\nnumpy.random.seed(0)\nN = 37\n_genders= ['Female', 'Male', 'Non-binary', 'No Response']\ndf = pandas.DataFrame({\n    'Height (cm)': numpy.random.uniform(low=130, high=200, size=N),\n    'Weight (kg)': numpy.random.uniform(low=30, high=100, size=N),\n    'Gender': numpy.random.choice(_genders, size=N)\n})\n\nfg = seaborn.FacetGrid(data=df, hue='Gender', hue_order=_genders, aspect=1.61)\nfg.map(pyplot.scatter, 'Weight (kg)', 'Height (cm)').add_legend()\n\n########################################\n\nimport seaborn\nseaborn.set(style='ticks')\n\nfg = seaborn.FacetGrid(data=df_processed, hue='Cluster Class', hue_order=_classes, aspect=1.61)\nfg.map(pyplot.scatter, 'Lat', 'Lon').add_legend()\n\n"

In [19]:
from scipy.spatial import distance

def dist_euc(lon,lat,centroid):
    data_cord = [lon,lat]
    return distance.euclidean(data_cord,centroid)

df_processed['distance'] = df_processed.apply(lambda row: dist_euc(row['lon'], row['lat'],centroid[row['Cluster Class']]), axis=1)


C:\Anaconda2\lib\site-packages\ipykernel\__main__.py:7: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future

In [52]:
df_processed.head()


Out[52]:
lon lat Cluster Class distance
0 -122.427966 37.711823 0 0.049722
1 -122.427966 37.711823 0 0.049722
2 -122.427966 37.711823 0 0.049722
3 -122.422063 37.789920 1 0.004947
4 -122.422063 37.789920 1 0.004947

In [21]:
ksum = []
def get_ksum(k):
    lonList = X['lon'].tolist()
    latList = X['lat'].tolist()
    
    for i in range(1,k):
        kmeans = KMeans(n_clusters=i)
        kmeans.fit(X)
        centroid = kmeans.cluster_centers_
        labels = kmeans.labels_
        
        tmp_sum = 0
        for index, row in enumerate(lonList):
            tmp_sum += dist_euc(lonList[index], latList[index], centroid[labels[index]])
        ksum.append(tmp_sum)
    
get_ksum(10)

print ksum


[231.8306421230425, 84.8721400620188, 65.80886850676222, 62.49634777264666, 57.26662356823066, 55.4103228143857, 53.546599004145904, 44.456528622067616, 42.13499311285233]

In [22]:
#I Transform my data into a Dataframe to do easy and pretty plotting :-) 
ksum_df = DataFrame(ksum, index = range(1,10))

ksum_df.plot()


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x563f7898>

As seen the error drops dramaticly as we move from 1 to 2 clusters. It also drops rather significantly from 2-3, though not anything as much as the prior. The optimal solution would hence be either 2 or 3 clusters.


In [ ]:

CSV exporter for D3 data


In [ ]:
import csv
csv_file = df_processed[['lon','lat','Cluster Class']].values

csv_file

In [ ]:
with open('datapoints.csv','wb') as f:
    w = csv.writer(f)
    w.writerows(csv_file)

In [55]:
df_csv.head()


Out[55]:
lon lat
0 -122.427966 37.711823
1 -122.427966 37.711823
2 -122.427966 37.711823
3 -122.422063 37.789920
4 -122.422063 37.789920

In [76]:
df_csv = X.copy(deep = True)
centroid_list = []
for i in range(1,7):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(X)
    centroid = kmeans.cluster_centers_
    labels = kmeans.labels_
    column = "k%s" %i
    df_csv[column] = labels
    centroid_not_np = centroid.tolist()
    centroid_list.append(centroid_not_np)

In [83]:
df_csv.head()


Out[83]:
lon lat k1 k2 k3 k4 k5 k6
0 -122.427966 37.711823 0 1 2 3 4 4
1 -122.427966 37.711823 0 1 2 3 4 4
2 -122.427966 37.711823 0 1 2 3 4 4
3 -122.422063 37.789920 0 0 0 1 1 1
4 -122.422063 37.789920 0 0 0 1 1 1

In [85]:
centroid_list


Out[85]:
[[[-122.41803041909377, 37.77427215999907]],
 [[-122.41780320059732, 37.78740515602903],
  [-122.4182671423892, 37.76058979633723]],
 [[-122.41779062564557, 37.78760431239461],
  [-122.47831657799604, 37.74518815117978],
  [-122.41564847455233, 37.76155620220855]],
 [[-122.41562007872984, 37.76167688325117],
  [-122.41778584583219, 37.78760571940798],
  [-122.48642314613244, 37.75842178851072],
  [-122.45747184755328, 37.71946982646254]],
 [[-122.41582749287205, 37.761464941182155],
  [-122.41874287370916, 37.78766041272921],
  [-122.48642314613244, 37.75842178851072],
  [-122.40558553542785, 37.78501910504481],
  [-122.45747184755328, 37.71946982646254]],
 [[-122.41597809642941, 37.7617317356573],
  [-122.41874287370916, 37.78766041272921],
  [-122.46212539586402, 37.720497775068125],
  [-122.48647474290841, 37.758516393336016],
  [-122.4040437900258, 37.72773672304826],
  [-122.40558553542785, 37.78501910504481]]]

In [78]:
df_csv.to_csv('csv_clusters.csv', index=False)

In [89]:
with open('centroids.csv','wb') as csvfile:
    w = csv.writer(csvfile,quoting=csv.QUOTE_MINIMAL)
    w.writerows(centroid_list)

In [ ]:


In [ ]: