In [1]:
import csv
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import datetime
import pandas.io.data
%matplotlib inline
In [2]:
# Load the data.
# The file stores the date and the time in separate columns; parse_dates
# joins them and dateparse turns the combined text into one 'datetime' column.
dateparse = lambda x: datetime.datetime.strptime(x, '%m/%d/%Y %H:%M')
# Raw string so every backslash in the Windows path is taken literally
# (in a normal string "\U" starts a unicode escape under Python 3).
df = pd.read_csv(r"C:\Users\Thinkpad\Documents\DTU\Python\week 3\SFCrime_from_1_Jan_2003.csv",
                 parse_dates={'datetime': ['Date', 'Time']}, date_parser=dateparse)
In [3]:
# Show the column labels of the loaded frame.
df.columns
Out[3]:
In [4]:
# Keep only the crime category and the two coordinate columns.
cc_df = DataFrame(data=df, columns=['Category', 'X', 'Y'])
cc_df.head()
Out[4]:
In [5]:
# Drop every crime category except prostitution.
is_prostitution = cc_df['Category'] == 'PROSTITUTION'
cc_df = cc_df[is_prostitution]
cc_df.head()  # the rows still carry their index labels from the full frame
Out[5]:
In [6]:
# Renumber the rows from zero (purely cosmetic) and give the coordinate
# columns descriptive names: X becomes the longitude, Y the latitude.
cc_df = cc_df.reset_index(drop=True).rename(columns={'X': 'lon', 'Y': 'lat'})
cc_df.head()
Out[6]:
In [7]:
# For plotting. Disabled: geoplotlib is not installed (could not locate the pip folder to install it).
# import geoplotlib
# from geoplotlib.utils import BoundingBox
In [8]:
# Disabled geoplotlib experiment. Everything below sits inside a string
# literal, so none of it is executed; as the only expression in the cell the
# string itself is echoed as the cell output.
# NOTE(review): `crimes_dict` and `focus_crimes` are not defined anywhere in
# the visible notebook, so this would not run as-is if re-enabled.
"""
def plotCrime(crime):
geo_data_for_plotting = crimes_dict[crime] #Since our lat/long is allready stored in correct format, we simpy insert
#bbox defines the boundaries of the map used for plotting
bbox = BoundingBox(north=max(crimes_dict[crime]['lat']), south=min(crimes_dict[crime]['lat']),
west=min(crimes_dict[crime]['lon']), east=max(crimes_dict[crime]['lon']))
print bbox
geoplotlib.set_bbox(bbox)
geoplotlib.kde(geo_data_for_plotting, bw = 5, cut_below = 1e-4)
geoplotlib.inline()
#Plots the focus crimes with reference to our focus crime num/string dict
plotCrime(focus_crimes[0])
"""
Out[8]:
In [9]:
# Scatter-plot the raw coordinates to inspect the data visually.
import matplotlib.pyplot as plt
import matplotlib.pylab  # gives access to rcParams for resizing figures
from matplotlib import style

style.use("ggplot")  # ready-made plot styling
matplotlib.pylab.rcParams['figure.figsize'] = (20.0, 10.0)  # large figures

X = DataFrame(data=cc_df, columns=['lon', 'lat'])
X.plot(x='lon', y='lat', kind='scatter')
Out[9]:
As seen above, there is one clear outlier that lies far outside San Francisco, probably a data-entry error. We therefore filter this data point out:
In [10]:
# Keep only the points whose longitude is west of -122 degrees.
west_of_cutoff = X['lon'] < -122
X = X[west_of_cutoff]
X.plot(x='lon', y='lat', kind='scatter')
Out[10]:
All points now appear to lie within the borders of San Francisco.
In [11]:
from sklearn.cluster import KMeans

# Convert the pandas DataFrame into a numpy array so rows can be sliced by
# cluster label below.
np_X = np.array(X)

# Fixed seed: the centroid initialisation is random, so without it the
# cluster numbering (and therefore the colours) can change between runs.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(np_X)
centroid = kmeans.cluster_centers_
labels = kmeans.labels_
# Parenthesised single-argument form prints the same text under Python 2 and 3.
print("The %s cluster centers are located at %s " % (len(centroid), centroid))

# One plot call per cluster instead of one per data point; the markers and
# colours are the same, it is just far fewer matplotlib artists.
colors = ["g.", "r.", "c."]
for cluster_id in range(len(centroid)):
    members = np_X[labels == cluster_id]
    plt.plot(members[:, 0], members[:, 1], colors[cluster_id], markersize=10)
# Draw the centroids on top of the data points.
plt.scatter(centroid[:, 0], centroid[:, 1], marker="x", s=150, linewidths=5, zorder=10)
plt.show()
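I will now look at the total clustering error (the summed distance from each point to its cluster centre) as a function of the number of clusters, in order to find the "knee" of the curve, i.e. the point where adding more clusters stops paying off.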
In [12]:
from sklearn.cluster import KMeans

# The DataFrame is passed to KMeans as-is; np_X is just another name for X.
np_X = X
# Fixed seed so the cluster numbering is the same on every run.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(np_X)
centroid = kmeans.cluster_centers_
classified_data = kmeans.labels_
labels = kmeans.labels_
# Parenthesised single-argument form prints the same text under Python 2 and 3.
print("The %s cluster centers are located at %s " % (len(centroid), centroid))

# Attach the labels to a copy, so X itself keeps only the coordinate columns.
# The Series reuses the copy's index so every label lands on its own row.
df_processed = X.copy()
df_processed['Cluster Class'] = pd.Series(classified_data, index=df_processed.index)
In [13]:
# Preview the first rows: the coordinates plus the 'Cluster Class' label.
df_processed.head()
Out[13]:
In [14]:
Out[14]:
In [15]:
In [16]:
# Tabulate the cluster centres, one row per centroid.
centroid_df = DataFrame(data=centroid)
centroid_df.head()
Out[16]:
In [17]:
# Scatter plot of the points, coloured by the cluster each one was assigned to.
df_processed.plot(x='lon', y='lat', kind='scatter',
                  label='datapoints', c='Cluster Class');
In [18]:
# Disabled seaborn experiment. Everything below sits inside a string literal,
# so none of it is executed; the string is only echoed as the cell output.
# NOTE(review): the second half references `_classes`, which is not defined
# in the visible notebook, and maps columns 'Lat'/'Lon' while df_processed
# is built with lower-case 'lat'/'lon' -- both need fixing before re-enabling.
"""
import numpy
import pandas
from matplotlib import pyplot
import seaborn
seaborn.set(style='ticks')
numpy.random.seed(0)
N = 37
_genders= ['Female', 'Male', 'Non-binary', 'No Response']
df = pandas.DataFrame({
'Height (cm)': numpy.random.uniform(low=130, high=200, size=N),
'Weight (kg)': numpy.random.uniform(low=30, high=100, size=N),
'Gender': numpy.random.choice(_genders, size=N)
})
fg = seaborn.FacetGrid(data=df, hue='Gender', hue_order=_genders, aspect=1.61)
fg.map(pyplot.scatter, 'Weight (kg)', 'Height (cm)').add_legend()
########################################
import seaborn
seaborn.set(style='ticks')
fg = seaborn.FacetGrid(data=df_processed, hue='Cluster Class', hue_order=_classes, aspect=1.61)
fg.map(pyplot.scatter, 'Lat', 'Lon').add_legend()
"""
Out[18]:
In [19]:
from scipy.spatial import distance


def dist_euc(lon, lat, centroid):
    """Return the Euclidean distance between a point and a centroid.

    Parameters
    ----------
    lon, lat : float
        The two coordinates of the point, in that order.
    centroid : sequence of two floats
        Coordinates of the centroid, in the same order as the point.

    Returns
    -------
    float
        Straight-line distance between ``[lon, lat]`` and ``centroid``.
    """
    return distance.euclidean([lon, lat], centroid)
# Distance from every point to the centroid of the cluster it was assigned to.
# The labels are cast to int explicitly because they are used as positions
# into the centroid array, and numpy rejects non-integer indices (a row-wise
# apply over lon/lat/label hands the label back as a float).
cluster_ids = df_processed['Cluster Class'].values.astype(int)
# Row j of centroid[cluster_ids] is the centroid that point j belongs to.
offsets = df_processed[['lon', 'lat']].values - centroid[cluster_ids]
df_processed['distance'] = np.linalg.norm(offsets, axis=1)
In [52]:
# Preview the first rows again, now including the per-point 'distance' column.
df_processed.head()
Out[52]:
In [21]:
ksum = []


def get_ksum(k):
    """Fill the module-level ``ksum`` with one clustering error per cluster count.

    For every cluster count ``i`` in ``range(1, k)`` (``k`` itself is
    excluded), fits ``KMeans(n_clusters=i)`` on ``X`` and records the sum of
    the Euclidean distances between each point and its assigned centroid.

    NOTE(review): this is a sum of plain distances, not of squared distances;
    scikit-learn exposes the squared variant as ``KMeans.inertia_``.

    Parameters
    ----------
    k : int
        Exclusive upper bound on the number of clusters to try.

    Returns
    -------
    list of float
        The same ``ksum`` list that is updated in place.
    """
    # Empty the list first so that calling the function again replaces the
    # previous results instead of appending a second batch to them.
    del ksum[:]
    points = X[['lon', 'lat']].values
    for i in range(1, k):
        model = KMeans(n_clusters=i)
        model.fit(X)
        # Row j of `assigned` is the centroid of the cluster point j belongs to.
        assigned = model.cluster_centers_[model.labels_]
        ksum.append(float(np.linalg.norm(points - assigned, axis=1).sum()))
    return ksum


get_ksum(10)
print(ksum)
In [22]:
#I Transform my data into a Dataframe to do easy and pretty plotting :-)
ksum_df = DataFrame(ksum, index = range(1,10))
ksum_df.plot()
Out[22]:
As seen, the error drops dramatically as we move from 1 to 2 clusters. It also drops noticeably from 2 to 3 clusters, though by much less than in the first step. The best choice is therefore either 2 or 3 clusters.
In [ ]:
In [ ]:
import csv

# Pull the coordinates and the cluster label out as a plain array for export.
export_columns = ['lon', 'lat', 'Cluster Class']
csv_file = df_processed[export_columns].values
csv_file
In [ ]:
# Write the exported rows to datapoints.csv, one data point per line.
with open('datapoints.csv', 'wb') as out_file:
    csv.writer(out_file).writerows(csv_file)
In [55]:
# NOTE(review): df_csv is only created in a later cell of this notebook, so
# on a fresh kernel with "Run All" this line raises NameError. The cell was
# evidently executed out of order; move it below the cell that builds df_csv.
df_csv.head()
Out[55]:
In [76]:
# Cluster the points for every k from 1 to 6. Each point's label for a given
# k is stored in a column named "k<k>" next to its coordinates, and the
# centroids found for each k are collected in centroid_list.
df_csv = X.copy(deep=True)
centroid_list = []
for n_clusters in range(1, 7):
    # Dedicated names: reusing kmeans/centroid/labels here would silently
    # replace the two-cluster results that the distance cell reads.
    export_model = KMeans(n_clusters=n_clusters)
    export_model.fit(X)
    df_csv["k%s" % n_clusters] = export_model.labels_
    # Store plain nested lists rather than a numpy array.
    centroid_list.append(export_model.cluster_centers_.tolist())
In [83]:
# Preview the export frame: coordinates plus one label column per k.
df_csv.head()
Out[83]:
In [85]:
# Show the collected centroids: one entry per k, each a list of coordinate pairs.
centroid_list
Out[85]:
In [78]:
# Save the coordinates and per-k labels; index=False leaves the row index out of the file.
df_csv.to_csv('csv_clusters.csv', index=False)
In [89]:
# Write the centroids to centroids.csv: one line per entry of centroid_list,
# with each centroid of that entry in its own cell.
with open('centroids.csv', 'wb') as out_file:
    centroid_writer = csv.writer(out_file, quoting=csv.QUOTE_MINIMAL)
    centroid_writer.writerows(centroid_list)
In [ ]:
In [ ]: