In [2]:
import os
# !pip install seaborn
import seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# !pip install mpld3
import mpld3
import json
# Show up to 200 columns but at most 10 rows whenever a DataFrame is displayed
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 10)
# Render matplotlib figures inline, inside the notebook
%matplotlib inline
# Let mpld3 handle the notebook display of matplotlib figures
mpld3.enable_notebook()
In [3]:
# Load the crime records, using the first CSV column as the index.
# `DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `read_csv` with `index_col=0, parse_dates=True` reproduces its defaults
# (the index values are parsed as dates where possible).
df = pd.read_csv('../shared-resources/crimedata.csv', index_col=0, parse_dates=True)
df.tail()
Out[3]:
In [4]:
# Reload the same file, this time indexing the rows by CSV column 2
# (presumably the report date -- confirm against the file's header).
# `read_csv(..., parse_dates=True)` replaces the removed `DataFrame.from_csv`
# and, like it, tries to parse the index values as dates.
df = pd.read_csv('../shared-resources/crimedata.csv', index_col=2, parse_dates=True)
df.tail()
Out[4]:
In [5]:
# Now we can retrieve all the crimes on a particular date just by passing a date
# string to `.loc[...]` (this relies on the index holding parsed dates)
df.loc['2014-06-16']
Out[5]:
In [6]:
# What are the types and sizes of each of our columns?
# `info()` prints each column's dtype and non-null count, plus the frame's memory usage
df.info()
In [83]:
# That's a lot of data, so take a sample to keep the plots quick to display.
# A fixed seed makes the sample (and every output derived from it) reproducible,
# and capping the size at len(df) keeps the cell from raising on a smaller dataset.
SAMPLE_SIZE = 8000
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
df.describe()
Out[83]:
In [7]:
# Try to cast the police district labels to integers. The cast raises ValueError
# when a label is not numeric (a later cell filters on a district named "OP"),
# so report the error instead of halting a top-to-bottom run of the notebook.
try:
    district_numbers = df.police_district.astype(int)
except ValueError as err:
    district_numbers = None
    print('ValueError:', err)
district_numbers
In [8]:
# Boolean mask that is True only for crimes logged under the non-numbered "OP" district
is_op_district = df.police_district.eq('OP')
# Preview the first few rows that survive the mask
df[is_op_district].head()
Out[8]:
In [9]:
# Normalize the offense labels (lower-cased, surrounding whitespace removed) so that
# differently written variants of the same offense are counted together
normalized_offenses = df['major_offense_type'].str.lower().str.strip()
df['major_offense_type'] = normalized_offenses
# Tally how often each normalized offense type occurs
df['major_offense_type'].value_counts()
Out[9]:
In [18]:
# Print every distinct offense type on one line, in sorted order, separated by "|"
offense_names = sorted(df.major_offense_type.value_counts().index)
print('|'.join(offense_names))
In [20]:
# Keep only the rows whose offense type is trespass
mask = df.major_offense_type.eq('trespass')
trespasses = df[mask]
# Summary statistics for the trespass rows only
trespasses.describe()
Out[20]:
Looks like gambling is not a popular crime
(or at least not a popular police-reported crime)
In [19]:
# Now let's count up the trespass records (not all crimes) by police district
# Notice any non-integer values among the district labels?
district_counts = trespasses.police_district.value_counts()
district_counts
Out[19]:
In [21]:
# Display the per-district counts held in `district_counts` again
district_counts
Out[21]:
In [22]:
# let's create a dictionary that maps each police district to its count
# (`district_counts` was built from the trespass rows, so these are trespass counts)
dict(zip(district_counts.index, district_counts))
Out[22]:
In [23]:
# Let's talk about dict and zip to make sure you understand the code above.
# Here dict() is handed a list of (key, value) pairs directly.
dict([('a', 0), ('b', 1), ('c', 2), ('d', 3), ('e', 4)])
Out[23]:
In [24]:
# dict will accept any sequence of pairs (2-tuples),
# but what if we have two separate sequences of values that we want to "pair up"?
# zip brings the two sides of each pair together (like the sides of a zipper) in an aligned pairing
list(zip(['a', 'b', 'c', 'd', 'e'], [0, 1, 2, 3, 4]))
Out[24]:
In [25]:
# and a dict can be built from any iterable of pairs, like before... so zip's output works directly
dict(zip(['a', 'b', 'c', 'd', 'e'], [0, 1, 2, 3, 4]))
Out[25]:
In [26]:
# and `range(5)` is itself an iterable (yielding 0, 1, 2, 3, 4), so it can replace the list of ints
dict(zip(['a', 'b', 'c', 'd', 'e'], range(5)))
Out[26]:
In [27]:
# if we get the length wrong, the zipper will get "stuck" on the shortest sequence:
# range(4) runs out after four pairs, so 'e' is left unpaired and dropped
dict(zip(['a', 'b', 'c', 'd', 'e'], range(4)))
Out[27]:
In [28]:
# or, because any sequence will do and a str is a sequence of characters,
# 'abcde' can stand in for the list of single-letter keys:
dict(zip('abcde', range(5)))
Out[28]:
In [29]:
# What will happen if you try to coerce a list of 3-tuples into a dict?
# dict() requires exactly two items per element, so this raises ValueError.
# Catch it and show the message so the rest of the notebook still runs top to bottom.
triples = [('a', 1, 2), ('b', 3, 4)]
try:
    dict(triples)
except ValueError as err:
    print('ValueError:', err)
In [30]:
# Two lists of different lengths to feed to zip
x = [i for i in range(3)]
y = [i for i in range(4)]
print(x, y, sep='\n')
# zip accepts any number of iterables and stops once the shortest one is exhausted
list(zip(x, y, range(5)))
Out[30]:
In [31]:
# Three triples, each repeating one value; the * passes every triple to zip as its own argument
x = [(i,) * 3 for i in range(3)]
list(zip(*x))
Out[31]:
In [32]:
# The same call written out by hand: `zip(*x)` passes each element of x as a separate argument
list(zip(x[0], x[1], x[2]))
Out[32]:
In [34]:
# Copy the index into a regular `report_date` column, converted with pd.to_datetime
df['report_date'] = pd.to_datetime(df.index)
df.report_date
Out[34]:
In [35]:
def fun(x):
    """Return the string form of `x`, i.e. `str(x)`."""
    return str(x)
# Convert every report date to its string form by applying `fun` element-wise
df.report_date.apply(fun)
Out[35]:
In [38]:
# Scatter every crime in `df` by its x/y coordinates, drawn in red.
# `plt` comes from the notebook's opening import cell, so it is not re-imported here.
df.plot(kind='scatter', x='xcoordinate', y='ycoordinate', c='r')
plt.show()
In [39]:
# Standard deviation of each numeric column. `numeric_only=True` is spelled out
# because pandas 2.0+ no longer silently skips non-numeric columns and raises instead.
df.std(numeric_only=True)
Out[39]:
In [40]:
# Same scatter plot, restricted to the trespass rows
trespass_axes = trespasses.plot(x='xcoordinate', y='ycoordinate', kind='scatter')
plt.show()