In [3]:
import pandas as pd

In [4]:
!head -n 1000 violations.csv > small-violations.csv

1. I want to make sure my Plate ID is a string. Can't lose the leading zeroes!


In [5]:
plate_info = {'Plate ID': 'str'}
df = pd.read_csv("small-violations.csv", dtype=plate_info)

In [105]:
df


Out[105]:
Summons Number Plate ID Registration State Plate Type Issue Date Violation Code Vehicle Body Type Vehicle Make Issuing Agency Street Code1 ... Unregistered Vehicle? Vehicle Year Meter Number Feet From Curb Violation Post Code Violation Description No Standing or Stopping Violation Hydrant Violation Double Parking Violation New Issue Date
0 1283294138 GBB9093 NY PAS 08/04/2013 46 SUBN AUDI P 37250 ... 0 2013.0 - 0 NaN NaN NaN NaN NaN 2013-08-04
1 1283294151 62416MB NY COM 08/04/2013 46 VAN FORD P 37290 ... 0 2012.0 - 0 NaN NaN NaN NaN NaN 2013-08-04
2 1283294163 78755JZ NY COM 08/05/2013 46 P-U CHEVR P 37030 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-05
3 1283294175 63009MA NY COM 08/05/2013 46 VAN FORD P 37270 ... 0 2010.0 - 0 NaN NaN NaN NaN NaN 2013-08-05
4 1283294187 91648MC NY COM 08/08/2013 41 TRLR GMC P 37240 ... 0 2012.0 - 0 NaN NaN NaN NaN NaN 2013-08-08
5 1283294217 T60DAR NJ PAS 08/11/2013 14 P-U DODGE P 37250 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-11
6 1283294229 GCR2838 NY PAS 08/11/2013 14 VAN NaN P 37250 ... 0 2011.0 - 0 NaN NaN NaN NaN NaN 2013-08-11
7 1283983620 XZ764G NJ PAS 08/07/2013 24 DELV FORD X 63430 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-07
8 1283983631 GBH9379 NY PAS 08/07/2013 24 SDN TOYOT X 63430 ... 0 2001.0 - 0 NaN NaN NaN NaN NaN 2013-08-07
9 1283983667 MCL78B NJ PAS 07/18/2013 24 SDN SUBAR H 0 ... 0 2005.0 - 0 NaN NaN NaN NaN NaN 2013-07-18
10 1283983679 M367CN NY PAS 07/18/2013 24 SDN HYUND H 0 ... 0 2010.0 - 0 NaN NaN NaN NaN NaN 2013-07-18
11 1283983734 GAR6813 NY PAS 07/18/2013 24 SDN TOYOT H 0 ... 0 1998.0 - 0 NaN NaN NaN NaN NaN 2013-07-18
12 1283983771 GEN8674 NY PAS 07/31/2013 24 SDN AUDI X 0 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-07-31
13 1283983825 GAC2703 NY PAS 08/12/2013 24 SDN NISSA X 23230 ... 0 2012.0 - 0 NaN NaN NaN NaN NaN 2013-08-12
14 1286036800 40793JY NY COM 07/05/2013 14 VAN CHEVR P 34190 ... 0 1995.0 - 0 NaN NaN NaN NaN NaN 2013-07-05
15 1286123550 GAD1485 NY PAS 08/12/2013 20 SDN VOLKS T 28930 ... 0 2012.0 - 0 NaN NaN NaN NaN NaN 2013-08-12
16 1286246398 GFC5338 NY PAS 07/26/2013 14 SDN TOYOT T 0 ... 0 2010.0 - 0 NaN NaN NaN NaN NaN 2013-07-26
17 1286246416 815M342 MD PAS 07/30/2013 20 SUBN SATUR T 0 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-07-30
18 1286248000 GJA3452 NY PAS 07/23/2013 14 SDN KIA T 73690 ... 0 2013.0 - 0 NaN NaN NaN NaN NaN 2013-07-23
19 1286282330 YZY6476 NC PAS 07/29/2013 20 SDN NISSA T 32030 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-07-29
20 1286282342 WBJ819 LA PAS 08/07/2013 17 SUBN HONDA T 0 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-07
21 1286289841 GAV9235 NY PAS 07/20/2013 50 SDN HONDA T 0 ... 0 2012.0 - 0 NaN NaN NaN NaN NaN 2013-07-20
22 1286654920 ZTR66R NJ PAS 07/18/2013 50 SDN N/S T 0 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-07-18
23 1286799648 GDE3973 NY PAS 07/20/2013 40 SDN TOYOT T 0 ... 0 2003.0 - 9 NaN NaN NaN NaN NaN 2013-07-20
24 1286807062 P193871 IL PAS 08/10/2013 40 SDN SCIO T 26630 ... 0 NaN - 6 NaN NaN NaN NaN NaN 2013-08-10
25 1286807475 GGL6608 NY PAS 07/26/2013 40 SDN NISSA T 73650 ... 0 2013.0 - 1 NaN NaN NaN NaN NaN 2013-07-26
26 1286807633 84301JU NY COM 07/19/2013 14 P-U DODGE X 63030 ... 0 2001.0 - 0 NaN NaN NaN NaN NaN 2013-07-19
27 1286808807 HHG3545 PA PAS 08/06/2013 20 SDN TOYOT T 21130 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-06
28 1287320491 50097JX NY COM 08/07/2013 40 VAN MERCU T 23190 ... 0 2003.0 - 0 NaN NaN NaN NaN NaN 2013-08-07
29 1287320570 GEJ9154 NY PAS 08/06/2013 51 VAN TOYOT X 93230 ... 0 2013.0 - 0 NaN NaN NaN NaN NaN 2013-08-06
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
969 1334886817 T600324C NY SRF 07/12/2013 17 VAN FORD X 37290 ... 0 2010.0 - 0 NaN NaN NaN NaN NaN 2013-07-12
970 1334893391 GEF8039 NY PAS 08/08/2013 14 SDN HONDA X 0 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-08
971 1334899332 GAR5973 NY PAS 05/28/2013 20 SUBN FORD H 24240 ... 0 2004.0 - 0 NaN NaN NaN NaN NaN 2013-05-28
972 1334899356 76860 NY MED 07/11/2013 20 SUBN TOYOT H 24240 ... 0 2005.0 - 0 NaN NaN NaN NaN NaN 2013-07-11
973 1334900190 G44DBB NJ PAS 06/06/2013 20 SDN FORD H 24240 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-06-06
974 1334960409 GCP7978 NY PAS 07/16/2013 20 SDN ME/BE P 31820 ... 0 2011.0 - 0 NaN NaN NaN NaN NaN 2013-07-16
975 1334965547 T606011C NY SRF 08/18/2013 40 SDN LINCO P 68720 ... 0 2005.0 - 0 NaN NaN NaN NaN NaN 2013-08-18
976 1335085889 GEB4077 NY PAS 08/05/2013 24 SUBN BMW P 58730 ... 0 2001.0 - 0 NaN NaN NaN NaN NaN 2013-08-05
977 1335086067 408861 RI PAS 08/04/2013 24 SDN MAZDA P 10810 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-04
978 1335087308 505GSB OK PAS 08/21/2013 20 SDN NISSA P 63030 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-21
979 1335087436 GGF7010 NY PAS 08/14/2013 24 SUBN JEEP P 69230 ... 0 1995.0 - 0 NaN NaN NaN NaN NaN 2013-08-14
980 1335087618 XH208F 99 COM 07/27/2013 20 VAN FORD P 69230 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-07-27
981 1335087930 GGB3579 NY PAS 07/27/2013 41 SDN HYUND P 73110 ... 0 2013.0 - 0 NaN NaN NaN NaN NaN 2013-07-27
982 1335089433 49SD67 NY MOT 08/05/2013 51 MC YAMAH P 48830 ... 0 2005.0 - 0 NaN NaN NaN NaN NaN 2013-08-05
983 1335089445 GDM5570 NY PAS 08/20/2013 20 SUBN FORD P 95030 ... 0 2004.0 - 0 NaN NaN NaN NaN NaN 2013-08-20
984 1335089731 55RG48 NY MOT 08/17/2013 51 MOTO HONDA P 28430 ... 0 2003.0 - 0 NaN NaN NaN NaN NaN 2013-08-17
985 1335090538 GDF7589 NY PAS 07/21/2013 24 SUBN BMW P 0 ... 0 2005.0 - 0 NaN NaN NaN NaN NaN 2013-07-21
986 1335091440 PC613253 99 PAS 07/27/2013 24 SUBN CHEVR P 39930 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-07-27
987 1335092730 82GN94 NY MOT 08/17/2013 51 MOTO KAWAS P 34630 ... 0 2008.0 - 0 NaN NaN NaN NaN NaN 2013-08-17
988 1335092742 47RE79 NY MOT 08/17/2013 51 MOTO KAWAS P 34630 ... 0 2008.0 - 0 NaN NaN NaN NaN NaN 2013-08-17
989 1335092754 78GX62 NY MOT 08/17/2013 51 MOTO KAWAS P 34630 ... 0 2009.0 - 0 NaN NaN NaN NaN NaN 2013-08-17
990 1335092766 49SD33 NY MOT 08/20/2013 51 MOTO KAWAS P 23930 ... 0 2001.0 - 0 NaN NaN NaN NaN NaN 2013-08-20
991 1335092778 GDU8731 NY PAS 08/21/2013 24 SDN INFIN P 0 ... 0 2013.0 - 0 NaN NaN NaN NaN NaN 2013-08-21
992 1335093072 GEN7535 NY PAS 07/23/2013 24 SDN NISSA P 56230 ... 0 1997.0 - 0 NaN NaN NaN NaN NaN 2013-07-23
993 1335093199 T529952C NY SRF 08/01/2013 50 SDN LINCO P 0 ... 0 2006.0 - 0 NaN NaN NaN NaN NaN 2013-08-01
994 1335093205 GBJ4073 NY PAS 08/01/2013 14 SUBN FORD P 56230 ... 0 2004.0 - 0 NaN NaN NaN NaN NaN 2013-08-01
995 1335095342 JHA1050 PA PAS 07/21/2013 40 SUBN GMC P 19830 ... 0 NaN - 4 NaN NaN NaN NaN NaN 2013-07-21
996 1335096139 72416R VA PAS 08/11/2013 24 SUBN ME/BE P 34330 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-11
997 1335096140 HPJ538 MS PAS 08/11/2013 24 SDN HONDA P 34330 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-11
998 1335096152 HYW9707 PA PAS 08/19/2013 24 SUBN VOLKS P 34330 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-19

999 rows × 44 columns


In [6]:
df.head()


Out[6]:
Summons Number Plate ID Registration State Plate Type Issue Date Violation Code Vehicle Body Type Vehicle Make Issuing Agency Street Code1 ... Vehicle Color Unregistered Vehicle? Vehicle Year Meter Number Feet From Curb Violation Post Code Violation Description No Standing or Stopping Violation Hydrant Violation Double Parking Violation
0 1283294138 GBB9093 NY PAS 08/04/2013 46 SUBN AUDI P 37250 ... GY 0 2013 - 0 NaN NaN NaN NaN NaN
1 1283294151 62416MB NY COM 08/04/2013 46 VAN FORD P 37290 ... WH 0 2012 - 0 NaN NaN NaN NaN NaN
2 1283294163 78755JZ NY COM 08/05/2013 46 P-U CHEVR P 37030 ... NaN 0 0 - 0 NaN NaN NaN NaN NaN
3 1283294175 63009MA NY COM 08/05/2013 46 VAN FORD P 37270 ... WH 0 2010 - 0 NaN NaN NaN NaN NaN
4 1283294187 91648MC NY COM 08/08/2013 41 TRLR GMC P 37240 ... BR 0 2012 - 0 NaN NaN NaN NaN NaN

5 rows × 43 columns


In [107]:
df.head(10)


Out[107]:
Summons Number Plate ID Registration State Plate Type Issue Date Violation Code Vehicle Body Type Vehicle Make Issuing Agency Street Code1 ... Unregistered Vehicle? Vehicle Year Meter Number Feet From Curb Violation Post Code Violation Description No Standing or Stopping Violation Hydrant Violation Double Parking Violation New Issue Date
0 1283294138 GBB9093 NY PAS 08/04/2013 46 SUBN AUDI P 37250 ... 0 2013.0 - 0 NaN NaN NaN NaN NaN 2013-08-04
1 1283294151 62416MB NY COM 08/04/2013 46 VAN FORD P 37290 ... 0 2012.0 - 0 NaN NaN NaN NaN NaN 2013-08-04
2 1283294163 78755JZ NY COM 08/05/2013 46 P-U CHEVR P 37030 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-05
3 1283294175 63009MA NY COM 08/05/2013 46 VAN FORD P 37270 ... 0 2010.0 - 0 NaN NaN NaN NaN NaN 2013-08-05
4 1283294187 91648MC NY COM 08/08/2013 41 TRLR GMC P 37240 ... 0 2012.0 - 0 NaN NaN NaN NaN NaN 2013-08-08
5 1283294217 T60DAR NJ PAS 08/11/2013 14 P-U DODGE P 37250 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-11
6 1283294229 GCR2838 NY PAS 08/11/2013 14 VAN NaN P 37250 ... 0 2011.0 - 0 NaN NaN NaN NaN NaN 2013-08-11
7 1283983620 XZ764G NJ PAS 08/07/2013 24 DELV FORD X 63430 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-07
8 1283983631 GBH9379 NY PAS 08/07/2013 24 SDN TOYOT X 63430 ... 0 2001.0 - 0 NaN NaN NaN NaN NaN 2013-08-07
9 1283983667 MCL78B NJ PAS 07/18/2013 24 SDN SUBAR H 0 ... 0 2005.0 - 0 NaN NaN NaN NaN NaN 2013-07-18

10 rows × 44 columns


In [106]:
df.tail()


Out[106]:
Summons Number Plate ID Registration State Plate Type Issue Date Violation Code Vehicle Body Type Vehicle Make Issuing Agency Street Code1 ... Unregistered Vehicle? Vehicle Year Meter Number Feet From Curb Violation Post Code Violation Description No Standing or Stopping Violation Hydrant Violation Double Parking Violation New Issue Date
994 1335093205 GBJ4073 NY PAS 08/01/2013 14 SUBN FORD P 56230 ... 0 2004.0 - 0 NaN NaN NaN NaN NaN 2013-08-01
995 1335095342 JHA1050 PA PAS 07/21/2013 40 SUBN GMC P 19830 ... 0 NaN - 4 NaN NaN NaN NaN NaN 2013-07-21
996 1335096139 72416R VA PAS 08/11/2013 24 SUBN ME/BE P 34330 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-11
997 1335096140 HPJ538 MS PAS 08/11/2013 24 SDN HONDA P 34330 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-11
998 1335096152 HYW9707 PA PAS 08/19/2013 24 SUBN VOLKS P 34330 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-19

5 rows × 44 columns

2. I don't think anyone's car was built in 0AD. Discard the '0's as NaN.


In [7]:
plate_info = {'Plate ID': 'str'}
df = pd.read_csv("small-violations.csv", dtype=plate_info, na_values={'Vehicle Year': '0', 'Date First Observed': '0'})
df.head()


Out[7]:
Summons Number Plate ID Registration State Plate Type Issue Date Violation Code Vehicle Body Type Vehicle Make Issuing Agency Street Code1 ... Vehicle Color Unregistered Vehicle? Vehicle Year Meter Number Feet From Curb Violation Post Code Violation Description No Standing or Stopping Violation Hydrant Violation Double Parking Violation
0 1283294138 GBB9093 NY PAS 08/04/2013 46 SUBN AUDI P 37250 ... GY 0 2013.0 - 0 NaN NaN NaN NaN NaN
1 1283294151 62416MB NY COM 08/04/2013 46 VAN FORD P 37290 ... WH 0 2012.0 - 0 NaN NaN NaN NaN NaN
2 1283294163 78755JZ NY COM 08/05/2013 46 P-U CHEVR P 37030 ... NaN 0 NaN - 0 NaN NaN NaN NaN NaN
3 1283294175 63009MA NY COM 08/05/2013 46 VAN FORD P 37270 ... WH 0 2010.0 - 0 NaN NaN NaN NaN NaN
4 1283294187 91648MC NY COM 08/08/2013 41 TRLR GMC P 37240 ... BR 0 2012.0 - 0 NaN NaN NaN NaN NaN

5 rows × 43 columns

3. I want the dates to be dates! Read the read_csv documentation to find out how to make pandas automatically parse dates.


In [19]:
import dateutil
def date_to_date(date):
        date = str(date)
        parsed_date = dateutil.parser.parse(date)
        return parsed_date

In [20]:
df.columns


Out[20]:
Index(['Summons Number', 'Plate ID', 'Registration State', 'Plate Type',
       'Issue Date', 'Violation Code', 'Vehicle Body Type', 'Vehicle Make',
       'Issuing Agency', 'Street Code1', 'Street Code2', 'Street Code3',
       'Vehicle Expiration Date', 'Violation Location', 'Violation Precinct',
       'Issuer Precinct', 'Issuer Code', 'Issuer Command', 'Issuer Squad',
       'Violation Time', 'Time First Observed', 'Violation County',
       'Violation In Front Of Or Opposite', 'House Number', 'Street Name',
       'Intersecting Street', 'Date First Observed', 'Law Section',
       'Sub Division', 'Violation Legal Code', 'Days Parking In Effect    ',
       'From Hours In Effect', 'To Hours In Effect', 'Vehicle Color',
       'Unregistered Vehicle?', 'Vehicle Year', 'Meter Number',
       'Feet From Curb', 'Violation Post Code', 'Violation Description',
       'No Standing or Stopping Violation', 'Hydrant Violation',
       'Double Parking Violation'],
      dtype='object')

In [21]:
df['New Issue Date']= df['Issue Date'].apply(date_to_date)

In [22]:
import datetime
def convert_to_time(time):
    try:
        str_time = str(time)
        return datetime.datetime.strptime(str_time, "%Y%m%d")
    except:
        return None

In [23]:
other_df = df[df['Vehicle Expiration Date'] != 0]
other_df.head()


Out[23]:
Summons Number Plate ID Registration State Plate Type Issue Date Violation Code Vehicle Body Type Vehicle Make Issuing Agency Street Code1 ... Unregistered Vehicle? Vehicle Year Meter Number Feet From Curb Violation Post Code Violation Description No Standing or Stopping Violation Hydrant Violation Double Parking Violation New Issue Date
0 1283294138 GBB9093 NY PAS 08/04/2013 46 SUBN AUDI P 37250 ... 0 2013.0 - 0 NaN NaN NaN NaN NaN 2013-08-04
1 1283294151 62416MB NY COM 08/04/2013 46 VAN FORD P 37290 ... 0 2012.0 - 0 NaN NaN NaN NaN NaN 2013-08-04
2 1283294163 78755JZ NY COM 08/05/2013 46 P-U CHEVR P 37030 ... 0 NaN - 0 NaN NaN NaN NaN NaN 2013-08-05
3 1283294175 63009MA NY COM 08/05/2013 46 VAN FORD P 37270 ... 0 2010.0 - 0 NaN NaN NaN NaN NaN 2013-08-05
6 1283294229 GCR2838 NY PAS 08/11/2013 14 VAN NaN P 37250 ... 0 2011.0 - 0 NaN NaN NaN NaN NaN 2013-08-11

5 rows × 44 columns


In [24]:
other_df['New Vehicle Expiration Date']= other_df['Vehicle Expiration Date'].apply(convert_to_time)


/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [26]:
other_df.head()


Out[26]:
Summons Number Plate ID Registration State Plate Type Issue Date Violation Code Vehicle Body Type Vehicle Make Issuing Agency Street Code1 ... Vehicle Year Meter Number Feet From Curb Violation Post Code Violation Description No Standing or Stopping Violation Hydrant Violation Double Parking Violation New Issue Date New Vehicle Expiration Date
0 1283294138 GBB9093 NY PAS 08/04/2013 46 SUBN AUDI P 37250 ... 2013.0 - 0 NaN NaN NaN NaN NaN 2013-08-04 2014-08-31
1 1283294151 62416MB NY COM 08/04/2013 46 VAN FORD P 37290 ... 2012.0 - 0 NaN NaN NaN NaN NaN 2013-08-04 2014-04-30
2 1283294163 78755JZ NY COM 08/05/2013 46 P-U CHEVR P 37030 ... NaN - 0 NaN NaN NaN NaN NaN 2013-08-05 2014-02-28
3 1283294175 63009MA NY COM 08/05/2013 46 VAN FORD P 37270 ... 2010.0 - 0 NaN NaN NaN NaN NaN 2013-08-05 2014-10-31
6 1283294229 GCR2838 NY PAS 08/11/2013 14 VAN NaN P 37250 ... 2011.0 - 0 NaN NaN NaN NaN NaN 2013-08-11 2014-12-23

5 rows × 45 columns

4. "Date first observed" is a pretty weird column, but it seems like it has a date hiding inside. Using a function with .apply, transform the string (e.g. "20140324") into a Python date. Make the 0's show up as NaN.


In [27]:
other_df.columns


Out[27]:
Index(['Summons Number', 'Plate ID', 'Registration State', 'Plate Type',
       'Issue Date', 'Violation Code', 'Vehicle Body Type', 'Vehicle Make',
       'Issuing Agency', 'Street Code1', 'Street Code2', 'Street Code3',
       'Vehicle Expiration Date', 'Violation Location', 'Violation Precinct',
       'Issuer Precinct', 'Issuer Code', 'Issuer Command', 'Issuer Squad',
       'Violation Time', 'Time First Observed', 'Violation County',
       'Violation In Front Of Or Opposite', 'House Number', 'Street Name',
       'Intersecting Street', 'Date First Observed', 'Law Section',
       'Sub Division', 'Violation Legal Code', 'Days Parking In Effect    ',
       'From Hours In Effect', 'To Hours In Effect', 'Vehicle Color',
       'Unregistered Vehicle?', 'Vehicle Year', 'Meter Number',
       'Feet From Curb', 'Violation Post Code', 'Violation Description',
       'No Standing or Stopping Violation', 'Hydrant Violation',
       'Double Parking Violation', 'New Issue Date',
       'New Vehicle Expiration Date'],
      dtype='object')

In [28]:
other_df['Date First Observed'].dtypes


Out[28]:
dtype('float64')

In [29]:
other_df['Date First Observed'].tail()


Out[29]:
990   NaN
991   NaN
993   NaN
994   NaN
995   NaN
Name: Date First Observed, dtype: float64

In [30]:
import dateutil

In [31]:
other_df['Date First Observed']


Out[31]:
0             NaN
1             NaN
2             NaN
3             NaN
6             NaN
8             NaN
10            NaN
11            NaN
12            NaN
13            NaN
15            NaN
16            NaN
17            NaN
18            NaN
19            NaN
20            NaN
21            NaN
23            NaN
25            NaN
26            NaN
28            NaN
29            NaN
35            NaN
36            NaN
38            NaN
39            NaN
41            NaN
42            NaN
45            NaN
47            NaN
          ...    
962           NaN
963           NaN
964           NaN
965           NaN
966           NaN
967           NaN
968           NaN
969           NaN
971           NaN
972           NaN
974           NaN
975           NaN
976           NaN
977           NaN
978           NaN
979           NaN
980    20130727.0
981           NaN
983           NaN
984    20130817.0
985           NaN
986           NaN
987           NaN
988    20130817.0
989           NaN
990           NaN
991           NaN
993           NaN
994           NaN
995           NaN
Name: Date First Observed, dtype: float64

In [32]:
other_df['Violation Time'].head()


Out[32]:
0    0752A
1    1240P
2    1243P
3    0232P
6    0741P
Name: Violation Time, dtype: object

In [33]:
other_df['Violation Time'].tail()


Out[33]:
990    0740P
991    0120A
993    1205A
994    0950P
995    0231P
Name: Violation Time, dtype: object

In [35]:
def int_to_date(integer):
    if not pd.isnull(integer):
        date = str(int(integer))
        parsed_date = dateutil.parser.parse(date)
        return parsed_date.strftime("%Y-%-m-%d")

In [36]:
other_df['Date First Observed'].apply(int_to_date)


Out[36]:
0           None
1           None
2           None
3           None
6           None
8           None
10          None
11          None
12          None
13          None
15          None
16          None
17          None
18          None
19          None
20          None
21          None
23          None
25          None
26          None
28          None
29          None
35          None
36          None
38          None
39          None
41          None
42          None
45          None
47          None
         ...    
962         None
963         None
964         None
965         None
966         None
967         None
968         None
969         None
971         None
972         None
974         None
975         None
976         None
977         None
978         None
979         None
980    2013-7-27
981         None
983         None
984    2013-8-17
985         None
986         None
987         None
988    2013-8-17
989         None
990         None
991         None
993         None
994         None
995         None
Name: Date First Observed, dtype: object

5. "Violation time" is... not a time. Make it a time.


In [37]:
def violation_time_to_time(time):
    try:
        hour = time[0:2]
        minutes = time[2:4]
        am_pm= time[4]
        regular_time= hour + ":" + minutes + " " + am_pm + 'm'
        violation_time_fixed = dateutil.parser.parse(regular_time)
        return violation_time_fixed.strftime("%H:%M%p")
    except:
        return None

In [38]:
other_df['Violation Time'].apply(violation_time_to_time)


Out[38]:
0      07:52AM
1      12:40PM
2      12:43PM
3      14:32PM
6      19:41PM
8      04:37AM
10     08:45AM
11     09:07AM
12     17:14PM
13     18:56PM
15     17:46PM
16     11:42AM
17     07:24AM
18     07:58AM
19     07:36AM
20     08:47AM
21     11:20AM
23     15:24PM
25     07:43AM
26     08:50AM
28     10:23AM
29     11:20AM
35     13:05PM
36     16:50PM
38     19:30PM
39     08:45AM
41     09:27AM
42     11:19AM
45     19:42PM
47     11:35AM
        ...   
962    09:42AM
963    11:31AM
964    12:38PM
965    12:07PM
966    15:51PM
967    14:05PM
968    11:49AM
969    14:28PM
971    10:59AM
972    10:50AM
974    08:55AM
975    19:30PM
976    13:21PM
977    13:15PM
978    21:50PM
979    01:53AM
980    15:50PM
981    21:53PM
983    20:10PM
984    20:40PM
985    20:45PM
986    21:25PM
987    19:00PM
988    19:00PM
989    19:00PM
990    19:40PM
991    01:20AM
993    00:05AM
994    21:50PM
995    14:31PM
Name: Violation Time, dtype: object

6. There sure are a lot of colors of cars, too bad so many of them are the same. Make "BLK" and "BLACK", "WT" and "WHITE", and any other combinations that you notice.


In [39]:
other_df['Vehicle Color'].value_counts()


Out[39]:
WHITE    85
BLACK    75
BLUE     53
GRAY     48
GREY     36
GY       36
BLK      33
SILVE    29
GREEN    29
BK       27
RED      24
WH       23
WHT      21
TAN      16
GOLD     14
BL       14
YELLO    11
GRY      10
SILVR     6
BROWN     4
G         3
MAROO     3
GR        3
BR        2
BEIGE     2
GL        2
GN        2
W         2
RD        2
TN        2
BIRG      1
BURGA     1
SIVLE     1
OR        1
YW        1
BRW       1
PURPL     1
BLU       1
BURG      1
G/Y       1
BRWN      1
YELL      1
RUST      1
PUR       1
Name: Vehicle Color, dtype: int64

In [41]:
def color_rename(color):
    if (color == 'BLACK') or (color == 'BLK') or (color == 'BK'):
        return 'BLACK'
    elif (color == 'WHITE') or (color == 'WHT') or (color == 'WH') or (color == 'W'):
        return 'WHITE'

other_df['Vehicle Color'].apply(color_rename)


Out[41]:
0       None
1      WHITE
2       None
3      WHITE
6       None
8      WHITE
10      None
11      None
12      None
13     WHITE
15     WHITE
16      None
17      None
18     WHITE
19     WHITE
20      None
21      None
23      None
25     WHITE
26     WHITE
28      None
29      None
35      None
36     WHITE
38      None
39      None
41      None
42      None
45     BLACK
47     BLACK
       ...  
962     None
963     None
964     None
965    BLACK
966     None
967     None
968     None
969    WHITE
971     None
972    BLACK
974     None
975     None
976     None
977     None
978     None
979    BLACK
980    WHITE
981    BLACK
983     None
984     None
985     None
986     None
987     None
988     None
989     None
990     None
991    BLACK
993     None
994     None
995    WHITE
Name: Vehicle Color, dtype: object

7. Join the data with the Parking Violations Code dataset from the NYC Open Data site.


In [42]:
parking_violations_df = pd.read_csv("DOF_Parking_Violation_Codes.csv", encoding="mac_roman", error_bad_lines=False)
parking_violations_df.head()


Out[42]:
CODE DEFINITION Manhattan  96th St. & below All Other Areas
0 10 Stopping, standing or parking where a sign, st... $115 $115
1 11 Hotel Loading/Unloading: Standing or parking w... $115 $115
2 12 Snow Emergency: Standing or parking where stan... $95 $95
3 13 Taxi Stand: Standing or parking where standing... $115 $115
4 14 General No Standing: Standing or parking where... $115 $115

In [43]:
parking_violations_df['CODE'].describe()


Out[43]:
count     84
unique    84
top       52
freq       1
Name: CODE, dtype: object

In [44]:
other_df['Violation Code'].describe()


Out[44]:
count    644.000000
mean      26.279503
std       16.009605
min       14.000000
25%       19.000000
50%       21.000000
75%       21.000000
max       99.000000
Name: Violation Code, dtype: float64

In [57]:
def convert_to_str(n):
    return str(n)

In [58]:
parking_violations_df['Code'] = parking_violations_df['CODE'].apply(convert_to_str)

In [59]:
other_df['Violation code'] = other_df['Violation Code'].apply(convert_to_str)


/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [60]:
parking_violations_df.head()


Out[60]:
CODE DEFINITION Manhattan  96th St. & below All Other Areas Code
0 10 Stopping, standing or parking where a sign, st... $115 $115 10
1 11 Hotel Loading/Unloading: Standing or parking w... $115 $115 11
2 12 Snow Emergency: Standing or parking where stan... $95 $95 12
3 13 Taxi Stand: Standing or parking where standing... $115 $115 13
4 14 General No Standing: Standing or parking where... $115 $115 14

In [61]:
updated_parking_violations_df = parking_violations_df.rename(columns={'Manhattan  96th St. & below': 'Manhattan 96th & below', 'All Other Areas': 'All other areas'})
updated_parking_violations_df.head()


Out[61]:
CODE DEFINITION Manhattan 96th & below All other areas Code
0 10 Stopping, standing or parking where a sign, st... $115 $115 10
1 11 Hotel Loading/Unloading: Standing or parking w... $115 $115 11
2 12 Snow Emergency: Standing or parking where stan... $95 $95 12
3 13 Taxi Stand: Standing or parking where standing... $115 $115 13
4 14 General No Standing: Standing or parking where... $115 $115 14

In [62]:
other_df.head()


Out[62]:
Summons Number Plate ID Registration State Plate Type Issue Date Violation Code Vehicle Body Type Vehicle Make Issuing Agency Street Code1 ... Meter Number Feet From Curb Violation Post Code Violation Description No Standing or Stopping Violation Hydrant Violation Double Parking Violation New Issue Date New Vehicle Expiration Date Violation code
0 1283294138 GBB9093 NY PAS 08/04/2013 46 SUBN AUDI P 37250 ... - 0 NaN NaN NaN NaN NaN 2013-08-04 2014-08-31 46
1 1283294151 62416MB NY COM 08/04/2013 46 VAN FORD P 37290 ... - 0 NaN NaN NaN NaN NaN 2013-08-04 2014-04-30 46
2 1283294163 78755JZ NY COM 08/05/2013 46 P-U CHEVR P 37030 ... - 0 NaN NaN NaN NaN NaN 2013-08-05 2014-02-28 46
3 1283294175 63009MA NY COM 08/05/2013 46 VAN FORD P 37270 ... - 0 NaN NaN NaN NaN NaN 2013-08-05 2014-10-31 46
6 1283294229 GCR2838 NY PAS 08/11/2013 14 VAN NaN P 37250 ... - 0 NaN NaN NaN NaN NaN 2013-08-11 2014-12-23 14

5 rows × 46 columns


In [63]:
diff_violations_df = pd.merge(other_df, updated_parking_violations_df, left_on='Violation code', right_on='Code')
diff_violations_df.head()


Out[63]:
Summons Number Plate ID Registration State Plate Type Issue Date Violation Code Vehicle Body Type Vehicle Make Issuing Agency Street Code1 ... Hydrant Violation Double Parking Violation New Issue Date New Vehicle Expiration Date Violation code CODE DEFINITION Manhattan 96th & below All other areas Code
0 1283294138 GBB9093 NY PAS 08/04/2013 46 SUBN AUDI P 37250 ... NaN NaN 2013-08-04 2014-08-31 46 46 Standing or parking on the roadway side of a v... $115 $115 46
1 1283294151 62416MB NY COM 08/04/2013 46 VAN FORD P 37290 ... NaN NaN 2013-08-04 2014-04-30 46 46 Standing or parking on the roadway side of a v... $115 $115 46
2 1283294163 78755JZ NY COM 08/05/2013 46 P-U CHEVR P 37030 ... NaN NaN 2013-08-05 2014-02-28 46 46 Standing or parking on the roadway side of a v... $115 $115 46
3 1283294175 63009MA NY COM 08/05/2013 46 VAN FORD P 37270 ... NaN NaN 2013-08-05 2014-10-31 46 46 Standing or parking on the roadway side of a v... $115 $115 46
4 1290834374 42587JW NY COM 07/30/2013 46 DELV FORD P 65590 ... NaN NaN 2013-07-30 2014-12-19 46 46 Standing or parking on the roadway side of a v... $115 $115 46

5 rows × 51 columns

8. How much money did NYC make off of parking violations?


In [64]:
diff_violations_df['Manhattan 96th & below'].describe()


Out[64]:
count      641
unique       8
top       $65 
freq       344
Name: Manhattan 96th & below, dtype: object

In [65]:
diff_violations_df['All other areas'].describe()


Out[65]:
count      641
unique       9
top       $45 
freq       246
Name: All other areas, dtype: object

In [66]:
diff_violations_df['Manhattan 96th & below'].apply(convert_to_str).head()


Out[66]:
0    $115 
1    $115 
2    $115 
3    $115 
4    $115 
Name: Manhattan 96th & below, dtype: object

In [68]:
diff_violations_df['All other areas'].apply(convert_to_str).head()


Out[68]:
0    $115 
1    $115 
2    $115 
3    $115 
4    $115 
Name: All other areas, dtype: object

In [69]:
diff_violations_df = new_violations_df[new_violations_df['Manhattan 96th & below'] != 'vary']
diff_violations_df.head()


Out[69]:
Summons Number Plate ID Registration State Plate Type Issue Date Violation Code Vehicle Body Type Vehicle Make Issuing Agency Street Code1 ... Hydrant Violation Double Parking Violation New Issue Date New Vehicle Expiration Date Violation code CODE DEFINITION Manhattan 96th & below All other areas Code
0 1283294138 GBB9093 NY PAS 08/04/2013 46 SUBN AUDI P 37250 ... NaN NaN 2013-08-04 2014-08-31 46 46 Standing or parking on the roadway side of a v... $115 $115 46
1 1283294151 62416MB NY COM 08/04/2013 46 VAN FORD P 37290 ... NaN NaN 2013-08-04 2014-04-30 46 46 Standing or parking on the roadway side of a v... $115 $115 46
2 1283294163 78755JZ NY COM 08/05/2013 46 P-U CHEVR P 37030 ... NaN NaN 2013-08-05 2014-02-28 46 46 Standing or parking on the roadway side of a v... $115 $115 46
3 1283294175 63009MA NY COM 08/05/2013 46 VAN FORD P 37270 ... NaN NaN 2013-08-05 2014-10-31 46 46 Standing or parking on the roadway side of a v... $115 $115 46
4 1290834374 42587JW NY COM 07/30/2013 46 DELV FORD P 65590 ... NaN NaN 2013-07-30 2014-12-19 46 46 Standing or parking on the roadway side of a v... $115 $115 46

5 rows × 51 columns


In [70]:
import re
def strip_and_convert_to_int(string):
    match = re.findall(r"^\$?\d*", string)
    if match:
        new_string = string.replace("$", "").split()
        new_int = int(new_string[0])
        return new_int
    else:
        return None

In [71]:
diff_violations_df['Manhattan 96th and below'] = diff_violations_df['Manhattan 96th & below'].apply(strip_and_convert_to_int)


/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [72]:
diff_violations_df.head()


Out[72]:
Summons Number Plate ID Registration State Plate Type Issue Date Violation Code Vehicle Body Type Vehicle Make Issuing Agency Street Code1 ... Double Parking Violation New Issue Date New Vehicle Expiration Date Violation code CODE DEFINITION Manhattan 96th & below All other areas Code Manhattan 96th and below
0 1283294138 GBB9093 NY PAS 08/04/2013 46 SUBN AUDI P 37250 ... NaN 2013-08-04 2014-08-31 46 46 Standing or parking on the roadway side of a v... $115 $115 46 115
1 1283294151 62416MB NY COM 08/04/2013 46 VAN FORD P 37290 ... NaN 2013-08-04 2014-04-30 46 46 Standing or parking on the roadway side of a v... $115 $115 46 115
2 1283294163 78755JZ NY COM 08/05/2013 46 P-U CHEVR P 37030 ... NaN 2013-08-05 2014-02-28 46 46 Standing or parking on the roadway side of a v... $115 $115 46 115
3 1283294175 63009MA NY COM 08/05/2013 46 VAN FORD P 37270 ... NaN 2013-08-05 2014-10-31 46 46 Standing or parking on the roadway side of a v... $115 $115 46 115
4 1290834374 42587JW NY COM 07/30/2013 46 DELV FORD P 65590 ... NaN 2013-07-30 2014-12-19 46 46 Standing or parking on the roadway side of a v... $115 $115 46 115

5 rows × 52 columns


In [73]:
diff_violations_df['All Other Areas'] = diff_violations_df['All other areas'].apply(strip_and_convert_to_int)
diff_violations_df.tail()


/usr/local/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
Out[73]:
Summons Number Plate ID Registration State Plate Type Issue Date Violation Code Vehicle Body Type Vehicle Make Issuing Agency Street Code1 ... New Issue Date New Vehicle Expiration Date Violation code CODE DEFINITION Manhattan 96th & below All other areas Code Manhattan 96th and below All Other Areas
635 1333609826 GCL1507 NY PAS 07/18/2013 75 SDN NISSA T 89930 ... 2013-07-18 2015-04-29 75 75 Standing or parking a vehicle in which the Lic... $65 $65 75 65 65
636 1308157320 GFS4162 NY PAS 07/19/2013 80 SUBN CADIL P 60810 ... 2013-07-19 2014-01-21 80 80 Standing or parking a vehicle without head la... $60 $45 80 60 45
637 1308157344 GER6778 NY PAS 07/19/2013 61 SDN HYUND P 22210 ... 2013-07-19 2015-02-01 61 61 Except where angle parking is allowed, stoppin... $65 $45 61 65 45
638 1334736820 GGN3919 NY OMS 08/13/2013 61 SDN FORD X 6380 ... 2013-08-13 2014-09-30 61 61 Except where angle parking is allowed, stoppin... $65 $45 61 65 45
639 1333925347 97431JZ NY COM 08/09/2013 45 DELV HINO T 18170 ... 2013-08-09 2013-12-31 45 45 Stopping, standing or parking in a traffic lan... $115 $115 45 115 115

5 rows × 53 columns


In [74]:
diff_violations_df['All Other Areas'].value_counts().head()


Out[74]:
45     246
115    210
60      83
95      75
65      16
Name: All Other Areas, dtype: int64

In [75]:
manhattan_violations = diff_violations_df.groupby('Violation code')['All Other Areas'].sum()
manhattan_violations.sum()


Out[75]:
50150

In [76]:
violations_not_man = diff_violations_df.groupby('Violation code')['Manhattan 96th and below'].sum()
violations_not_man.sum()


Out[76]:
55180

In [77]:
violations_revenue = violations_not_man.sum() + manhattan_violations.sum()

In [78]:
violations_revenue


Out[78]:
105330

9. What's the most lucrative kind of parking violation? The most frequent?


In [79]:
manhattan_violations.sort_values(ascending=False)


Out[79]:
Violation code
21    10935
14     8165
19     6440
17     5510
46     5405
20     3180
40     2760
24     1740
67      825
51      690
94      600
98      570
63      570
16      475
71      455
27      360
50      345
74      325
78      195
31      115
18      115
45      115
61       90
75       65
39       60
80       45
Name: All Other Areas, dtype: int64

In [80]:
violations_not_man.sort_values(ascending=False)


Out[80]:
Violation code
21    15795
14     8165
19     6440
17     5510
46     5405
20     3445
40     2760
24     1885
67      825
51      690
98      570
63      570
16      475
71      455
27      360
50      345
74      325
94      300
78      195
61      130
31      115
18      115
45      115
39       65
75       65
80       60
Name: Manhattan 96th and below, dtype: int64

In [81]:
new_violations_df['Violation code'].value_counts()


Out[81]:
21    243
14     71
17     58
19     56
20     53
46     47
24     29
40     24
71      7
63      6
98      6
51      6
74      5
67      5
16      5
94      3
50      3
78      3
27      2
61      2
45      1
31      1
75      1
39      1
99      1
80      1
18      1
Name: Violation code, dtype: int64

10. New Jersey has bad drivers, but does it have bad parkers, too? How much money does NYC make off of all non-New York vehicles?


In [82]:
out_of_staters_df = diff_violations_df[diff_violations_df['Registration State'] != 'NY']
out_of_staters_df.head()


Out[82]:
Summons Number Plate ID Registration State Plate Type Issue Date Violation Code Vehicle Body Type Vehicle Make Issuing Agency Street Code1 ... New Issue Date New Vehicle Expiration Date Violation code CODE DEFINITION Manhattan 96th & below All other areas Code Manhattan 96th and below All Other Areas
17 1307452590 40470JU 99 COM 08/12/2013 46 DELV MERCU P 10110 ... 2013-08-12 2014-06-30 46 46 Standing or parking on the roadway side of a v... $115 $115 46 115 115
30 1295551032 YZR9642 PA PAS 07/29/2013 46 DELV FREIT P 28265 ... 2013-07-29 2013-08-31 46 46 Standing or parking on the roadway side of a v... $115 $115 46 115 115
44 1334596608 GEB5618 99 PAS 08/13/2013 46 SUBN HONDA S 36690 ... 2013-08-13 2015-01-11 46 46 Standing or parking on the roadway side of a v... $115 $115 46 115 115
77 1333711451 GPJ3413 PA PAS 07/07/2013 14 SDN ACURA T 34850 ... 2013-07-07 NaT 14 14 General No Standing: Standing or parking where... $115 $115 14 115 115
129 1308415473 H2749F 99 999 07/17/2013 24 SUBN JEEP X 0 ... 2013-07-17 2013-11-30 24 24 Authorized Vehicles Only: No parking where par... $65 $60 24 65 60

5 rows × 53 columns


In [83]:
out_of_staters_other = out_of_staters_df.groupby('Violation code')['All Other Areas'].sum()
out_of_staters_other.sum()


Out[83]:
7215

In [84]:
out_of_staters_manhattan= out_of_staters_df.groupby('Violation code')['Manhattan 96th and below'].sum()
out_of_staters_manhattan.sum()


Out[84]:
7960

In [85]:
total_out_of_staters_violations = out_of_staters_other.sum()+ out_of_staters_manhattan.sum()
total_out_of_staters_violations


Out[85]:
15175

11. Make a chart of the top few.


In [87]:
%matplotlib inline

In [88]:
out_of_staters_other.sort_values(ascending=False).plot(kind='bar', x='Violation code')


Out[88]:
<matplotlib.axes._subplots.AxesSubplot at 0x10e8f0ba8>

In [89]:
out_of_staters_manhattan.sort_values(ascending=False).plot(kind='bar', x='Violation code')


Out[89]:
<matplotlib.axes._subplots.AxesSubplot at 0x10daf80b8>

12. What time of day do people usually get their tickets? You can break the day up into several blocks - for example 12am-6am, 6am-12pm, 12pm-6pm, 6pm-12am.


In [ ]:

13. What's the average ticket cost in NYC?


In [91]:
average_tix_price = total_out_of_staters_violations / diff_violations_df['Violation code'].value_counts().sum()
average_tix_price


Out[91]:
23.7109375

14. Make a graph of the number of tickets per day.


In [92]:
diff_violations_df['Issue Date'].value_counts().head(10).plot(kind='barh')


Out[92]:
<matplotlib.axes._subplots.AxesSubplot at 0x10dbc8198>

15. Make a graph of the amount of revenue collected per day.


In [93]:
daily_revenue = total_out_of_staters_violations / new_violations_df['New Issue Date'].value_counts()
daily_revenue.sort_values(ascending=False).head(20).plot(kind='bar')


Out[93]:
<matplotlib.axes._subplots.AxesSubplot at 0x10f244eb8>

16. Manually construct a dataframe out of https://dmv.ny.gov/statistic/2015licinforce-web.pdf (only NYC boroughts - bronx, queens, manhattan, staten island, brooklyn), having columns for borough name, abbreviation, and number of licensed drivers.


In [95]:
nyc_licenses = pd.read_excel("NYC.xlsx")
nyc_licenses


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-95-93dcaeb850e2> in <module>()
----> 1 nyc_licenses = pd.read_excel("NYC.xlsx")
      2 nyc_licenses

/Users/skkandrach/.virtualenvs/lede/lib/python3.5/site-packages/pandas/io/excel.py in read_excel(io, sheetname, header, skiprows, skip_footer, index_col, names, parse_cols, parse_dates, date_parser, na_values, thousands, convert_float, has_index_names, converters, engine, squeeze, **kwds)
    168     """
    169     if not isinstance(io, ExcelFile):
--> 170         io = ExcelFile(io, engine=engine)
    171 
    172     return io._parse_excel(

/Users/skkandrach/.virtualenvs/lede/lib/python3.5/site-packages/pandas/io/excel.py in __init__(self, io, **kwds)
    225             self.book = xlrd.open_workbook(file_contents=data)
    226         elif isinstance(io, compat.string_types):
--> 227             self.book = xlrd.open_workbook(io)
    228         else:
    229             raise ValueError('Must explicitly set engine if not passing in'

/Users/skkandrach/.virtualenvs/lede/lib/python3.5/site-packages/xlrd/__init__.py in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows)
    393         peek = file_contents[:peeksz]
    394     else:
--> 395         with open(filename, "rb") as f:
    396             peek = f.read(peeksz)
    397     if peek == b"PK\x03\x04": # a ZIP file

FileNotFoundError: [Errno 2] No such file or directory: 'NYC.xlsx'

17. What's the parking-ticket-$-per-licensed-driver in each borough of NYC? Do this with pandas and the dataframe you just made, not with your head!


In [98]:
diff_violations_df.columns


Out[98]:
Index(['Summons Number', 'Plate ID', 'Registration State', 'Plate Type',
       'Issue Date', 'Violation Code', 'Vehicle Body Type', 'Vehicle Make',
       'Issuing Agency', 'Street Code1', 'Street Code2', 'Street Code3',
       'Vehicle Expiration Date', 'Violation Location', 'Violation Precinct',
       'Issuer Precinct', 'Issuer Code', 'Issuer Command', 'Issuer Squad',
       'Violation Time', 'Time First Observed', 'Violation County',
       'Violation In Front Of Or Opposite', 'House Number', 'Street Name',
       'Intersecting Street', 'Date First Observed', 'Law Section',
       'Sub Division', 'Violation Legal Code', 'Days Parking In Effect    ',
       'From Hours In Effect', 'To Hours In Effect', 'Vehicle Color',
       'Unregistered Vehicle?', 'Vehicle Year', 'Meter Number',
       'Feet From Curb', 'Violation Post Code', 'Violation Description',
       'No Standing or Stopping Violation', 'Hydrant Violation',
       'Double Parking Violation', 'New Issue Date',
       'New Vehicle Expiration Date', 'Violation code', 'CODE', 'DEFINITION',
       'Manhattan 96th & below', 'All other areas', 'Code',
       'Manhattan 96th and below', 'All Other Areas'],
      dtype='object')

In [99]:
diff_violations_df['Violation County'].value_counts()


Out[99]:
K     240
NY    152
Q     138
BX     53
R      12
Name: Violation County, dtype: int64

In [100]:
bronx_violations = diff_violations_df[diff_violations_df['Violation County'] == 'BX']
bronx_licenses = nyc_licenses['Total'][nyc_licenses['Abbreviation'] == 'BX']
bronx_tix = bronx_violations.groupby('Violation code')['All Other Areas'].sum()
driver_bronx_tix = bronx_licenses / bronx_tix.sum()
driver_bronx_tix


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-100-6921bd6f4052> in <module>()
      1 bronx_violations = diff_violations_df[diff_violations_df['Violation County'] == 'BX']
----> 2 bronx_licenses = nyc_licenses['Total'][nyc_licenses['Abbreviation'] == 'BX']
      3 bronx_tix = bronx_violations.groupby('Violation code')['All Other Areas'].sum()
      4 driver_bronx_tix = bronx_licenses / bronx_tix.sum()
      5 driver_bronx_tix

NameError: name 'nyc_licenses' is not defined

In [101]:
queens_violations = diff_violations_df[diff_violations_df['Violation County'] == 'Q']
queens_licenses = nyc_licenses['Total'][nyc_licenses['Abbreviation'] == 'Q']
queens_tix = queens_violations.groupby('Violation code')['All Other Areas'].sum()
driver_queens_tix = queens_licenses / queens_tix.sum()
driver_queens_tix


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-101-c186341a5b90> in <module>()
      1 queens_violations = diff_violations_df[diff_violations_df['Violation County'] == 'Q']
----> 2 queens_licenses = nyc_licenses['Total'][nyc_licenses['Abbreviation'] == 'Q']
      3 queens_tix = queens_violations.groupby('Violation code')['All Other Areas'].sum()
      4 driver_queens_tix = queens_licenses / queens_tix.sum()
      5 driver_queens_tix

NameError: name 'nyc_licenses' is not defined

In [102]:
ny_violations = diff_violations_df[diff_violations_df['Violation County'] == 'NY']
ny_licenses = nyc_licenses['Total'][nyc_licenses['Abbreviation'] == 'NY']
ny_tix = ny_violations.groupby('Violation code')['All Other Areas'].sum()
driver_ny_tix = ny_licenses / ny_tix.sum()
driver_ny_tix


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-102-97a57230fe91> in <module>()
      1 ny_violations = diff_violations_df[diff_violations_df['Violation County'] == 'NY']
----> 2 ny_licenses = nyc_licenses['Total'][nyc_licenses['Abbreviation'] == 'NY']
      3 ny_tix = ny_violations.groupby('Violation code')['All Other Areas'].sum()
      4 driver_ny_tix = ny_licenses / ny_tix.sum()
      5 driver_ny_tix

NameError: name 'nyc_licenses' is not defined

In [103]:
brooklyn_violations = diff_violations_df[diff_violations_df['Violation County'] == 'R']
brooklyn_licenses = nyc_licenses['Total'][nyc_licenses['Abbreviation'] == 'R']
brooklyn_tix = brooklyn_violations.groupby('Violation code')['All Other Areas'].sum()
driver_brooklyn_tix = brooklyn_licenses / brooklyn_tix.sum()
driver_brooklyn_tix


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-103-c47f9af75edb> in <module>()
      1 brooklyn_violations = diff_violations_df[diff_violations_df['Violation County'] == 'R']
----> 2 brooklyn_licenses = nyc_licenses['Total'][nyc_licenses['Abbreviation'] == 'R']
      3 brooklyn_tix = brooklyn_violations.groupby('Violation code')['All Other Areas'].sum()
      4 driver_brooklyn_tix = brooklyn_licenses / brooklyn_tix.sum()
      5 driver_brooklyn_tix

NameError: name 'nyc_licenses' is not defined

In [104]:
staten_is_violations = diff_violations_df[diff_violations_df['Violation County'] == 'K']
staten_is_licenses = nyc_licenses['Total'][nyc_licenses['Abbreviation'] == 'K']
staten_is_tix = violations_kings.groupby('Violation code')['All Other Areas'].sum()
driver_staten_is_tix = staten_is_licenses / staten_is_tix.sum()
driver_staten_is_tix


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-104-2846d34744b9> in <module>()
      1 staten_is_violations = diff_violations_df[diff_violations_df['Violation County'] == 'K']
----> 2 staten_is_licenses = nyc_licenses['Total'][nyc_licenses['Abbreviation'] == 'K']
      3 staten_is_tix = violations_kings.groupby('Violation code')['All Other Areas'].sum()
      4 driver_staten_is_tix = staten_is_licenses / staten_is_tix.sum()
      5 driver_staten_is_tix

NameError: name 'nyc_licenses' is not defined

In [ ]: