In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("Civil_List_2014.csv",
                 names={"DPT","NAME","ADDRESS"})

In [6]:
df.head()


Out[6]:
DPT NAME ADDRESS TTL # PC SAL-RATE
0 868 B J SANDIFORD DEPARTMENT OF CITYWIDE ADM 12702 X $5.00
1 868 C A WIGFALL DEPARTMENT OF CITYWIDE ADM 12702 X $5.00
2 69 A E A-AWOSOGBA HRA/DEPARTMENT OF SOCIAL S 52311 A $51955.00
3 868 K D AABY DEPARTMENT OF CITYWIDE ADM 10209 X $12.00
4 56 I D AADIL POLICE DEPARTMENT 71012 A $46953.00

In [7]:
df['SAL-RATE'].describe()


Out[7]:
count        269838
unique        27533
top       $76488.00
freq          26024
Name: SAL-RATE, dtype: object

In [8]:
df.columns


Out[8]:
Index(['DPT     ', 'NAME    ', 'ADDRESS ', 'TTL #   ', 'PC      ', 'SAL-RATE'], dtype='object')

In [10]:
"DPT   ".strip()


Out[10]:
'DPT'

In [11]:
for col in df.columns:
    print(col.strip())


DPT
NAME
ADDRESS
TTL #
PC
SAL-RATE

In [12]:
# its a list, so-[]

[col.strip() for col in df.columns]


Out[12]:
['DPT', 'NAME', 'ADDRESS', 'TTL #', 'PC', 'SAL-RATE']

In [13]:
df.columns = [col.strip() for col in df.columns]

In [14]:
df.columns


Out[14]:
Index(['DPT', 'NAME', 'ADDRESS', 'TTL #', 'PC', 'SAL-RATE'], dtype='object')

In [15]:
df['PC'].value_counts()


Out[15]:
A     100521
D      54881
X      24403
4      23259
3      19866
1      13083
K       7635
F       6361
N       4671
U       3409
B       2817
BW      1766
E       1300
W       1257
G       1085
I        730
2        721
M        708
O        536
WK       488
P        307
L         25
S          4
0          3
V          2
Name: PC, dtype: int64

In [16]:
df['SAL-RATE'].describe()


Out[16]:
count        269838
unique        27533
top       $76488.00
freq          26024
Name: SAL-RATE, dtype: object

In [18]:
def money_to_float(money_str):
    return float(money_str.replace("$","").replace(",",""))
print(money_to_float("$7888.00"))


7888.0

In [19]:
#money_to_float(df['SAL-RATE'])
df['SAL-RATE'].apply(money_to_float)


Out[19]:
0              5.00
1              5.00
2          51955.00
3             12.00
4          46953.00
5          33000.00
6             73.53
7          33280.00
8          60878.00
9          69272.00
10         89960.00
11         38324.00
12         39019.00
13           369.92
14         66127.00
15         64250.00
16         32510.00
17             2.65
18        109087.00
19        111606.00
20             5.00
21         56555.00
22         51643.00
23         77015.00
24         76488.00
25         67627.00
26         68095.00
27         66127.00
28         76689.00
29         48153.00
            ...    
269808     85287.00
269809     65564.00
269810     56937.00
269811     81439.00
269812    118506.00
269813     42064.00
269814        33.18
269815    116364.00
269816     98072.00
269817    107563.00
269818     37150.00
269819     74646.00
269820     64040.00
269821     76488.00
269822     83802.00
269823        17.16
269824     85667.00
269825        46.89
269826        67.42
269827     51916.00
269828     57514.00
269829     74605.00
269830     74605.00
269831     64403.00
269832     51936.00
269833        47.58
269834     87738.00
269835     78683.00
269836     76488.00
269837         5.00
Name: SAL-RATE, dtype: float64

In [20]:
df['salary']= df['SAL-RATE'].apply(money_to_float)

In [21]:
df.head()


Out[21]:
DPT NAME ADDRESS TTL # PC SAL-RATE salary
0 868 B J SANDIFORD DEPARTMENT OF CITYWIDE ADM 12702 X $5.00 5.0
1 868 C A WIGFALL DEPARTMENT OF CITYWIDE ADM 12702 X $5.00 5.0
2 69 A E A-AWOSOGBA HRA/DEPARTMENT OF SOCIAL S 52311 A $51955.00 51955.0
3 868 K D AABY DEPARTMENT OF CITYWIDE ADM 10209 X $12.00 12.0
4 56 I D AADIL POLICE DEPARTMENT 71012 A $46953.00 46953.0

In [23]:
%matplotlib inline

In [26]:
df['salary'].hist(bins=50) #bins is 50 bars


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x1172843c8>

In [27]:
df['ADDRESS'].value_counts()


Out[27]:
N.Y.C. TRANSIT AUTHORITY      56932
POLICE DEPARTMENT             51453
FIRE DEPARTMENT               15848
DEPARTMENT OF CITYWIDE ADM    15739
HRA/DEPARTMENT OF SOCIAL S    13676
DEPARTMENT OF EDUCATION       11689
N.Y.C. HOUSING AUTHORITY      11501
DEPARTMENT OF CORRECTION      10188
DEPARTMENT OF SANITATION       9666
DEPARTMENT OF PARKS & RECR     7261
ADMINISTRATION FOR CHILDRE     6033
DEPARTMENT OF ENVIRONMENTA     5717
DEPARTMENT OF HEALTH AND M     5676
DEPARTMENT OF TRANSPORTATI     4589
CUNY LAGUARDIA COMMUNITY C     3379
CUNY KINGSBOROUGH COMMMUNI     3260
CUNY MANHATTAN COMMUNITY C     2974
TRIBOROUGH BRIDGE AND TUNN     2254
HOUSING PRESERVATION & DEV     2078
CUNY QUEENSBOROUGH COMMUNI     2030
DEPARTMENT OF HOMELESS SER     1966
DEPARTMENT OF FINANCE          1953
CUNY BRONX COMMUNITY COLLE     1923
LAW DEPARTMENT                 1514
DISTRICT ATTORNEY-NEW YORK     1384
CUNY HOSTOS COMMUNITY COLL     1295
DEPARTMENT OF DESIGN AND C     1193
DEPARTMENT OF INFORMATION      1162
DEPARTMENT OF BUILDINGS        1111
DISTRICT ATTORNEY-KINGS CO     1062
                              ...  
COMMUNITY BOARD NO.3-RICHM        3
COMMUNITY BOARD NO.10-BROO        3
COMMUNITY BOARD NO.14-QUEE        3
COMMUNITY BOARD NO.11-QUEE        3
COMMUNITY BOARD NO.4 BRONX        3
COMMUNITY BOARD N0.9-MANHA        3
COMMUNITY BOARD NO.14-BROO        3
COMMUNITY BOARD NO.1-QUEEN        3
COMMUNITY BOARD NO.9-BRONX        3
COMMUNITY BOARD NO.1-BROOK        3
COMMUNITY BOARD NO.5-MANHA        3
COMMUNITY BOARD NO.1-RICHM        3
COMMUNITY BOARD NO.3-BROOK        3
COMMUNITY BOARD NO.12-QUEE        3
COMMUNITY BOARD NO.2-BRONX        3
COMMUNITY BOARD NO.13-QUEE        3
COMMUNITY BOARD NO.6-BROOK        3
COMMUNITY BOARD NO.13-BROO        3
COMMUNITY BOARD NO.7-BRONX        3
COMMUNITY BOARD NO.12-BRON        3
COMMUNITY BOARD NO.1 BRONX        2
COMMUNITY BOARD NO.9-BROOK        2
COMMUNITY BOARD NO.9-QUEEN        2
COMMUNITY BOARD NO.6-MANHA        2
COMMUNITY BOARD NO.16-BROO        2
CUNY MEDGAR EVERS COLLEGE         2
COMMUNITY BOARD NO.5-BRONX        2
COMMUNITY BOARD NO.6-BRONX        2
COMMUNITY BOARD NO.3-BRONX        2
COMMUNITY BOARD NO.15-BROO        2
Name: ADDRESS, dtype: int64

In [29]:
agencies_df = pd.read_csv("cleaned-agencies.csv", dtype='str')

In [30]:
agencies_df.head()


Out[30]:
code agency is_mayoral
0 002 OFFICE OF THE MAYOR N
1 003 BOARD OF ELECTIONS Y
2 004 CAMPAIGN FINANCE BOARD Y
3 005 BOARD OF ESTIMATE-SECRETARY Y
4 006 MAYOR'S OFFICE OF OPERATIONS N

In [31]:
col_types={'code':'str'}
agencies_df = pd.read_csv("cleaned-agencies.csv", dtype='str', true_values='Y', false_values='N')
agencies_df.head()


Out[31]:
code agency is_mayoral
0 002 OFFICE OF THE MAYOR N
1 003 BOARD OF ELECTIONS Y
2 004 CAMPAIGN FINANCE BOARD Y
3 005 BOARD OF ESTIMATE-SECRETARY Y
4 006 MAYOR'S OFFICE OF OPERATIONS N

In [ ]:
agencies_df = pd.read_csv("cleaned-agencies.csv", na_values="-999", true_values='Y', false_values='N')

In [ ]:
df.merge(agencies_df)