In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

import pandas as pd

In [15]:
df = pd.read_csv('stage1WithDumWithDemo.csv')
label = 'no_affidavit'
X, y = df.drop(label, axis=1), df[label]
X = X.drop(['officer_gender', 'officer_race', 'rank', 'complainant_gender', 'complainant_race', 'crid', 'Unnamed: 0'], axis=1)
X = X.fillna(df.mean())
X


---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<ipython-input-15-3a41ae9d237a> in <module>()
----> 1 df = pd.read_csv('stage1WithDumWithDemo.csv')
      2 label = 'no_affidavit'
      3 X, y = df.drop(label, axis=1), df[label]
      4 X = X.drop(['officer_gender', 'officer_race', 'rank', 'complainant_gender', 'complainant_race', 'crid', 'Unnamed: 0'], axis=1)
      5 X = X.fillna(df.mean())

/home/christine/anaconda3/envs/hw/lib/python3.5/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, dialect, compression, doublequote, escapechar, quotechar, quoting, skipinitialspace, lineterminator, header, index_col, names, prefix, skiprows, skipfooter, skip_footer, na_values, true_values, false_values, delimiter, converters, dtype, usecols, engine, delim_whitespace, as_recarray, na_filter, compact_ints, use_unsigned, low_memory, buffer_lines, warn_bad_lines, error_bad_lines, keep_default_na, thousands, comment, decimal, parse_dates, keep_date_col, dayfirst, date_parser, memory_map, float_precision, nrows, iterator, chunksize, verbose, encoding, squeeze, mangle_dupe_cols, tupleize_cols, infer_datetime_format, skip_blank_lines)
    496                     skip_blank_lines=skip_blank_lines)
    497 
--> 498         return _read(filepath_or_buffer, kwds)
    499 
    500     parser_f.__name__ = name

/home/christine/anaconda3/envs/hw/lib/python3.5/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    273 
    274     # Create the parser.
--> 275     parser = TextFileReader(filepath_or_buffer, **kwds)
    276 
    277     if (nrows is not None) and (chunksize is not None):

/home/christine/anaconda3/envs/hw/lib/python3.5/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    588             self.options['has_index_names'] = kwds['has_index_names']
    589 
--> 590         self._make_engine(self.engine)
    591 
    592     def _get_options_with_defaults(self, engine):

/home/christine/anaconda3/envs/hw/lib/python3.5/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
    729     def _make_engine(self, engine='c'):
    730         if engine == 'c':
--> 731             self._engine = CParserWrapper(self.f, **self.options)
    732         else:
    733             if engine == 'python':

/home/christine/anaconda3/envs/hw/lib/python3.5/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1101         kwds['allow_leading_cols'] = self.index_col is not False
   1102 
-> 1103         self._reader = _parser.TextReader(src, **kwds)
   1104 
   1105         # XXX

pandas/parser.pyx in pandas.parser.TextReader.__cinit__ (pandas/parser.c:3246)()

pandas/parser.pyx in pandas.parser.TextReader._setup_parser_source (pandas/parser.c:6111)()

OSError: File b'stage1WithDumWithDemo.csv' does not exist

In [3]:
X.shape


Out[3]:
(18721, 109)

In [4]:
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

In [5]:
forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

In [6]:
columns = X.columns
print("Feature ranking:")
for f in range(X.shape[1]):
    
    print("%d. feature %d %s (%f)" % (f + 1, indices[f], columns[f], importances[indices[f]]))


Feature ranking:
1. feature 4 officer_id (0.070387)
2. feature 89 beat (0.065578)
3. feature 0 investigator_id (0.050727)
4. feature 5 weekend (0.011651)
5. feature 108 police_investigator (0.010848)
6. feature 6 officers_age (0.010600)
7. feature 42 agesqrd (0.010305)
8. feature 88 rodents_count_7days500m (0.009817)
9. feature 84 rodents_count_3months500m (0.009786)
10. feature 18 rodents_count_7days1000m (0.009625)
11. feature 40 rodents_count_3months1000m (0.009464)
12. feature 47 rodents_count_7days2500m (0.009291)
13. feature 48 rodents_count_3months2500m (0.009162)
14. feature 59 garbage_count_7days500m (0.009143)
15. feature 46 garbage_count_3months500m (0.009085)
16. feature 85 garbage_count_7days1000m (0.009038)
17. feature 2 garbage_count_3months1000m (0.008990)
18. feature 81 garbage_count_7days2500m (0.008965)
19. feature 38 garbage_count_3months2500m (0.008964)
20. feature 65 sanitation_count_7days500m (0.008869)
21. feature 76 sanitation_count_3months500m (0.008792)
22. feature 52 sanitation_count_7days1000m (0.008715)
23. feature 22 sanitation_count_3months1000m (0.008679)
24. feature 41 sanitation_count_7days2500m (0.008665)
25. feature 24 sanitation_count_3months2500m (0.008616)
26. feature 50 alleylights_count_7days500m (0.008610)
27. feature 44 alleylights_count_3months500m (0.008606)
28. feature 23 alleylights_count_7days1000m (0.008565)
29. feature 17 alleylights_count_3months1000m (0.008543)
30. feature 53 alleylights_count_7days2500m (0.008524)
31. feature 80 alleylights_count_3months2500m (0.008428)
32. feature 14 vacantbuildings_count_7days500m (0.008304)
33. feature 72 vacantbuildings_count_3months500m (0.008297)
34. feature 20 vacantbuildings_count_7days1000m (0.008295)
35. feature 15 vacantbuildings_count_3months1000m (0.008183)
36. feature 30 vacantbuildings_count_7days2500m (0.008173)
37. feature 77 vacantbuildings_count_3months2500m (0.008168)
38. feature 61 streetlights_all_count_7days500m (0.008165)
39. feature 16 streetlights_all_count_3months500m (0.008117)
40. feature 45 streetlights_all_count_7days1000m (0.008101)
41. feature 64 streetlights_all_count_3months1000m (0.008050)
42. feature 63 streetlights_all_count_7days2500m (0.007992)
43. feature 58 streetlights_all_count_3months2500m (0.007940)
44. feature 21 vehicles_count_7days500m (0.007927)
45. feature 70 vehicles_count_3months500m (0.007920)
46. feature 32 vehicles_count_7days1000m (0.007864)
47. feature 13 vehicles_count_3months1000m (0.007853)
48. feature 29 vehicles_count_7days2500m (0.007821)
49. feature 68 vehicles_count_3months2500m (0.007787)
50. feature 28 streetlights_one_count_7days500m (0.007774)
51. feature 71 streetlights_one_count_3months500m (0.007755)
52. feature 51 streetlights_one_count_7days1000m (0.007704)
53. feature 60 streetlights_one_count_3months1000m (0.007658)
54. feature 82 streetlights_one_count_7days2500m (0.007627)
55. feature 73 treetrims_count_7days500m (0.007626)
56. feature 62 treetrims_count_3months500m (0.007588)
57. feature 26 treetrims_count_7days1000m (0.007577)
58. feature 36 treetrims_count_3months1000m (0.007568)
59. feature 35 treetrims_count_7days2500m (0.007536)
60. feature 87 treetrims_count_3months2500m (0.007497)
61. feature 86 potholes_count_7days500m (0.007489)
62. feature 39 potholes_count_3months500m (0.007489)
63. feature 57 potholes_count_7days1000m (0.007450)
64. feature 78 potholes_count_3months1000m (0.007363)
65. feature 12 potholes_count_7days2500m (0.007340)
66. feature 56 potholes_count_3months2500m (0.007307)
67. feature 67 graffiti_count_7days500m (0.007305)
68. feature 11 graffiti_count_3months500m (0.007295)
69. feature 49 graffiti_count_7days1000m (0.007278)
70. feature 74 graffiti_count_3months1000m (0.007254)
71. feature 27 graffiti_count_7days2500m (0.007242)
72. feature 34 graffiti_count_3months2500m (0.007242)
73. feature 55 crimes_03_14days500m (0.007185)
74. feature 10 crimes_06_14days500m (0.007142)
75. feature 9 crimes_18_14days500m (0.007137)
76. feature 7 crimes_22_14days500m (0.007114)
77. feature 90 crimes_03_3months500m (0.007083)
78. feature 106 crimes_06_3months500m (0.007078)
79. feature 94 crimes_18_3months500m (0.007061)
80. feature 8 crimes_22_3months500m (0.007041)
81. feature 37 crimes_03_14days1000m (0.007031)
82. feature 1 crimes_06_14days1000m (0.006988)
83. feature 69 crimes_18_14days1000m (0.006917)
84. feature 95 crimes_22_14days1000m (0.006907)
85. feature 33 crimes_03_3months1000m (0.006842)
86. feature 19 crimes_06_3months1000m (0.006840)
87. feature 91 crimes_18_3months1000m (0.006818)
88. feature 105 crimes_22_3months1000m (0.006794)
89. feature 96 crimes_03_14days2500m (0.006762)
90. feature 102 priors (0.006716)
91. feature 66 pct017 (0.006670)
92. feature 43 pct1824 (0.006649)
93. feature 93 pct2534 (0.006607)
94. feature 100 pct3544 (0.006603)
95. feature 104 pct4554 (0.006578)
96. feature 98 pct5564 (0.006572)
97. feature 54 pct6500 (0.006552)
98. feature 103 ptnla (0.006541)
99. feature 92 ptnlb (0.006483)
100. feature 99 ptnlwh (0.006449)
101. feature 101 ptnloth (0.006379)
102. feature 25 ptl (0.006266)
103. feature 107 ptlths (0.006205)
104. feature 3 pthsged (0.005898)
105. feature 97 ptsomeco (0.005594)
106. feature 31 ptbaplus (0.005546)
107. feature 79 ptpov (0.005047)
108. feature 83 pctfb (0.004065)
109. feature 75 complainant_age (0.001883)

In [7]:
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()



In [9]:
indices


Out[9]:
array([  4,  89,   0,   5, 108,   6,  42,  88,  84,  18,  40,  47,  48,
        59,  46,  85,   2,  81,  38,  65,  76,  52,  22,  41,  24,  50,
        44,  23,  17,  53,  80,  14,  72,  20,  15,  30,  77,  61,  16,
        45,  64,  63,  58,  21,  70,  32,  13,  29,  68,  28,  71,  51,
        60,  82,  73,  62,  26,  36,  35,  87,  86,  39,  57,  78,  12,
        56,  67,  11,  49,  74,  27,  34,  55,  10,   9,   7,  90, 106,
        94,   8,  37,   1,  69,  95,  33,  19,  91, 105,  96, 102,  66,
        43,  93, 100, 104,  98,  54, 103,  92,  99, 101,  25, 107,   3,
        97,  31,  79,  83,  75])

In [10]:
importances


Out[10]:
array([ 0.0507268 ,  0.00698777,  0.00899039,  0.00589762,  0.07038704,
        0.01165051,  0.01060038,  0.00711371,  0.00704125,  0.00713657,
        0.00714236,  0.00729488,  0.00733974,  0.00785317,  0.00830376,
        0.00818293,  0.00811708,  0.00854277,  0.00962529,  0.00684   ,
        0.00829509,  0.00792749,  0.00867889,  0.00856517,  0.00861561,
        0.00626568,  0.00757674,  0.00724175,  0.0077741 ,  0.00782146,
        0.00817295,  0.00554613,  0.00786409,  0.0068422 ,  0.00724155,
        0.00753611,  0.00756835,  0.00703068,  0.0089641 ,  0.007489  ,
        0.00946351,  0.00866461,  0.01030549,  0.00664931,  0.00860624,
        0.00810064,  0.00908484,  0.00929096,  0.00916172,  0.00727822,
        0.00860951,  0.00770365,  0.00871529,  0.00852381,  0.00655207,
        0.00718515,  0.00730696,  0.00745004,  0.00794035,  0.00914338,
        0.00765773,  0.00816501,  0.00758754,  0.00799242,  0.00804953,
        0.00886851,  0.00667043,  0.00730522,  0.00778733,  0.00691678,
        0.00791961,  0.00775537,  0.0082974 ,  0.00762586,  0.00725417,
        0.0018826 ,  0.00879236,  0.00816756,  0.00736289,  0.0050466 ,
        0.00842772,  0.00896538,  0.0076267 ,  0.00406461,  0.00978631,
        0.00903845,  0.00748901,  0.00749685,  0.00981725,  0.06557767,
        0.00708291,  0.00681751,  0.00648293,  0.00660735,  0.00706122,
        0.00690709,  0.00676226,  0.00559408,  0.00657244,  0.00644877,
        0.00660268,  0.00637913,  0.00671568,  0.00654104,  0.00657772,
        0.00679384,  0.00707836,  0.00620518,  0.01084803])

In [13]:
pd.factorize(df['officer_gender'])


Out[13]:
(array([ 0,  1,  0, ..., -1, -1, -1]), Index(['M', 'F'], dtype='object'))

In [14]:
df


Out[14]:
Unnamed: 0 crid officer_id no_affidavit beat investigator_id officer_race officer_gender weekend rank ... ptl ptlths pthsged ptsomeco ptbaplus ptpov pctfb complainant_gender complainant_race complainant_age
0 0 1043963 4603 1 1414 625 Hispanic M 1 SGT ... 45.3 14.6 24.0 18.1 43.3 0.216175 26.5 M Unknown NaN
1 1 1043965 7033 0 1933 833 Black F 1 PO ... 4.8 1.6 3.9 8.2 86.3 0.211413 9.0 M White 53.4
2 2 1043965 6732 0 1933 833 Hispanic M 1 PO ... 4.8 1.6 3.9 8.2 86.3 0.211413 9.0 M White 53.4
3 3 1043965 8650 0 1933 833 White M 1 PO ... 4.8 1.6 3.9 8.2 86.3 0.211413 9.0 M White 53.4
4 4 1043966 0 0 321 1451 NaN NaN 1 UNKNOWN ... 1.0 22.7 24.0 37.6 15.7 0.336253 2.2 NaN NaN NaN
5 5 1043968 0 0 2234 316 NaN NaN 0 UNKNOWN ... 0.3 18.1 31.6 31.0 19.4 0.273673 1.4 M Black 29.1
6 6 1043975 8434 0 733 333 Black M 1 PO ... 0.0 25.7 36.6 35.7 2.0 0.419580 0.0 M Black 38.3
7 7 1043975 3781 0 733 333 Black F 1 PO ... 0.0 25.7 36.6 35.7 2.0 0.419580 0.0 M Black 38.3
8 8 1043977 7226 0 1722 981 White M 1 DET ... 53.8 29.3 30.5 21.1 19.0 0.166779 49.5 F White/Hispanic 40.2
9 9 1043981 5117 1 1133 1278 Asian M 1 PO ... 3.2 36.0 25.4 25.6 13.0 0.606905 0.7 F Black 41.2
10 10 1043981 3995 1 1133 1278 White M 1 SGT ... 3.2 36.0 25.4 25.6 13.0 0.606905 0.7 F Black 41.2
11 11 1043981 187 1 1133 1278 White F 1 PO ... 3.2 36.0 25.4 25.6 13.0 0.606905 0.7 F Black 41.2
12 12 1043985 0 0 1121 562 NaN NaN 1 UNKNOWN ... 2.8 45.7 25.6 23.9 4.8 0.438129 0.4 M Black 48.2
13 13 1043988 7800 1 2211 1148 White M 1 PO ... 6.1 2.8 25.1 35.9 36.2 0.004305 4.5 M White/Hispanic 46.7
14 14 1043988 3388 1 2211 1148 White M 1 PO ... 6.1 2.8 25.1 35.9 36.2 0.004305 4.5 M White/Hispanic 46.7
15 15 1043994 0 0 1414 1060 NaN NaN 1 UNKNOWN ... 66.3 17.1 28.8 23.2 30.9 0.212629 34.9 F White/Hispanic 29.2
16 16 1043995 6489 1 2211 779 Hispanic M 1 PO ... 6.2 3.4 12.4 24.7 59.6 0.022326 3.3 F White 56.1
17 17 1043997 1377 1 531 543 Black M 1 PO ... 24.9 21.1 20.7 23.3 34.8 0.211318 10.7 F Black 33.0
18 18 1044003 0 0 312 336 NaN NaN 1 UNKNOWN ... 0.1 30.4 23.0 33.7 12.9 0.639364 0.1 M Black 25.8
19 19 1044005 8251 1 1723 974 Hispanic F 1 PO ... 49.7 26.5 16.0 20.6 36.9 0.064286 43.3 F White 23.2
20 20 1044005 3456 1 1723 974 White F 1 PO ... 49.7 26.5 16.0 20.6 36.9 0.064286 43.3 F White 23.2
21 21 1044006 1143 0 133 1496 Black M 1 DET ... 2.5 12.1 14.9 33.5 39.5 0.197410 14.8 M White 55.9
22 22 1044008 6768 0 1033 1345 Hispanic M 1 UNKNOWN ... 23.2 47.4 34.2 17.5 0.9 0.360302 11.8 M Black 52.1
23 23 1044008 5767 0 1033 1345 White M 1 SGT ... 23.2 47.4 34.2 17.5 0.9 0.360302 11.8 M Black 52.1
24 24 1044008 1497 0 1033 1345 Black M 1 PO ... 23.2 47.4 34.2 17.5 0.9 0.360302 11.8 M Black 52.1
25 25 1044011 5950 0 813 291 Hispanic M 1 PO ... 73.7 37.3 28.8 20.3 13.6 0.219463 34.2 F White/Hispanic 35.4
26 26 1044013 8398 0 1914 1456 White M 1 PO ... 10.9 12.1 15.5 24.3 48.2 0.311876 21.4 M Black 26.8
27 27 1044013 1510 0 1914 1456 White M 1 PO ... 10.9 12.1 15.5 24.3 48.2 0.311876 21.4 M Black 26.8
28 28 1044016 6516 0 614 1213 Black M 1 PO ... 0.0 10.9 37.0 31.7 20.4 0.153909 0.6 F Black 33.2
29 29 1044016 4339 0 614 1213 Asian M 1 SGT ... 0.0 10.9 37.0 31.7 20.4 0.153909 0.6 F Black 33.2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18691 18691 1073896 5480 0 512 NaN Black M 1 PO ... 1.2 18.5 43.0 32.0 6.5 0.383225 0.4 F Black 33.5
18692 18692 1073923 0 0 1113 NaN NaN NaN 0 UNKNOWN ... 0.0 37.6 37.4 18.7 6.2 0.418713 0.0 NaN NaN NaN
18693 18693 1073926 0 0 1533 NaN NaN NaN 1 UNKNOWN ... 10.5 24.1 33.7 29.5 12.7 0.349110 7.8 M White 24.1
18694 18694 1073962 7677 0 811 NaN White M 1 PO ... 29.5 12.4 28.4 34.1 25.2 0.096608 14.5 M Unknown NaN
18695 18695 1073962 7046 0 811 NaN White M 1 PO ... 29.5 12.4 28.4 34.1 25.2 0.096608 14.5 M Unknown NaN
18696 18696 1073962 5555 0 811 NaN White M 1 PO ... 29.5 12.4 28.4 34.1 25.2 0.096608 14.5 M Unknown NaN
18697 18697 1073971 8105 0 2533 NaN White M 0 PO ... 10.3 25.6 36.6 26.6 11.2 0.247359 3.6 NaN NaN NaN
18698 18698 1073971 7289 0 2533 NaN White M 0 PO ... 10.3 25.6 36.6 26.6 11.2 0.247359 3.6 NaN NaN NaN
18699 18699 1073971 6185 0 2533 NaN Hispanic M 0 PO ... 10.3 25.6 36.6 26.6 11.2 0.247359 3.6 NaN NaN NaN
18700 18700 1073971 472 0 2533 NaN White M 0 PO ... 10.3 25.6 36.6 26.6 11.2 0.247359 3.6 NaN NaN NaN
18701 18701 1074025 0 0 111 1170 NaN NaN 1 UNKNOWN ... 7.2 0.6 3.0 12.9 83.5 0.117722 21.5 F Unknown 65.8
18702 18702 1074028 8182 0 1033 843 Hispanic M 1 PO ... 88.1 60.1 18.3 10.8 10.8 0.366885 51.0 NaN NaN NaN
18703 18703 1074028 2258 0 1033 843 Hispanic M 1 PO ... 88.1 60.1 18.3 10.8 10.8 0.366885 51.0 NaN NaN NaN
18704 18704 1074048 7829 0 2531 NaN White M 1 PO ... 14.6 16.8 40.3 35.3 7.7 0.208085 6.5 NaN NaN NaN
18705 18705 1074048 2202 0 2531 NaN White M 1 PO ... 14.6 16.8 40.3 35.3 7.7 0.208085 6.5 NaN NaN NaN
18706 18706 1074060 1081 0 1513 1141 Hispanic M 0 DET ... 4.9 25.9 29.1 30.1 14.9 0.242463 2.2 NaN NaN NaN
18707 18707 1074090 1218 0 1123 NaN Hispanic M 1 PO ... 2.9 22.9 37.1 23.8 16.2 0.304213 2.4 NaN NaN NaN
18708 18708 1074103 0 0 531 NaN NaN NaN 1 UNKNOWN ... 24.9 21.1 20.7 23.3 34.8 0.211318 10.7 NaN NaN NaN
18709 18709 1074106 5624 0 1222 NaN White M 1 PO ... 1.6 26.5 15.8 37.7 20.1 0.308898 1.9 M White NaN
18710 18710 1074140 0 0 2535 NaN NaN NaN 1 UNKNOWN ... 89.6 44.5 26.0 25.2 4.3 0.232830 22.2 NaN NaN NaN
18711 18711 1074149 1081 0 1513 NaN Hispanic M 1 DET ... 4.9 25.9 29.1 30.1 14.9 0.242463 2.2 NaN NaN NaN
18712 18712 1074154 0 0 1921 NaN NaN NaN 0 UNKNOWN ... 16.3 5.2 15.7 15.5 63.6 0.101330 15.5 F White 52.2
18713 18713 10492785 1983 0 715 NaN White M 1 SGT ... 5.3 31.6 27.8 32.4 8.2 0.342271 3.4 F Black 61.1
18714 18714 10600038 8609 0 223 NaN Black M 0 PO ... 7.8 1.3 5.6 17.1 76.1 0.116818 17.7 NaN NaN NaN
18715 18715 10665987 0 0 723 NaN NaN NaN 1 UNKNOWN ... 0.7 28.6 33.0 31.9 6.5 0.416046 0.0 M Black 33.4
18716 18716 10697121 0 0 1033 NaN NaN NaN 1 UNKNOWN ... 96.2 63.5 19.8 10.3 6.3 0.385191 49.6 M White 43.3
18717 18717 10851356 0 0 132 NaN NaN NaN 1 UNKNOWN ... 5.1 6.5 7.6 20.5 65.4 0.095816 21.3 M Black 52.8
18718 18718 14062218 0 0 1225 NaN NaN NaN 1 UNKNOWN ... 5.2 30.4 12.8 21.2 35.7 0.429297 6.1 F Black 42.2
18719 18719 21068960 0 0 313 NaN NaN NaN 0 UNKNOWN ... 0.0 35.9 23.3 24.6 16.2 0.588768 0.2 M Black NaN
18720 18720 106451911 0 0 1924 NaN NaN NaN 0 UNKNOWN ... 6.2 1.9 5.3 22.6 70.2 0.205599 7.3 M Black 21.2

18721 rows × 117 columns


In [ ]: