In [1]:
# takes in a joined dataframe by tuples and generates a matrix by mapping the rows to feature vectors
DATA_PATH = "joined_unfeaturized.csv" 
import pandas as pd
df = pd.read_csv(DATA_PATH, na_values=['-'])
df = df.where((pd.notnull(df)), None)
df.head(2)


Out[1]:
feature-311 tuple 911-reports
0 [{'Category': 'Sign Repair', 'TimeBin': 627414... (6274149.810962193, -122.42479733333332, 37.77... []
1 [{'Category': '311 External Request', 'TimeBin... (68448950.51830366, -122.45177022222221, 37.76... [{'Category': 'WARRANTS', 'TimeBin': 68448950....

In [2]:
df['feature-311'] = df['feature-311'].apply(lambda x: eval(x)) #convert strings to lists
df['911-reports'] = df['911-reports'].apply(lambda x: eval(x))

In [3]:
df.iloc[1][2]


Out[3]:
[{'Category': 'WARRANTS',
  'Descript': 'ENROUTE TO OUTSIDE JURISDICTION',
  'PdDistrict': 'PARK',
  'Resolution': 'ARREST, BOOKED',
  'TimeBin': 68448950.51830366,
  'XBin': -122.45177022222221,
  'YBin': 37.76225922222223},
 {'Category': 'WARRANTS',
  'Descript': 'ENROUTE TO OUTSIDE JURISDICTION',
  'PdDistrict': 'PARK',
  'Resolution': 'NONE',
  'TimeBin': 68448950.51830366,
  'XBin': -122.45177022222221,
  'YBin': 37.76225922222223},
 {'Category': 'FRAUD',
  'Descript': 'CREDIT CARD, THEFT BY USE OF',
  'PdDistrict': 'PARK',
  'Resolution': 'NONE',
  'TimeBin': 68448950.51830366,
  'XBin': -122.45177022222221,
  'YBin': 37.76225922222223},
 {'Category': 'LARCENY/THEFT',
  'Descript': 'GRAND THEFT OF PROPERTY',
  'PdDistrict': 'PARK',
  'Resolution': 'NONE',
  'TimeBin': 68448950.51830366,
  'XBin': -122.45177022222221,
  'YBin': 37.76225922222223},
 {'Category': 'ASSAULT',
  'Descript': 'THREATENING PHONE CALL(S)',
  'PdDistrict': 'PARK',
  'Resolution': 'NONE',
  'TimeBin': 68448950.51830366,
  'XBin': -122.45177022222221,
  'YBin': 37.76225922222223}]

In [4]:
# Get all possible labels for certain categorical data, by looping through the data.
Category_set = set()
Request_Type_set = set()
#Request_Details_set = set()
Supervisor_District_set = set()
#Source_set = set()
#ct=0
for row_tuple in df.iterrows():
#    ct += 1
#    if ct>1:
#        break
    #print row_tuple[1]
    row = row_tuple[1] # a pandas Series
    #print row['feature-311']
    for dct in row['feature-311']:
    #    print dct
        Category = dct['Category']
    #    Request_Type = dct['Request Type']
        #Request_Details = dct['Request Details']
        Supervisor_District = int(dct['Supervisor District'])
        #Source = dct['Source']
        Category_set.add(Category)
    #    Request_Type_set.add(Request_Type)
        #Request_Details_set.add(Request_Details)
        Supervisor_District_set.add(Supervisor_District)
        #Source_set.add(Source)
print "done"


done

In [5]:
print len(Category_set)
#print len(Request_Type_set)
#print len(Request_Details_set)
print len(Supervisor_District_set)
#print len(Source_set)
print Category_set
print Supervisor_District_set
#print Source_set


27
12
set(['Streetlights', 'Abandoned Vehicle', 'Interdepartmental Request', 'MUNI Feedback', 'Graffiti Private Property', 'DPW Volunteer Programs', 'Tree Maintenance', 'Rec and Park Requests', 'Residential Building Request', 'Blocked Street or SideWalk', 'Sidewalk or Curb', 'Temporary Sign Request', 'Sewer Issues', '311 External Request', 'Litter Receptacles', 'Catch Basin Maintenance', 'Street Defects', 'Street and Sidewalk Cleaning', 'Color Curb', 'SFHA Requests', 'Construction Zone Permits', 'Graffiti Public Property', 'Illegal Postings', 'Damaged Property', 'Unpermitted Cab Complaint', 'General Requests', 'Sign Repair'])
set([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])

In [6]:
# Get all possible labels for c911 data, by looping through the data.
Category_set_crime = set()
ct = 0
for row_tuple in df.iterrows():
    ct += 1
    if ct>200000000000000000:
        break
    row = row_tuple[1] # a pandas Series
    #print row['911-reports']
    for dct in row['911-reports']:
     #   print dct
        Category_crime = dct['Category']
    #    Request_Type = dct['Request Type']
        #Request_Details = dct['Request Details']
        #Supervisor_District = int(dct['Supervisor District'])
        #Source = dct['Source']
        Category_set_crime.add(Category_crime)
    #    Request_Type_set.add(Request_Type)
        #Request_Details_set.add(Request_Details)
    #    Supervisor_District_set.add(Supervisor_District)
        #Source_set.add(Source)
print "done"


done

In [7]:
Category_set_crime


Out[7]:
{'ARSON',
 'ASSAULT',
 'BAD CHECKS',
 'BRIBERY',
 'BURGLARY',
 'DISORDERLY CONDUCT',
 'DRIVING UNDER THE INFLUENCE',
 'DRUG/NARCOTIC',
 'DRUNKENNESS',
 'EMBEZZLEMENT',
 'EXTORTION',
 'FAMILY OFFENSES',
 'FORGERY/COUNTERFEITING',
 'FRAUD',
 'GAMBLING',
 'KIDNAPPING',
 'LARCENY/THEFT',
 'LIQUOR LAWS',
 'LOITERING',
 'MISSING PERSON',
 'NON-CRIMINAL',
 'OTHER OFFENSES',
 'PORNOGRAPHY/OBSCENE MAT',
 'PROSTITUTION',
 'RECOVERED VEHICLE',
 'ROBBERY',
 'RUNAWAY',
 'SEX OFFENSES, FORCIBLE',
 'SEX OFFENSES, NON FORCIBLE',
 'STOLEN PROPERTY',
 'SUICIDE',
 'SUSPICIOUS OCC',
 'TRESPASS',
 'VANDALISM',
 'VEHICLE THEFT',
 'WARRANTS',
 'WEAPON LAWS'}

In [8]:
# Generate a feature vector given a list of dictionaries.  Each dictionary represents one 311 report.
# Regularization should also go here
    # @param reports_311: a list of dictionaries.  Each dictionary is of the form
    # {'Category': string, 'Request Details': string, 'Request Type': string, 'Source': string, 'Supervisor District':string,
    #    'TimeBin': float, 'XBin':float, 'YBin':float}

#create map dictionaries that will map each one of the categories from the category sets to an index in feature_vector.
category_map = {}
supervisor_map = {}
#source_map = {}
index = 0
for cat in Category_set:
    category_map[cat] = index
    index += 1 
for cat in Supervisor_District_set:
    supervisor_map[cat] = index
    index += 1
#for cat in Source_set:
#    source_map[cat] = index
#    index += 1
category_map_crime = {}
index2 = 0
for cat in Category_set_crime:
    category_map_crime[cat] = index2
    index2 += 1
    
def generate_feature_vector(reports_311):
    # simple feature vector that is just sum of counts of 311 reports of each category 
    # from (Category, Supervisor District, and Source)
    feature_vector = []
    for i in xrange(index):
        feature_vector.append(0)
    for report in reports_311:
        #print report
        # the first 27 features correspond to the number of 311 reports from each category from Category set
        # the next 12 features correspond to the number of 311 reports from each category from Supervisor_District set
        # the next 9 features correspond to the number of 311 reports from each category from Source set
        feature_vector[category_map[report["Category"]]]+= 1
        feature_vector[supervisor_map[report["Supervisor District"]]]+= 1
        #feature_vector[source_map[report["Source"]]]+=1
    feature_vector.append(len(reports_311))
    return feature_vector

def generate_output_vector(reports_911):
    output_vector = []
    for i in xrange(index2):
        output_vector.append(0)
    for report in reports_911:
        #print report
        output_vector[category_map_crime[report["Category"]]]+= 1
    output_vector.append(len(reports_911))
    return output_vector
print "done"


done

In [9]:
print category_map
print supervisor_map


{'Streetlights': 0, 'Abandoned Vehicle': 1, 'Interdepartmental Request': 2, 'MUNI Feedback': 3, 'Graffiti Private Property': 4, 'DPW Volunteer Programs': 5, 'Tree Maintenance': 6, 'Construction Zone Permits': 20, 'Residential Building Request': 8, 'Blocked Street or SideWalk': 9, 'Sidewalk or Curb': 10, 'Temporary Sign Request': 11, 'Sewer Issues': 12, '311 External Request': 13, 'Litter Receptacles': 14, 'Catch Basin Maintenance': 15, 'Street Defects': 16, 'Street and Sidewalk Cleaning': 17, 'Color Curb': 18, 'SFHA Requests': 19, 'Rec and Park Requests': 7, 'Graffiti Public Property': 21, 'Illegal Postings': 22, 'Damaged Property': 23, 'Unpermitted Cab Complaint': 24, 'General Requests': 25, 'Sign Repair': 26}
{0: 27, 1: 28, 2: 29, 3: 30, 4: 31, 5: 32, 6: 33, 7: 34, 8: 35, 9: 36, 10: 37, 11: 38}

In [10]:
count = 0
df_dict = {}
df_dict['vector'] = []

for row_tuple in df.iterrows():
    count += 1
    if count > 3000000000000000000: 
        break
    row = row_tuple[1] # a pandas Series
    dct = (row['feature-311'])
    f_in = generate_feature_vector(dct)
    dct2 = (row['911-reports'])
    f_out = generate_output_vector(dct2)
    f_both = f_in + f_out
    df_dict['vector'].append(f_both)
print "done"


done

In [11]:
print index
print index2
print category_map_crime


39
37
{'KIDNAPPING': 0, 'WEAPON LAWS': 1, 'WARRANTS': 2, 'PROSTITUTION': 3, 'EMBEZZLEMENT': 4, 'SEX OFFENSES, NON FORCIBLE': 5, 'PORNOGRAPHY/OBSCENE MAT': 29, 'FRAUD': 20, 'DRIVING UNDER THE INFLUENCE': 8, 'ROBBERY': 9, 'BURGLARY': 10, 'STOLEN PROPERTY': 32, 'SUSPICIOUS OCC': 11, 'ARSON': 17, 'ASSAULT': 33, 'FORGERY/COUNTERFEITING': 14, 'BAD CHECKS': 15, 'DRUNKENNESS': 16, 'GAMBLING': 18, 'OTHER OFFENSES': 19, 'SUICIDE': 7, 'RECOVERED VEHICLE': 21, 'SEX OFFENSES, FORCIBLE': 22, 'DRUG/NARCOTIC': 23, 'TRESPASS': 24, 'LOITERING': 6, 'VANDALISM': 26, 'MISSING PERSON': 34, 'LIQUOR LAWS': 30, 'VEHICLE THEFT': 31, 'EXTORTION': 28, 'BRIBERY': 13, 'FAMILY OFFENSES': 12, 'NON-CRIMINAL': 27, 'DISORDERLY CONDUCT': 35, 'RUNAWAY': 36, 'LARCENY/THEFT': 25}

In [12]:
len(df_dict['vector'][0])


Out[12]:
78

In [13]:
final_df = pd.DataFrame(df_dict)
len((final_df.iloc[0])[0])


Out[13]:
78

In [14]:
# use the joined table to create a matrix.  
# The matrix will be Nx(D+1), where N is the number of unique tuples, and D is the number of features in 
# our feature vector.  The last column of the matrix corresponds to the feature-911, i.e. the number
# of 911 reports in the given location/time.
import numpy as np
N = len(final_df)
D = final_df['vector']
#print type(D[i])
mat = np.zeros([N, len(D[0])])
for i in xrange(N):
    mat[i,] = D[i]
print len(mat[13])


78

In [15]:
np.savetxt("joined_matrix_split.txt", mat)

In [16]:
print len(mat)
print mat[0]
print len(mat[0])


249147
[ 0.  0.  0.  0.  2.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  2.  2.  0.  0.
  0.  0.  0.  4.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.]
78

In [18]:
intmat = mat.astype(int)

In [19]:
import matplotlib.pyplot as plt

In [54]:
np.bincount(intmat[:,44])
for k in category_map_crime:
    i = category_map_crime[k]
    print k + ": " + str(np.bincount(intmat[:,40+i]))


KIDNAPPING: [247347   1701     97      2]
WEAPON LAWS: [244746   3757    556     69     16      1      1      1]
WARRANTS: [226669  15941   4394   1413    510    147     43     22      3      2
      1      2]
PROSTITUTION: [247627    868    356    139     67     27     22     14     12      3
      4      5      2      1]
EMBEZZLEMENT: [248454    678     11      3      1]
SEX OFFENSES, NON FORCIBLE: [249050     97]
PORNOGRAPHY/OBSCENE MAT: [249135     12]
FRAUD: [238377   9613    976    144     33      4]
DRIVING UNDER THE INFLUENCE: [247403   1658     78      8]
ROBBERY: [234573  12531   1702    274     55     11      1]
BURGLARY: [227944  18427   2319    359     78     18      0      2]
STOLEN PROPERTY: [248357    756     30      4]
SUSPICIOUS OCC: [233142  14541   1301    144     15      4]
ARSON: [248298    820     25      3      0      1]
ASSAULT: [214942  23795   7100   2213    740    241     77     28      6      3
      1      0      1]
FORGERY/COUNTERFEITING: [245031   3708    354     44      7      2      0      0      1]
BAD CHECKS: [248944    201      2]
DRUNKENNESS: [245925   3030    171     16      2      1      1      1]
GAMBLING: [249074     67      6]
OTHER OFFENSES: [197798  32945  10349   4227   1909    997    448    222    126     50
     27     31      5      3      3      1      2      1      2      0
      0      0      1]
SUICIDE: [248790    355      1      1]
RECOVERED VEHICLE: [246531   2555     59      1      0      0      0      0      0      0
      1]
SEX OFFENSES, FORCIBLE: [246649   2272    195     28      3]
DRUG/NARCOTIC: [230431   8809   5022   2189   1094    599    400    230    134     85
     62     41     20     18      6      1      3      0      3]
TRESPASS: [244651   4233    244     14      3      2]
LOITERING: [248619    480     41      5      2]
VANDALISM: [222753  22503   3213    537    113     17      3      4      1      1
      1      0      0      0      0      0      1]
MISSING PERSON: [236519   5966   5835    438    320     27     29      5      4      2
      2]
LIQUOR LAWS: [247930   1146     59     10      2]
VEHICLE THEFT: [227792  18391   2606    309     41      6      0      0      1      0
      1]
EXTORTION: [248932    206      8      1]
BRIBERY: [249085     62]
FAMILY OFFENSES: [248936    201      8      2]
NON-CRIMINAL: [202958  32799   8464   3015   1182    461    167     64     22      8
      5      1      1]
DISORDERLY CONDUCT: [246110   2676    317     42      2]
RUNAWAY: [247836   1235     71      4      0      1]
LARCENY/THEFT: [179767  40762  14188   6184   3148   1768   1058    749    521    352
    220    156    104     66     42     25     16      7      4      3
      3      3      0      0      1]

In [57]:
#plt.hist((intmat[:,-1]))
#plt.show()
np.bincount(intmat[:,40+37])


Out[57]:
array([101219,  48819,  30121,  19087,  12030,   7973,   5536,   3911,
         3033,   2305,   1956,   1711,   1465,   1362,   1224,   1034,
          890,    830,    700,    651,    593,    472,    394,    332,
          260,    230,    212,    177,    145,    111,     80,     61,
           42,     41,     33,     28,     13,     18,     10,      9,
            7,      7,      2,      4,      4,      0,      3,      2])