In [1]:
# This notebook takes the joined dataframe (one row per tuple) and builds a matrix
# by mapping each row to a feature vector.
import pandas as pd

DATA_PATH = "joined_unfeaturized.csv"

# A literal '-' in the CSV is read as a missing value.
raw_df = pd.read_csv(DATA_PATH, na_values=['-'])
# Replace every missing entry with None; all other cells are kept as read.
df = raw_df.where(raw_df.notnull(), None)
df.head(2)
Out[1]:
In [2]:
# Convert the stringified report lists in both columns back into Python objects.
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot run
# arbitrary code that happens to be embedded in the CSV.
import ast


def _parse_report_list(cell):
    """Parse one cell with ast.literal_eval, unless it is already a list.

    Passing lists through unchanged lets this cell be re-run on a frame whose
    columns were already converted.
    """
    if isinstance(cell, list):
        return cell
    return ast.literal_eval(cell)


df['feature-311'] = df['feature-311'].apply(_parse_report_list)
df['911-reports'] = df['911-reports'].apply(_parse_report_list)
In [3]:
# Peek at a single cell: second row, third column (both positional).
# One .iloc call avoids integer-key lookup on the label-indexed row Series.
df.iloc[1, 2]
Out[3]:
In [4]:
# Scan every 311 report once and collect the distinct labels of the categorical fields.
Category_set = set()
Request_Type_set = set()  # declared but left empty: 'Request Type' is not collected
Supervisor_District_set = set()
for _, row in df.iterrows():
    for report in row['feature-311']:
        # Districts are stored as ints, so the raw value is converted before insertion.
        category, district = report['Category'], int(report['Supervisor District'])
        Category_set.add(category)
        Supervisor_District_set.add(district)
print("done")
In [5]:
# Report how many distinct labels were collected, then the labels themselves.
collected_label_sets = (Category_set, Supervisor_District_set)
for labels in collected_label_sets:
    print(len(labels))
for labels in collected_label_sets:
    print(labels)
In [6]:
# Collect every distinct 'Category' label found in the '911-reports' column.
MAX_ROWS_SCANNED = 200000000000000000  # row cap; the scan stops once ct exceeds it
Category_set_crime = set()
ct = 0
for ct, (_, row) in enumerate(df.iterrows(), 1):
    if ct > MAX_ROWS_SCANNED:
        break
    for report in row['911-reports']:
        Category_set_crime.add(report['Category'])
print("done")
In [7]:
# Display the set of distinct 911 'Category' labels collected in the previous cell.
Category_set_crime
Out[7]:
In [8]:
# Generate a feature vector given a list of dictionaries. Each dictionary represents one 311 report.
# Regularization should also go here
# @param reports_311: a list of dictionaries. Each dictionary is of the form
# {'Category': string, 'Request Details': string, 'Request Type': string, 'Source': string, 'Supervisor District':string,
# 'TimeBin': float, 'XBin':float, 'YBin':float}
# Build lookup tables that assign every label a slot in the generated vectors.
# 311 categories take slots 0..len(Category_set)-1 and supervisor districts follow
# directly after them; `index` ends up as the total number of 311 slots.
# NOTE: slot order follows set iteration order.
category_map = {label: slot for slot, label in enumerate(Category_set)}
supervisor_map = {
    district: slot
    for slot, district in enumerate(Supervisor_District_set, len(category_map))
}
index = len(category_map) + len(supervisor_map)

# 911 categories get their own, separately numbered slots; `index2` is their count.
category_map_crime = {label: slot for slot, label in enumerate(Category_set_crime)}
index2 = len(category_map_crime)
def generate_feature_vector(reports_311):
    """Count the 311 reports of one row per category and per supervisor district.

    @param reports_311: list of dicts, one per 311 report. Only the 'Category'
        and 'Supervisor District' keys are read here.
    @return: list of length index + 1. Slot category_map[c] counts the reports
        with Category c, slot supervisor_map[d] counts the reports in district d,
        and the final slot holds len(reports_311).
    Raises KeyError if a category or district is missing from the maps.
    """
    feature_vector = [0] * index
    for report in reports_311:
        feature_vector[category_map[report["Category"]]] += 1
        # supervisor_map is keyed by int(district), because that is how
        # Supervisor_District_set was filled; normalise the raw value the same
        # way so string-valued districts do not raise KeyError.
        feature_vector[supervisor_map[int(report["Supervisor District"])]] += 1
    # Total number of 311 reports goes in the last slot.
    feature_vector.append(len(reports_311))
    return feature_vector
def generate_output_vector(reports_911):
    """Count the 911 reports of one row per crime category.

    @param reports_911: list of dicts, one per 911 report; only 'Category' is read.
    @return: list of length index2 + 1. Slot category_map_crime[c] counts the
        reports with Category c and the final slot holds len(reports_911).
    """
    counts = [0] * index2
    for entry in reports_911:
        slot = category_map_crime[entry["Category"]]
        counts[slot] += 1
    return counts + [len(reports_911)]
# Signal that the maps and both vector builders are defined.
print("done")
In [9]:
# Show the slot assignment of the 311 category and supervisor-district maps.
for slot_map in (category_map, supervisor_map):
    print(slot_map)
In [10]:
# Turn every row into one combined vector: 311 feature counts followed by 911 counts.
MAX_ROWS_FEATURIZED = 3000000000000000000  # row cap; the loop stops once count exceeds it
count = 0
df_dict = {'vector': []}
vectors = df_dict['vector']
for count, (_, row) in enumerate(df.iterrows(), 1):
    if count > MAX_ROWS_FEATURIZED:
        break
    f_in = generate_feature_vector(row['feature-311'])
    f_out = generate_output_vector(row['911-reports'])
    vectors.append(f_in + f_out)
print("done")
In [11]:
# Show the 311 slot count, the 911 slot count, and the 911 category-to-slot map.
for summary_value in (index, index2, category_map_crime):
    print(summary_value)
In [12]:
# Length of the first combined vector.
first_vector = df_dict['vector'][0]
len(first_vector)
Out[12]:
In [13]:
# Wrap the vectors in a single-column ('vector') DataFrame.
final_df = pd.DataFrame(df_dict)
# Length of the vector stored in the first row. One positional .iloc[row, col]
# lookup avoids integer-key access on the label-indexed row Series.
len(final_df.iloc[0, 0])
Out[13]:
In [14]:
# Use the vectors to create a dense matrix with one row per entry of
# final_df['vector'] (N rows). Each row is the combined vector built earlier:
# the 311 feature counts followed by the 911 counts, whose last element is the
# total number of 911 reports for that row.
import numpy as np

N = len(final_df)
D = final_df['vector']
# Row width comes from the first vector; an empty frame yields a 0 x 0 matrix
# instead of failing on D[0].
row_width = len(D.iloc[0]) if N else 0
mat = np.zeros([N, row_width])
if N:
    # One assignment copies all rows; it raises ValueError if the vectors
    # do not all have the same length.
    mat[:] = D.tolist()
# Print the row width (previously read from the arbitrary row 13, which raised
# IndexError when there were fewer than 14 rows).
print(mat.shape[1])
In [15]:
# Write the matrix to a plain-text file using np.savetxt's default settings.
OUTPUT_PATH = "joined_matrix_split.txt"
np.savetxt(OUTPUT_PATH, mat)
In [16]:
# Sanity check: number of rows, the first row itself, and the row width.
n_rows, n_cols = mat.shape
print(n_rows)
print(mat[0])
print(n_cols)
In [18]:
# Integer copy of the matrix; np.bincount in the cells below is applied to its columns.
intmat = mat.astype(int)
In [19]:
import matplotlib.pyplot as plt
In [54]:
# For every 911 category, print how often each per-row report count occurs.
# The 911 block starts right after the 311 block, which is `index` count slots
# plus one total slot wide, so derive the offset instead of hard-coding 40.
output_offset = index + 1
for k, i in category_map_crime.items():
    print(k + ": " + str(np.bincount(intmat[:, output_offset + i])))
In [57]:
# Histogram of the last matrix column, kept for reference but disabled:
#plt.hist((intmat[:,-1]))
#plt.show()
# Occurrence counts of the values in column 40+37 (= 77).
# NOTE(review): both numbers are hard-coded; presumably 40 is the width of the 311
# block (index + 1) and 37 selects a slot of the 911 block — confirm against
# index / index2 before relying on this column.
np.bincount(intmat[:,40+37])
Out[57]: