In [9]:
import pandas as pd
data_311 = pd.read_csv('df_311_ready_to_join.csv', na_values=['-'])
data_311 = data_311.where((pd.notnull(data_311)), None)
data_311.head(15)


Out[9]:
feature-311 tuple
0 [0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ... (6274149.810962193, -122.42479733333332, 37.77...
1 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, ... (68448950.51830366, -122.45177022222221, 37.76...
2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... (180695514.55571115, -122.50571599999999, 37.7...
3 [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ... (198060742.09701943, -122.50571599999999, 37.7...
4 [0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ... (166811428.19983998, -122.42479733333332, 37.7...
5 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... (125240125.90398079, -122.49222955555555, 37.7...
6 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... (164827987.29185838, -122.41131088888888, 37.7...
7 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... (101115007.92118424, -122.42479733333332, 37.7...
8 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... (80592466.28145629, -122.43828377777778, 37.78...
9 [0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, ... (168268650.0914183, -122.43828377777778, 37.77...
10 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ... (16110397.579115825, -122.43828377777778, 37.7...
11 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... (194822471.22684535, -122.42479733333332, 37.7...
12 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... (166163774.02580518, -122.41131088888888, 37.7...
13 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ... (135157330.44388878, -122.46525666666668, 37.7...
14 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... (115444356.52170436, -122.39782444444445, 37.7...

In [20]:
len(eval(data_311.iloc[0][0]))


Out[20]:
39

In [10]:
data_911 = pd.read_csv('data_911_ready_to_join.csv', na_values=['-'])
data_911 = data_911.where((pd.notnull(data_911)), None)
data_911.head(15)


Out[10]:
feature-911 tuple
0 5 (68448950.51830366, -122.45177022222221, 37.76...
1 1 (89821538.2614523, -122.47874311111113, 37.710...
2 2 (198060742.09701943, -122.50571599999999, 37.7...
3 8 (166811428.19983998, -122.42479733333332, 37.7...
4 1 (125240125.90398079, -122.49222955555555, 37.7...
5 4 (164827987.29185838, -122.41131088888888, 37.7...
6 1 (69987129.18163633, -122.46525666666668, 37.73...
7 1 (34447106.3814763, -122.42479733333332, 37.710...
8 2 (16110397.579115825, -122.43828377777778, 37.7...
9 3 (194822471.22684535, -122.42479733333332, 37.7...
10 2 (100507832.1330266, -122.43828377777778, 37.78...
11 4 (135157330.44388878, -122.46525666666668, 37.7...
12 1 (52176639.39567914, -122.47874311111113, 37.76...
13 1 (115444356.52170436, -122.39782444444445, 37.7...
14 3 (77718500.88417684, -122.47874311111113, 37.77...

In [24]:
print data_311.head(2)
print len(data_311)
print data_911.head(2)
print len(data_911)


                                         feature-311  \
0  [0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...   
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, ...   

                                               tuple  
0  (6274149.810962193, -122.42479733333332, 37.77...  
1  (68448950.51830366, -122.45177022222221, 37.76...  
249147
   feature-911                                              tuple
0            5  (68448950.51830366, -122.45177022222221, 37.76...
1            1  (89821538.2614523, -122.47874311111113, 37.710...
216286

In [11]:
joined_df = data_311.merge(data_911, how='left', on='tuple')
joined_df = joined_df.where((pd.notnull(joined_df)), 0)
print "done joining"


done joining

In [25]:
joined_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 249147 entries, 0 to 249146
Data columns (total 3 columns):
feature-311    249147 non-null object
tuple          249147 non-null object
feature-911    249147 non-null float64
dtypes: float64(1), object(2)

In [31]:
joined_df.head(15)
print (eval(joined_df.iloc[0][0]))


[0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0]

In [14]:
joined_df.to_csv("joined_data.csv", index_label=False)

In [15]:
# use the joined table to create a matrix.  
# The matrix will be Nx(D+1), where N is the number of unique tuples, and D is the number of features in 
# our feature vector.  The last column of the matrix corresponds to the feature-911, i.e. the number
# of 911 reports in the given location/time.
import numpy as np
N = len(joined_df)
D = len(eval(joined_df['feature-311'][0]))
mat = np.zeros([N, D+1])
for i in xrange(N):
    #print i
    mat[i,] = eval(joined_df['feature-311'][i]) + [joined_df['feature-911'][i]]

In [22]:
print len(mat)
print mat[0]


249147
[ 0.  0.  0.  0.  2.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  2.  2.  0.  0.
  0.  0.  0.  0.]

In [23]:
np.savetxt("joined_matrix.txt", mat)