In [ ]:
%matplotlib inline
import pandas
import matplotlib.pyplot
In [ ]:
truth_df = pandas.DataFrame.from_csv("raw_data/truth.csv", index_col=None)
# Generate "Dates" from "Date" and "Time"
truth_df["Dates"] = pandas.to_datetime(truth_df["Date"] + " " + truth_df["Time"])
# Remove commas from Category field, because they are removed from the train and submission sets
truth_df["Category"] = truth_df["Category"].apply(lambda x: x.replace(",", ""))
train_df = pandas.DataFrame.from_csv("raw_data/train.csv", index_col=None)
train_df["Dates"] = pandas.to_datetime(train_df["Dates"])
test_df = pandas.DataFrame.from_csv("raw_data/test.csv", index_col=None)
test_df["Dates"] = pandas.to_datetime(test_df["Dates"])
In [ ]:
# These are the columns from the training dataset:
# Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
# Match with training set to make sure everything is in order.
# Watch out for X and Y, which have precision differences between truth and train/test sets.
fig, ax = matplotlib.pyplot.subplots(1,3)
ax[0].scatter(train_df["X"], train_df["Y"])
ax[1].scatter(truth_df["X"], truth_df["Y"])
ax[2].scatter(test_df["X"], test_df["Y"])
In [ ]:
# Those entries at the North Pole are missing data.
# They do appear in the test set, which kills me.
# Here are what their addresses look like
invalid_train_points = (train_df["Y"] == 90)
invalid_truth_points = (truth_df["Y"] == 90)
invalid_test_points = (test_df["Y"] == 90)
pandas.concat([train_df.loc[invalid_train_points, "Address"],
truth_df.loc[invalid_truth_points, "Address"],
test_df.loc[invalid_test_points, "Address"]]).unique()
In [ ]:
# It turns out that most of these problems are because the city's grid system
# is busted up by Market St, and the streets on the west side of Market have
# a different name from the connecting streets on the east side.
#
# And some times, it's a similar problem, only on the grid system, where one
# road suddenly changes names.
#
# And some times, the streets are input wrong, so they never intersect.
#
# These solutions come from looking up the intersections in Google manually.
# Sometimes I had to guess as to what the real streats were that they were talking about.
bad_addresses = ['7THSTNORTH ST / MCALLISTER ST',
'GEARY BL / AVENUE OF THE PALMS',
'EDDY ST / 5THSTNORTH ST',
'MCALLISTER ST / 7THSTNORTH ST',
'FRONT ST / THE EMBARCADERONORTH ST',
'OFARRELL ST / 5THSTNORTH ST',
'5TH ST / 5THSTNORTH ST',
'STJOSEPHS AV / TERRAVISTA AV',
'AUSTIN ST / LARKIN ST',
'ELLIS ST / 5THSTNORTH ST',
'5THSTNORTH ST / EDDY ST',
'JENNINGS CT / PHELPS ST',
'INTERSTATE280 HY / GENEVA AV',
'ELLICK LN / CALIFORNIA ST',
'BUSH ST / STGEORGE AL',
'INTERSTATE280 HY / OCEAN AV',
'SPEAR ST / THE EMBARCADEROSOUTH ST',
'BRANNAN ST / 1ST ST',
'ARGUELLO BL / NORTHRIDGE DR',
'17TH ST / COLLINGWOOD ST',
'JEWETT ST / 5TH ST',
'JAMES LICK FREEWAY HY / CESAR CHAVEZ ST',
'I-280 / CESAR CHAVEZ ST',
'JAMES LICK FREEWAY HY / 4TH ST',
'JAMES LICK FREEWAY HY / SILVER AV',
'1ST ST / BRYANT ST',
'LARKIN ST / AUSTIN ST',
'JOHN F KENNEDY DR / CROSSOVER DR',
'EUCLID AV / AVENUE OF THE PALMS',
'GILMAN AV / FITCH ST',
'1ST ST / BRANNAN ST',
'JAMES LICK FREEWAY HY / BAY SHORE BL',
'SPEAR ST / THE EMBARCADERO SOUTH ST',
'LOWER GREAT HY / MARTIN LUTHER KING JR DR',
'5THSTNORTH ST / OFARRELL ST',
'JAMESLICKFREEWAY HY / SILVER AV',
'5THSTNORTH ST / ELLIS ST',
'YOSEMITE AV / WILLIAMS AV',
'BRENHAM PL / WASHINGTON ST',
'AVENUE OF THE PALMS / GEARY BL',
'STCHARLES AV / 19TH AV',
'TURK ST / STJOSEPHS AV',
'MONTGOMERY ST / THE EMBARCADERONORTH ST',
'FITCH ST / DONNER AV',
'AVENUE OF THE PALMS / EUCLID AV',
'VANNESS AV / BEACH ST',
'PERSIA AV / LAGRANDE AV',
'3RD ST / ISLAISCREEK ST',
'JENNINGS CT / INGALLS ST',
'GENEVA AV / INTERSTATE280 HY',
'CHARLES J BRENHAM PL / CLAY ST',
'STELMO WY / MONTEREY BL',
'I-280 / PENNSYLVANIA AV',
'FLORIDA ST / ALAMEDA ST',
'BRYANT ST / SPEAR ST',
'PERSIA AV / LA GRANDE AV',
'GREENWICH ST / THE EMBARCADERO NORTH ST',
'JOHN F KENNEDY DR / MARTIN LUTHER KING JR DR',
'3RD ST / JAMES LICK FREEWAY HY'
]
solution_latlon = [(37.781002, -122.413187), # 7th -> Charles J. Brenham Pl
(37.781391, -122.457743), # Avenue of the Palms -> Palm Ave (Avenue of the Palms is on Treasure Island)
(37.784466, -122.408574), # 5th -> Cyril Magnin St
(37.781002, -122.413187), # 7th -> Charles J. Brenham Pl
(37.801544, -122.400441), # The Embarcadero -> Union St (Front St ends before intersecting TE)
(37.786336, -122.408949), # 5th -> Cyril Magnin St
(37.784010, -122.408097), # Best guess: Market St?
(37.781236, -122.441349), # Terra Vista Ave -> Ellis St
(37.789378, -122.418782), # Austin St -> Frank Norris St
(37.785413, -122.408753), # 5th -> Cyril Magnin St
(37.784466, -122.408574), # 5th -> Cyril Magnin St
(37.717177, -122.397460), # Jennings Ct is nowhere near Phelps Ave, and Jennings St is parallel to Phelps?
(37.721288, -122.448127), # No changes
(37.793165, -122.400530), # Ellick Ln doesn't exist? This is in the middle of California St by Halleck St
(37.790727, -122.404352), # No changes
(37.722981, -122.447773), # No changes
(37.787690, -122.387976), # Spear St is a pedestrian walkway at the intersection
(37.783652, -122.389865), # 1st -> Delancey St
(37.773227, -122.459070), # No idea. Northridge Dr doesn't exist. I put this in the park?
(37.762138, -122.436218), # 17th -> Market St (Collingwood ends at Market)
(37.784005, -122.408090), # No idea. Jewett is not actually a street. I put this at Market?
(37.749387, -122.403732), # No changes
(37.750103, -122.391998), # No changes
(37.780262, -122.398896), # No changes
(37.732664, -122.405247), # No changes
(37.784950, -122.391289), # Approximate (1st ends before intersecting Bryant)
(37.789378, -122.418782), # Austin St -> Frank Norris St
(37.770716, -122.479205), # No changes
(37.783870, -122.457922), # Avenue of the Palms -> Palm Ave (Avenue of the Palms is on Treasure Island)
(37.717177, -122.386260), # No changes
(37.783652, -122.389865), # 1st -> Delancey St
(37.715933, -122.398576), # No changes
(37.787690, -122.387976), # Spear St is a pedestrian walkway at the intersection
(37.764097, -122.509411), # Great Hwy -> La Playa St
(37.786336, -122.408949), # 5th -> Cyril Magnin St
(37.732664, -122.405247), # No changes
(37.785413, -122.408753), # 5th -> Cyril Magnin St
(37.729587, -122.394912), # Yosemite -> Mendell
(37.793917, -122.416326), # No idea (Brenham does not intersect Washington. Brenham -> Leavenworth?)
(37.781391, -122.457743), # Avenue of the Palms -> Palm Ave (Avenue of the Palms is on Treasure Island)
(37.713785, -122.469331), # St. Chalres becomes 19th
(37.779396, -122.440936), # No changes
(37.806569, -122.405870), # Montgomery becomes a pedestrian walkway at the intersection
(37.719108, -122.384525), # Approximate (Donner ends before Fitch)
(37.783870, -122.457922), # Avenue of the Palms -> Palm Ave (Avenue of the Palms is on Treasure Island)
(37.806036, -122.425514), # Beech ends before Van Ness
(37.718562, -122.426440), # LaGrande -> Dublin St
(37.747707, -122.387248), # Best guess for Islias Creek St?
(37.734113, -122.379523), # Jennings -> Middlepoint Rd (unclear)?
(37.721288, -122.448127), # No changes
(37.793043, -122.416116), # Brenham -> Leavenworth?
(37.730460, -122.460551), # No changes
(37.757690, -122.392347), # No changes
(37.768121, -122.411724), # Approximate (Alameda ends before Florida)
(37.787690, -122.387976), # Spear St is a pedestrian walkway at the intersection
(37.718562, -122.426440), # LaGrande -> Dublin St
(37.803124, -122.401270), # In a shopping mall
(37.766686, -122.506270), # Martin Luther King Jr -> Bernice Rodgers Way
(37.781978, -122.396687) # Very tricky
]
for address, latlon in zip(bad_addresses, solution_latlon):
train_df.loc[invalid_train_points & (train_df["Address"] == address), ["Y", "X"]] = latlon
truth_df.loc[invalid_truth_points & (truth_df["Address"] == address), ["Y", "X"]] = latlon
test_df.loc[invalid_test_points & (test_df["Address"] == address), ["Y", "X"]] = latlon
In [ ]:
fig, ax = matplotlib.pyplot.subplots(1,3)
ax[0].scatter(train_df["X"], train_df["Y"])
ax[1].scatter(truth_df["X"], truth_df["Y"])
ax[2].scatter(test_df["X"], test_df["Y"])
In [ ]:
# Before matching, round X and Y to 6 decimal places to avoid FP errors.
# Note: 6 decimal places lon and lat at this latitude corresponds to ~ 14 cm
for df in [train_df, truth_df, test_df]:
df[["X", "Y"]] = df[["X", "Y"]].apply(lambda x: x.round(6), axis="index")
In [ ]:
# There might be differences in address and how the crime was resolved, but not in X or Y anymore
merged_train_df = train_df.merge(truth_df,
on=[col for col in train_df.columns.tolist() if col not in ["Address", "Resolution"]],
how="left")
# 145253 simply is not in the truth set. Unknown why.
merged_train_df[merged_train_df["PdId"].isnull()]
In [ ]:
# Match with the test set to label them with the truth
merged_test_df = test_df.merge(truth_df,
on=[col for col in test_df.columns.tolist() if col not in ["Address", "Id"]],
how="left",
)
merged_test_df[merged_test_df["PdId"].isnull()]
In [ ]:
# Anyway, save cleaned data to use for future training and testing
train_df.to_csv("processed_data/train_clean.csv", index=None)
test_df.to_csv("processed_data/test_clean.csv", index="Id")
truth_df.to_csv("processed_data/truth_clean.csv", index=None, columns=train_df.columns)
In [ ]:
# Finally, produce an (almost) correct key for the test set
# Note: the ones I can't figure out I am leaving completely blank.
key = pandas.get_dummies(merged_test_df[["Id", "Category"]], columns=["Category"], prefix=[""], prefix_sep="")
key = key.groupby("Id").mean()
key.head(10)
In [ ]:
key.to_csv("processed_data/key.csv", index="Id")