In [1]:
%matplotlib inline
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
infile = "small_sample.csv"          # raw input: City/Country pairs, possibly with blanks
bf_outfile = "both_fields.csv"       # distinct pairs where both fields are present
full_outfile = "full_outfile.csv"    # merged keys after the first dedupe pass
final_outfile = "final_outfile.csv"  # final canonical keys plus OpenRefine-style slug
last_infile = "final_outfile.csv"    # read back in for the slug-assignment step
In [3]:
# Load the raw sample; City and/or Country may be missing on some rows.
df = pd.read_csv(infile)
df.head()
Out[3]:
In [4]:
# Count distinct raw (City, Country) spellings before any cleaning.
row_map = df.groupby(["City", "Country"])
len(row_map.groups)
Out[4]:
In [5]:
# Keep only rows where both City and Country are present.
both_fields = df.dropna()
both_fields.head()
Out[5]:
In [6]:
# Distinct (City, Country) pairs among the fully-populated rows.
bf_groups = both_fields.groupby(["City", "Country"])
len(bf_groups.groups)
Out[6]:
In [7]:
# Export one row per distinct (City, Country) pair for the dedupe tool.
# newline="" is required so csv.writer controls line endings itself;
# without it, Windows gets a blank row between every record.
with open(bf_outfile, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Id", "City", "Country"])
    for i, (city, country) in enumerate(bf_groups.groups.keys()):
        writer.writerow([i, city, country])
In [8]:
!python csv_dedupe.py -v
In [8]:
# NOTE(review): "both_fields_output.csv" is presumably written by the
# csv_dedupe.py run above from both_fields.csv — confirm against the tool.
# Blanks become "" so canonical keys compare cleanly later.
bf_output = pd.read_csv("both_fields_output.csv").fillna("")
bf_output.head()
Out[8]:
In [9]:
# bf_output["confidence_score"].mean()
In [10]:
# Canonical (Country, City) pairs proposed by the dedupe run.
bf_output_groups = bf_output.groupby(["canonical_Country", "canonical_City"])
bf_output_groups.groups.keys()
Out[10]:
In [11]:
# Rebuild the full grouping with blanks kept as "" so its keys match the
# dedupe output's representation of missing values.
new_df = df.fillna("")
row_map = new_df.groupby(["City", "Country"])
len(row_map.groups)
Out[11]:
In [12]:
def map_merge(row_map, canonicals):
    """Collapse duplicate groups into their canonical key.

    For each row of ``canonicals`` that carries a non-empty canonical City
    or Country, the row ids filed under the original (City, Country) key of
    ``row_map`` are moved under the (canonical_City, canonical_Country) key.

    Parameters
    ----------
    row_map : dict-like mapping (City, Country) -> sequence of row ids
        (e.g. a groupby ``.groups`` dict). Mutated in place.
    canonicals : DataFrame with columns City, Country, canonical_City and
        canonical_Country (empty strings mean "no canonical form proposed").

    Returns
    -------
    The same ``row_map`` object, for convenience.

    Raises
    ------
    KeyError
        If a (City, Country) key from ``canonicals`` is absent from
        ``row_map`` (e.g. it was already merged by an earlier row).
    """
    for _, row in canonicals.iterrows():
        can_key = (row["canonical_City"], row["canonical_Country"])
        # An all-empty canonical pair means "leave this group alone".
        if not (can_key[0] or can_key[1]):
            continue
        key = (row["City"], row["Country"])
        moved = row_map.pop(key)
        # Normalize to a plain list before concatenating: values coming from
        # a groupby .groups dict are pandas Index objects, and
        # ``index += ids`` would do element-wise addition, not append.
        row_map[can_key] = list(row_map.get(can_key, [])) + list(moved)
    return row_map
In [13]:
# Fold the dedupe suggestions into the full key -> row-ids mapping
# (mutates the .groups dict in place and returns it).
new_map = map_merge(row_map.groups, bf_output)
In [14]:
len(new_map.keys())
Out[14]:
Repeat the dedupe pass: export the merged keys and run the tool on them again.
In [15]:
# Export the merged keys so the dedupe tool can take a second pass.
# newline="" lets csv.writer manage line endings (avoids blank rows on
# Windows, as required by the csv module docs).
with open(full_outfile, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Id", "City", "Country"])
    for i, (city, country) in enumerate(new_map.keys()):
        writer.writerow([i, city, country])
In [16]:
# NOTE(review): reads "full_output.csv", presumably produced by another
# csv_dedupe.py run on full_outfile.csv — that invocation is not shown
# in this notebook; confirm it was executed.
full_output = pd.read_csv("full_output.csv").fillna("")
In [17]:
# Number of canonical groups proposed by the second dedupe pass.
full_groups = full_output.groupby(["canonical_Country", "canonical_City"])
len(full_groups)
Out[17]:
In [18]:
# Apply the second round of canonical mappings on top of the first.
newer_map = map_merge(new_map, full_output)
In [19]:
len(newer_map)
Out[19]:
In [20]:
# Final export: canonical keys plus an OpenRefine-style slug ("City,Country").
# newline="" so csv.writer controls line endings (no blank rows on Windows).
with open(final_outfile, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Id", "City", "Country", "Slug"])  # open refine slug
    for i, (city, country) in enumerate(newer_map.keys()):
        # NOTE(review): a comma inside a city name makes this slug ambiguous
        # when split later; assign_slugs rsplits on the last comma to cope.
        writer.writerow([i, city, country, ",".join([city, country])])
In [21]:
# Read the final canonical file back; blanks become "" so keys compare cleanly.
last_df = pd.read_csv(last_infile).fillna("")
In [22]:
def slugify_map(row_map, canonicals):
    """Re-key ``row_map`` from (City, Country) tuples to slug strings.

    Each row of ``canonicals`` names an old (City, Country) key and its
    ``Slug``; the row ids stored under the old key are moved under the slug,
    concatenating when several keys share one slug. Mutates ``row_map`` in
    place and returns it.
    """
    for _, entry in canonicals.iterrows():
        slug = entry["Slug"]
        old_key = (entry["City"], entry["Country"])
        ids = row_map.pop(old_key)
        row_map.setdefault(slug, [])
        row_map[slug] += ids
    return row_map
In [23]:
last_map = slugify_map(newer_map, last_df)
In [31]:
# Add empty columns to receive the canonical assignments.
df["slug"] = ""
df["canonical_city"] = ""
df["canonical_country"] = ""
# Spot-check the first value (chained indexing is fine for a read).
df["canonical_country"][0]
Out[31]:
In [37]:
def assign_slugs(df, slugs):
    """Write each group's slug and canonical city/country onto its rows.

    Parameters
    ----------
    df : DataFrame with "slug", "canonical_city" and "canonical_country"
        columns. Row ids in ``slugs`` are used as index labels, which
        matches positions for the default RangeIndex this notebook uses.
        Mutated in place.
    slugs : dict mapping "City,Country" slug -> iterable of row ids.

    Returns
    -------
    The same ``df``, for convenience.
    """
    for slug, row_ids in slugs.items():
        # Hoisted out of the inner loop. rsplit on the LAST comma tolerates
        # commas inside the city part (the country is assumed comma-free).
        city, country = slug.rsplit(",", 1)
        for i in row_ids:
            # Use .at instead of chained df["col"].iat[i] = ...: chained
            # assignment writes through an intermediate Series, which
            # triggers SettingWithCopyWarning and silently fails to update
            # df under pandas copy-on-write.
            df.at[i, "slug"] = slug
            df.at[i, "canonical_city"] = city
            df.at[i, "canonical_country"] = country
    return df
In [38]:
slug_df = assign_slugs(df, last_map)
In [39]:
slug_df
Out[39]:
In [ ]: