In [1]:
%matplotlib inline
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
infile = "small_sample.csv"
bf_outfile = "both_fields.csv"
full_outfile = "full_outfile.csv"
final_outfile = "final_outfile.csv"
last_infile = "final_outfile.csv"

In [3]:
df = pd.read_csv(infile)
df.head()


Out[3]:
Id City Country
0 0 Madrid España
1 1 Bogotá NaN
2 2 Santiago Chile
3 3 Ponta Delgada NaN
4 4 Buenos Aires Argentina

In [4]:
row_map = df.groupby(["City", "Country"])
len(row_map.groups)


Out[4]:
250

In [5]:
both_fields = df.dropna()
both_fields.head()


Out[5]:
Id City Country
0 0 Madrid España
2 2 Santiago Chile
4 4 Buenos Aires Argentina
5 5 Buenos Aires Argentina
7 7 Rio Piedras P.R.

In [6]:
bf_groups = both_fields.groupby(["City", "Country"])
len(bf_groups.groups)


Out[6]:
104

In [7]:
with open(bf_outfile, "w") as f:
    writer = csv.writer(f)
    writer.writerow(["Id", "City", "Country"])
    for i, (city, country) in enumerate(bf_groups.groups.keys()):
        writer.writerow([i, city, country])

In [8]:
!python csv_dedupe.py -v


INFO:root:Generating grammar tables from /usr/lib/python3.4/lib2to3/Grammar.txt
INFO:root:Generating grammar tables from /usr/lib/python3.4/lib2to3/PatternGrammar.txt
input file:  full_outfile.csv
importing data ...
reading from settings/full_learned_settings
INFO:dedupe.api:Learned Weights
INFO:dedupe.api:('(City: String)', -2.2533092198583144)
INFO:dedupe.api:('(Country: String)', -0.06505061016340563)
INFO:dedupe.api:('((City: String): Not Missing)', 1.2548415818237795)
INFO:dedupe.api:('((Country: String): Not Missing)', -0.12959618257781477)
INFO:dedupe.api:('bias', 1.1142048722686542)
INFO:dedupe.api:(CompoundPredicate: (SimplePredicate: (firstTokenPredicate, City), TfidfNGramCanopyPredicate: (0.4, City)), CompoundPredicate: (SimplePredicate: (sortedAcronym, City), TfidfNGramCanopyPredicate: (0.4, City)))
INFO:dedupe.api:{'City': {'TfidfNGramCanopyPredicate': set()}}
blocking...
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.4, City)
INFO:dedupe.api:0 blocks
INFO:dedupe.api:Maximum expected recall and precision
INFO:dedupe.api:recall: 0.984
INFO:dedupe.api:precision: 0.672
INFO:dedupe.api:With threshold: 0.221
clustering...
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.4, City)
INFO:dedupe.api:0 blocks
# duplicate sets 40
finding canonical
writing file...:  full_output.csv

In [8]:
bf_output = pd.read_csv("both_fields_output.csv").fillna("")
bf_output.head()


Out[8]:
Cluster ID confidence_score Id City Country canonical_City canonical_Id canonical_Country
0 20 0 Veracruz México
1 1 0.925403 1 Managua Nicaragua managua 1 nicaragua
2 21 2 Puebla Méx
3 2 0.9615852 3 Barcelona España barcelona 3 espana
4 22 4 Jiquilpan México

In [9]:
# bf_output["confidence_score"].mean()

In [10]:
bf_output_groups = bf_output.groupby(["canonical_Country", "canonical_City"])
bf_output_groups.groups.keys()


Out[10]:
dict_keys([('etc', 'san salvador'), ('republica dominicana', 'santiago'), ('espana', 'barcelona'), ('colombia', 'bogota'), ('republica dominicana', 'santo domingo'), ('d.f', 'mexico'), ('venezuela', 'merida'), ('guatemala', 'guatemala'), ('nicaragua', 'managua'), ('estado de mexico', 'toluca'), ('brasil', 'sao paulo'), ('barcelona', 'esplugues de llobregat'), ('', ''), ('rep. de p', 'panama'), ('peru', 'lima'), ('brasil', 'rio de janeiro'), ('espana', 'madrid'), ('brazil', 'brasilia'), ('p.r.', 'rio piedras'), ('costa rica', 'san jose'), ('rep. oriental del uruguay', 'montevideo')])

In [11]:
new_df = df.fillna("")
row_map = new_df.groupby(["City", "Country"])
len(row_map.groups)


Out[11]:
250

In [12]:
def map_merge(row_map, canonicals):
    for row in canonicals.iterrows():
        row = row[1]
        can_key = (row["canonical_City"], row["canonical_Country"])
        if can_key[0] or can_key[1]:
#             print(can_key[0], can_key[1])
            key = (row["City"], row["Country"])
            row_ids = row_map[key]
            del row_map[key]
            row_map.setdefault(can_key, [])
            row_map[can_key] += row_ids
    return row_map

In [13]:
new_map = map_merge(row_map.groups, bf_output)

In [14]:
len(new_map.keys())


Out[14]:
224

Again


In [15]:
with open(full_outfile, "w") as f:
    writer = csv.writer(f)
    writer.writerow(["Id", "City", "Country"])
    for i, (city, country) in enumerate(new_map.keys()):
        writer.writerow([i, city, country])

In [16]:
full_output = pd.read_csv("full_output.csv").fillna("")

In [17]:
full_groups = full_output.groupby(["canonical_Country", "canonical_City"])
len(full_groups)


Out[17]:
41

In [18]:
newer_map = map_merge(new_map, full_output)

In [19]:
len(newer_map)


Out[19]:
173

In [20]:
with open(final_outfile, "w") as f:
    writer = csv.writer(f)
    writer.writerow(["Id", "City", "Country", "Slug"]) # open refine slug
    for i, (city, country) in enumerate(newer_map.keys()):
        writer.writerow([i, city, country, ",".join([city, country])])

In [21]:
last_df = pd.read_csv(last_infile).fillna("")

In [22]:
def slugify_map(row_map, canonicals):
    for row in canonicals.iterrows():
        row = row[1]
        slug = row["Slug"]
        key = (row["City"], row["Country"])
        row_ids = row_map[key]
        del row_map[key]
        row_map.setdefault(slug, [])
        row_map[slug] += row_ids
    return row_map

In [23]:
last_map = slugify_map(newer_map, last_df)

In [31]:
df["slug"] = ""
df["canonical_city"] = ""
df["canonical_country"] = ""
df["canonical_country"][0]


Out[31]:
''

In [37]:
def assign_slugs(df, slugs):
    for k, v in slugs.items():
        for i in v:
            df["slug"].iat[i] = k
            city, country = k.split(",")
            df["canonical_city"].iat[i] = city
            df["canonical_country"].iat[i] = country
    return df

In [38]:
slug_df = assign_slugs(df, last_map)

In [39]:
slug_df


Out[39]:
Id City Country slug canonical_city canonical_country
0 0 Madrid España madrid,espana madrid espana
1 1 Bogotá NaN bogota,colombia bogota colombia
2 2 Santiago Chile santiago,republica dominicana santiago republica dominicana
3 3 Ponta Delgada NaN Ponta Delgada, Ponta Delgada
4 4 Buenos Aires Argentina Buenos Aires, Argentina Buenos Aires Argentina
5 5 Buenos Aires Argentina Buenos Aires, Argentina Buenos Aires Argentina
6 6 Leon NaN Leon, Leon
7 7 Rio Piedras P.R. rio piedras,p.r. rio piedras p.r.
8 8 Rio de Janeiro NaN rio de janeiro,rj rio de janeiro rj
9 9 New York NY New York, NY New York NY
10 10 Buenos Aires Argentina Buenos Aires, Argentina Buenos Aires Argentina
11 11 Madrid España madrid,espana madrid espana
12 12 Barcelona España barcelona,espana barcelona espana
13 13 Curitiba NaN Curitiba, Curitiba
14 14 Tegucigalpa NaN tegucigalpa,hondura tegucigalpa hondura
15 15 Guatemala Guatemala gautemala,guatemala gautemala guatemala
16 16 Rio de Janeiro NaN rio de janeiro,rj rio de janeiro rj
17 17 Madrid España madrid,espana madrid espana
18 18 México NaN mexico,d.f mexico d.f
19 19 Xalapa México Xalapa, México Xalapa México
20 20 La Habana NaN la habana,cuba la habana cuba
21 21 San Juan de Puerto Rico NaN san juan,p.r san juan p.r
22 22 Lima NaN lima,peru lima peru
23 23 Madrid España madrid,espana madrid espana
24 24 Madrid España madrid,espana madrid espana
25 25 Rio de Janeiro NaN rio de janeiro,rj rio de janeiro rj
26 26 Madrid España madrid,espana madrid espana
27 27 New York NY New York, NY New York NY
28 28 México D.F mexico,d.f mexico d.f
29 29 Madrid España madrid,espana madrid espana
... ... ... ... ... ... ...
969 969 Brasília NaN brasilia,brazil brasilia brazil
970 970 México NaN mexico,d.f mexico d.f
971 971 Eden Prairie MN Eden Prairie, MN Eden Prairie MN
972 972 Barcelona España barcelona,espana barcelona espana
973 973 Mexico D.F mexico,d.f mexico d.f
974 974 Santo Domingo República Dominicana santo domingo,republica dominicana santo domingo republica dominicana
975 975 La Habana Cuba la habana,cuba la habana cuba
976 976 Porto NaN porto alegre,rio grande do sul porto alegre rio grande do sul
977 977 Lima NaN lima,peru lima peru
978 978 Rio de Janeiro NaN rio de janeiro,rj rio de janeiro rj
979 979 Toluca Mex toluca,estado de mexico toluca estado de mexico
980 980 Bogotá NaN bogota,colombia bogota colombia
981 981 Buenos Aires Argentina Buenos Aires, Argentina Buenos Aires Argentina
982 982 Santo Domingo Rep. Dom santo domingo,republica dominicana santo domingo republica dominicana
983 983 Buenos Aires Argentina Buenos Aires, Argentina Buenos Aires Argentina
984 984 México NaN mexico,d.f mexico d.f
985 985 La Paz Bolivia la paz,bolivia la paz bolivia
986 986 Recife NaN Recife, Recife
987 987 México NaN mexico,d.f mexico d.f
988 988 Buenos Aires Argentina Buenos Aires, Argentina Buenos Aires Argentina
989 989 Buenos Aires Argentina Buenos Aires, Argentina Buenos Aires Argentina
990 990 Lisboa Portugal lisbon,portugal lisbon portugal
991 991 São Paulo Brasil sao paulo,brasil sao paulo brasil
992 992 Madrid España madrid,espana madrid espana
993 993 México NaN mexico,d.f mexico d.f
994 994 São Paulo Brasil sao paulo,brasil sao paulo brasil
995 995 São Paulo Brasil sao paulo,brasil sao paulo brasil
996 996 Rio de Janeiro NaN rio de janeiro,rj rio de janeiro rj
997 997 São Paulo Brasil sao paulo,brasil sao paulo brasil
998 998 São Paulo Brasil sao paulo,brasil sao paulo brasil

999 rows × 6 columns


In [ ]: