In [1]:
%matplotlib inline
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# These are the split location fields; each row corresponds by position to a row in the original dataset
locs = pd.read_csv("data/just_split_locs.csv")
len(locs)


Out[2]:
13188245

In [3]:
locs.head()


Out[3]:
   Id           City Division1 Division2
0   0         madrid       NaN       NaN
1   1         bogota       NaN       NaN
2   2       santiago     chile       NaN
3   3  ponta delgada       NaN       NaN
4   4   buenos aires       NaN       NaN

In [4]:
# Map each (City, Division1, Division2) combination to the row indices where it occurs
row_map = locs.groupby(["City", "Division1", "Division2"]).groups

In [5]:
row_map[('torello', np.nan, 'barcelona')]


Out[5]:
[5465924, 10780540]

In [6]:
# Convert the tuple keys of the row mapping to comma-separated slug strings
def map_to_slugs(row_map):
    new_map = {}
    for k, v in row_map.items():
        c, d1, d2 = k
        if pd.isnull(d1):
            d1 = ""
        if pd.isnull(d2):
            d2 = ""
        slug = '{},{},{}'.format(c, d1, d2)
        slug = slug.strip(',')  # drop trailing commas left by empty divisions
        new_map[slug] = v
    return new_map

In [7]:
slug_row_map = map_to_slugs(row_map)

In [8]:
list(slug_row_map.items())[0]


Out[8]:
('brasilia,df,2000', [8148542, 10253684])

In [9]:
# Sort the location-key combinations by the number of rows they cover, most common first
import operator
d = {k: len(v) for k, v in row_map.items()}
s = sorted(d.items(), key=operator.itemgetter(1), reverse=True)

In [10]:
s[0]


Out[10]:
(('madrid', nan, nan), 2114115)

In [11]:
# Find keys with more than one row
short = [x for x in s if x[1] > 1]

In [12]:
len(short)


Out[12]:
50179

In [13]:
(len(s) - len(short)) / 13000000.0
# ~0.6% of rows contain a location combination that occurs only once


Out[13]:
0.006198615384615385

DO NOT RUN THIS CODE: the cells below regenerate files that were subsequently cleaned by hand


In [64]:
# This function creates slugs by combining the 3 loc keys and writes the groups to file
def write_csv(outfile, groups):
    with open(outfile, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Id", "City", "Division1", "Division2", "Slug"])
        for i, tup in enumerate(groups):
            city, div1, div2 = tup[0]
            if pd.isnull(div1):
                div1 = ""
            if pd.isnull(div2):
                div2 = ""
            slug = "{},{},{}".format(city, div1, div2)
            writer.writerow([i, city, div1, div2, slug.strip(",")])

In [65]:
# write_csv("data/top_locs.csv", short)

In [67]:
# This file will be hand-cleaned in OpenRefine
top_locs_df = pd.read_csv("data/top_locs.csv")
top_locs_df.head()


Out[67]:
   Id          City Division1 Division2          Slug
0   0        madrid       NaN       NaN        madrid
1   1     barcelona       NaN       NaN     barcelona
2   2           NaN       NaN       NaN           NaN
3   3        mexico       NaN       NaN        mexico
4   4  buenos aires       NaN       NaN  buenos aires

In [12]:
# The first cleaning produces a file in which applicable "city" values
# have been rewritten as "city (country)"
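
For example (a hypothetical value, just to illustrate the format the parser below expects):

In [ ]:
# a "city (country)" entry splits into a city plus a new division value
city = 'santiago (chile)'
parts = city.split('(')
parts[0].strip(), parts[1].strip(')')  # -> ('santiago', 'chile')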

In [2]:
# Inspect the header of the hand-cleaned file
first_cleaning = pd.read_csv('data/top_locs_first_cleaning_oct_8_2016.csv')

In [3]:
first_cleaning.columns


Out[3]:
Index(['Id', 'City', 'Division1', 'Division2', 'Slug'], dtype='object')

In [13]:
# Parse each row, pushing "(country)" out of the city field and into
# the first empty division field (or appending it to the last one)
def parse_first_cleaning(infile, outfile):
    outrows = []
    with open(infile, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)
        for row in reader:
            id_, city, div1, div2, slug = row
            parts = city.split("(")
            if len(parts) == 1:
                city = parts[0]
            elif len(parts) == 2:
                # split off "(country)" before overwriting the city value
                city = parts[0].strip()
                new_div = parts[1].strip(')')
                if not div1:
                    div1 = new_div
                elif not div2:
                    div2 = new_div
                else:
                    div2 = div2 + new_div
            else:
                print(parts)
                raise Exception("this shouldn't happen")
            outrows.append([id_, city, div1, div2, slug])
    with open(outfile, 'w', newline='') as new_f:
        writer = csv.writer(new_f)
        writer.writerow(header)
        writer.writerows(outrows)

In [14]:
# parse_first_cleaning("data/top_locs_first_cleaning_oct_8_2016.csv", "data/top_locs_second_cleaning_oct_8_2016.csv")

In [4]:
# This creates new slugs from the cleaned loc fields;
# these slugs will then be cleaned and become the canonical placename source
def parse_second_cleaning(infile, outfile):
    outrows = []
    with open(infile, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)
        header.append('new_slug')
        for row in reader:
            id_, city, div1, div2, slug = row
            new_slug = '{},{},{}'.format(city, div1, div2)
            new_slug = new_slug.strip(',')
            outrows.append([id_, city, div1, div2, slug, new_slug])
    with open(outfile, 'w', newline='') as new_f:
        writer = csv.writer(new_f)
        writer.writerow(header)
        writer.writerows(outrows)

In [5]:
# parse_second_cleaning('data/top_locs_second_cleaning_oct_8_2016.csv', 'data/top_locs_third_cleaning_oct_8_2016.csv')

In [3]:
# Create a final slug using just the city and country names:
# split the slug and take its first and last items.
# The final slug will then be cleaned and split to produce
# the canonical city and country.
def parse_third_cleaning(infile, outfile):
    outrows = []
    with open(infile, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)
        header.append('final_slug')
        for row in reader:
            id_, city, div1, div2, slug, new_slug = row
            split_slug = slug.split(',')
            if len(split_slug) > 1:
                city = split_slug[0]
                country = split_slug[-1]
                final_slug = '{},{}'.format(city, country)
            else:
                final_slug = split_slug[0]
            outrows.append([id_, city, div1, div2, slug, new_slug, final_slug])
    with open(outfile, 'w', newline='') as new_f:
        writer = csv.writer(new_f)
        writer.writerow(header)
        writer.writerows(outrows)

In [4]:
# parse_third_cleaning('data/top_locs_third_cleaning_oct_8_2016.csv', 'data/final_cleaning_oct11_2016.csv')

A quick analysis of our cleaning:


In [9]:
# Map each final slug to all original-dataset row indices covered
# by its constituent original slugs
def get_new_slugs(infile, slug_map):
    new_slugs = {}
    with open(infile, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header
        for row in reader:
            _, _, _, _, slug, _, final_slug = row
            new_slugs.setdefault(final_slug, []).extend(slug_map[slug])
    return new_slugs

In [10]:
new_slugs = get_new_slugs('data/final_cleaning_oct11_2016.csv', slug_row_map)

In [11]:
len(new_slugs.keys())


Out[11]:
31409

In [12]:
# Sort the final slugs by the number of rows they cover, most common first
import operator
d = {k: len(v) for k, v in new_slugs.items()}
s = sorted(d.items(), key=operator.itemgetter(1), reverse=True)

In [13]:
len(s)


Out[13]:
31409

In [14]:
# Find final slugs that cover more than 1,000 rows
short = [x for x in s if x[1] > 1000]
len(short)


Out[14]:
420

In [15]:
# Row counts for the most common slugs (v > 999, i.e. at least 1,000 rows each)
short_d = {k: v for k, v in d.items() if v > 999}

In [16]:
sum(short_d.values())


Out[16]:
12289892

So, as of Oct. 19, our cleaning has reached the point where 420 place names cover 12,289,892 records.
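
A quick coverage check (a sketch; it assumes `short_d` and `locs` from the cells above are still in memory):

In [ ]:
# 12,289,892 of the 13,188,245 original rows -> ~93.2% coverage
sum(short_d.values()) / len(locs)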


In [17]:
# Write the canonical slug, city, and country onto every original
# row index covered by each final slug
def merge_slugs(df, slugs):
    for k, v in slugs.items():
        splt = k.split(",")
        city = splt[0]
        country = splt[1] if len(splt) > 1 else ""
        for i in v:
            df["slug"].iat[i] = k
            df["canonical_city"].iat[i] = city
            df["canonical_country"].iat[i] = country
    return df
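
Note: assignments like `df["slug"].iat[i] = k` use chained indexing, which newer versions of pandas may silently ignore. A minimal sketch of an equivalent bulk assignment, assuming `df` keeps its default RangeIndex so labels equal positions:

In [ ]:
# Sketch only: flatten the mapping once, then assign by label in bulk
rows, slug_vals = [], []
for k, v in new_slugs.items():
    rows.extend(v)
    slug_vals.extend([k] * len(v))
df.loc[rows, "slug"] = slug_vals
df.loc[rows, "canonical_city"] = [s.split(",")[0] for s in slug_vals]
df.loc[rows, "canonical_country"] = [s.split(",")[1] if "," in s else "" for s in slug_vals]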

In [18]:
df = pd.read_csv("data/cleaned_texts.tsv", sep="\t")

In [19]:
df["slug"] = ""
df["canonical_city"] = ""
df["canonical_country"] = ""

In [20]:
df = merge_slugs(df, new_slugs)

In [21]:
# Coerce publication years to int; unparseable values become 0
def to_int(x):
    try:
        return int(x)
    except (ValueError, TypeError):
        return 0

df['pub_year'] = df['pub_year'].apply(to_int)

In [22]:
len(df)


Out[22]:
13188245

In [23]:
df.head(10)


Out[23]:
control_number title uniform_title author publisher pub_location pub_year translation prev_language slug canonical_city canonical_country
0 1000686 chao NaN sauvajon, marc-gilbert escelicer madrid 1972 NaN fre madrid,spain madrid spain
1 10010674 el estado en los ochenta, un regimen policivo NaN rojas h., fernando centro de investigacion y educacion popular bogota 1978 NaN NaN bogota,colombia bogota colombia
2 10017901 animales literarios de chile NaN lafourcade, enrique ediciones de la fourcade santiago, chile 0 NaN NaN santiago,chile santiago chile
3 10043696 popular e popularizante NaN pavao junior, jose de almeida universidade dos acores ponta delgada 1981 NaN NaN ponta delgada ponta delgada
4 10046413 una propuesta justicialista NaN amadeo, eduardo fundacion para la democracia en argentina buenos aires 1982 NaN NaN buenos aires,argentina buenos aires argentina
5 1005636 participacion del trabajador en la gestion de ... NaN pina, rolando e ediciones depalma buenos aires 1968 NaN NaN buenos aires,argentina buenos aires argentina
6 10066466 corona diccionario ingles-espanol, espanol-ingles NaN NaN editorial everest leon 1967 NaN NaN leon leon
7 10067357 puerto rico enfermo NaN cartagena, rafael editorial cultural rio piedras, p.r 1983 NaN NaN rio piedras,puerto rico rio piedras puerto rico
8 10085177 contribuicao ao estudo do carcinoma primitivo ... NaN oliveira, candido de rodrigues rio de janeiro 1951 NaN NaN rio de janeiro,brazil rio de janeiro brazil
9 10085221 cuba NaN norton, francis l publisher not identified new york 1873 NaN NaN new york,usa new york usa

In [24]:
# Drop records dated after 2016 (the data was assembled in Oct. 2016)
df = df[df.pub_year < 2017]

In [25]:
len(df)


Out[25]:
13187290

In [26]:
# Drop records dated before the print era (this also drops the zeroed-out years)
df = df[df.pub_year > 1450]

In [27]:
len(df)


Out[27]:
10213098

In [28]:
country_df = df[df.canonical_country != '']

In [29]:
# Share of the roughly 13M original records that end up with a canonical country
len(country_df) / 13000000.0


Out[29]:
0.6663149230769231

In [30]:
# Title + author: a coarse key identifying a text
def make_slug(row):
    title = str(row['title'])
    author = str(row['author'])
    return title + ',' + author

country_df['text_slug'] = country_df.apply(make_slug, axis=1)


/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
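
The warning is harmless here, but it can be avoided by materializing the slice before assigning new columns to it, e.g.:

In [ ]:
# Sketch: an explicit copy avoids the SettingWithCopyWarning
country_df = df[df.canonical_country != ''].copy()
country_df['text_slug'] = country_df.apply(make_slug, axis=1)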

In [33]:
# Title, author, city, year, and publisher: a stricter de-duplication key
def make_full_slug(row):
    title = str(row['title'])
    author = str(row['author'])
    date = str(row['pub_year'])
    loc = str(row['canonical_city'])
    pub = str(row['publisher'])
    return '{},{},{},{},{}'.format(title, author, loc, date, pub)

country_df['full_text_slug'] = country_df.apply(make_full_slug, axis=1)


/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [35]:
country_df.to_csv("data/cleaned_texts_oct19_2016.tsv", sep="\t")

In [34]:
country_df.head(1)


Out[34]:
control_number title uniform_title author publisher pub_location pub_year translation prev_language slug canonical_city canonical_country text_slug full_text_slug
0 1000686 chao NaN sauvajon, marc-gilbert escelicer madrid 1972 NaN fre madrid,spain madrid spain chao,sauvajon, marc-gilbert chao,sauvajon, marc-gilbert,madrid,1972,escelicer

In [3]:
df = pd.read_csv("data/cleaned_texts_oct19_2016.tsv", sep="\t")

In [4]:
len(df)


Out[4]:
8662094

In [5]:
# Drop records whose full text slug repeats (same title, author, city, year, publisher)
df = df.drop_duplicates(subset='full_text_slug')

In [6]:
len(df)


Out[6]:
7240272

In [7]:
df.to_csv("data/cleaned_texts_oct19_2016.tsv", sep="\t")
