In [2]:
%matplotlib inline
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
# These are the location fields, one row per record, aligned with the rows of the original dataset
locs = pd.read_csv("data/just_split_locs.csv")
len(locs)
Out[2]:
In [3]:
locs.head()
Out[3]:
In [4]:
# Create a mapping for rows based on the three location divisions
row_map = locs.groupby(["City", "Division1", "Division2"]).groups
In [5]:
row_map[('torello', np.nan, 'barcelona')]
Out[5]:
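Compatibility note: on pandas 1.1+, groupby drops NaN group keys by default, so a lookup like the one above would fail unless you pass dropna=False:
In [ ]:
# On pandas >= 1.1, keep groups whose Division1/Division2 is NaN
# (like the torello example above)
row_map = locs.groupby(["City", "Division1", "Division2"], dropna=False).groups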
In [6]:
# Convert the row-mapping keys to slugs
def map_to_slugs(row_map):
    new_map = {}
    for k, v in row_map.items():
        c, d1, d2 = k
        if pd.isnull(d1):
            d1 = ""
        if pd.isnull(d2):
            d2 = ""
        slug = '{},{},{}'.format(c, d1, d2)
        slug = slug.strip(',')
        new_map[slug] = v
    return new_map
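A quick sanity check of the slug format on a couple of hypothetical keys:
In [ ]:
# Hypothetical keys: missing divisions become empty strings, and strip(',')
# only trims leading/trailing commas, so internal ones survive
toy = {('torello', np.nan, 'barcelona'): [0, 5],
       ('paris', 'ile-de-france', np.nan): [2]}
map_to_slugs(toy)
# -> {'torello,,barcelona': [0, 5], 'paris,ile-de-france': [2]}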
In [7]:
slug_row_map = map_to_slugs(row_map)
In [8]:
list(slug_row_map.items())[0]
Out[8]:
In [9]:
# Rank the (City, Division1, Division2) keys by how many rows they cover
import operator
d = {k: len(v) for k, v in row_map.items()}
s = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
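Equivalently, collections.Counter gives the same ranking:
In [ ]:
from collections import Counter
# most_common() sorts (key, count) pairs descending by count, the same
# ordering as the operator.itemgetter sort above
counts = Counter({k: len(v) for k, v in row_map.items()})
counts.most_common(3)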
In [10]:
s[0]
Out[10]:
In [11]:
# Find keys with more than one row
short = [x for x in s if x[1] > 1]
In [12]:
len(short)
Out[12]:
In [13]:
(len(s) - len(short)) / 13000000.0
# ~0.6% of rows have a place name that occurs only once
Out[13]:
In [64]:
# This function creates slugs by combining the 3 loc keys and writes the groups to file
def write_csv(outfile, groups):
    with open(outfile, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Id", "City", "Division1", "Division2", "Slug"])
        for i, tup in enumerate(groups):
            loc = tup[0]
            city, div1, div2 = loc
            if pd.isnull(div1):
                div1 = ""
            if pd.isnull(div2):
                div2 = ""
            slug = "{},{},{}".format(city, div1, div2)
            writer.writerow([i, city, div1, div2, slug.strip(",")])
In [65]:
# write_csv("data/top_locs.csv", short)
In [67]:
# This will be cleaned in refine
top_locs_df = pd.read_csv("data/top_locs.csv")
top_locs_df.head()
Out[67]:
In [12]:
# The first cleaning pass produces a file with "city" keys split as
# "city (country)" for the entries where that applies
In [2]:
# Get header
first_cleaning = pd.read_csv('data/top_locs_first_cleaning_oct_8_2016.csv')
In [3]:
first_cleaning.columns
Out[3]:
In [13]:
# Parse to push "(country)" into the last division field
def parse_first_cleaning(infile, outfile):
    outrows = []
    header = []
    with open(infile, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)
        for row in reader:
            id, city, div1, div2, slug = row
            parts = city.split("(")
            if len(parts) == 1:
                city = parts[0]
            elif len(parts) == 2:
                city = parts[0].strip()
                new_div = parts[1].strip(')')
                # keep the country in the last division slot so it stays
                # the final element when the slug is split downstream
                if not div2:
                    div2 = new_div
                else:
                    div2 = div2 + ' ' + new_div
            else:
                print(city)
                raise Exception("this shouldn't happen")
            outrows.append([id, city, div1, div2, slug])
    with open(outfile, 'w', newline='') as new_f:
        writer = csv.writer(new_f)
        writer.writerow(header)
        writer.writerows(outrows)
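A sanity check of the split logic on a hypothetical "city (country)" value:
In [ ]:
parts = "toronto (canada)".split("(")
parts[0].strip(), parts[1].strip(")")
# -> ('toronto', 'canada')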
In [14]:
# parse_first_cleaning("data/top_locs_first_cleaning_oct_8_2016.csv", "data/top_locs_second_cleaning_oct_8_2016.csv")
In [4]:
# This creates new slugs from the cleaned loc fields.
# These slugs will then be cleaned and become the canonical placename source
def parse_second_cleaning(infile, outfile):
    outrows = []
    header = []
    with open(infile, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)
        header.append('new_slug')
        for row in reader:
            id, city, div1, div2, slug = row
            new_slug = '{},{},{}'.format(city, div1, div2)
            new_slug = new_slug.strip(',')
            outrows.append([id, city, div1, div2, slug, new_slug])
    with open(outfile, 'w', newline='') as new_f:
        writer = csv.writer(new_f)
        writer.writerow(header)
        writer.writerows(outrows)
In [5]:
# parse_second_cleaning('data/top_locs_second_cleaning_oct_8_2016.csv', 'data/top_locs_third_cleaning_oct_8_2016.csv')
In [3]:
# Create a final slug using just the city and country name:
# split the new slug and take the first and last items
# to make a "final slug"; this will then be cleaned
# and split to create the canonical city and country
def parse_third_cleaning(infile, outfile):
    outrows = []
    header = []
    with open(infile, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)
        header.append('final_slug')
        for row in reader:
            id, city, div1, div2, slug, new_slug = row
            split_slug = new_slug.split(',')
            if len(split_slug) > 1:
                final_slug = '{},{}'.format(split_slug[0], split_slug[-1])
            else:
                final_slug = split_slug[0]
            outrows.append([id, city, div1, div2, slug, new_slug, final_slug])
    with open(outfile, 'w', newline='') as new_f:
        writer = csv.writer(new_f)
        writer.writerow(header)
        writer.writerows(outrows)
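And the first/last split on a hypothetical new slug:
In [ ]:
new_slug = 'paris,ile-de-france,france'
split_slug = new_slug.split(',')
'{},{}'.format(split_slug[0], split_slug[-1])
# -> 'paris,france'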
In [4]:
# parse_third_cleaning('data/top_locs_third_cleaning_oct_8_2016.csv', 'data/final_cleaning_oct11_2016.csv')
A quick analysis of our cleaning:
In [9]:
def get_new_slugs(infile, slug_map):
    new_slugs = {}
    with open(infile, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)
        for row in reader:
            _, _, _, _, slug, _, final_slug = row
            new_slugs.setdefault(final_slug, [])
            new_slugs[final_slug] += slug_map[slug]
    return new_slugs
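The same accumulation can be written with collections.defaultdict; a minimal sketch on hypothetical rows, showing two old slugs collapsing into one final slug:
In [ ]:
from collections import defaultdict
# demo_map stands in for slug_row_map; both old slugs map to 'paris,france'
demo_map = {'paris,,france': [0, 3], 'paris (france)': [7]}
pairs = [('paris,,france', 'paris,france'), ('paris (france)', 'paris,france')]
merged = defaultdict(list)
for old_slug, final_slug in pairs:
    merged[final_slug].extend(demo_map[old_slug])
dict(merged)
# -> {'paris,france': [0, 3, 7]}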
In [10]:
new_slugs = get_new_slugs('data/final_cleaning_oct11_2016.csv', slug_row_map)
In [11]:
len(new_slugs.keys())
Out[11]:
In [12]:
# Rank the merged slugs by how many rows they cover
import operator
d = {k: len(v) for k, v in new_slugs.items()}
s = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
In [13]:
len(s)
Out[13]:
In [14]:
# Find keys covering more than 1,000 rows
short = [x for x in s if x[1] > 1000]
len(short)
Out[14]:
In [15]:
short_d = {k: v for k, v in d.items() if v > 999}
In [16]:
sum(short_d.values())
Out[16]:
So, as of Oct. 19 our cleaning has reached the level where 420 place names cover 12,289,892 records.
In [17]:
def merge_slugs(df, slugs):
    for k, v in slugs.items():
        splt = k.split(",")
        city = splt[0]
        country = splt[1] if len(splt) > 1 else ""
        for i in v:
            # df.at[...] writes the scalar directly; the chained
            # df["slug"].iat[i] = ... pattern can silently fail to write back.
            # Assumes df keeps its default RangeIndex, so labels == positions.
            df.at[i, "slug"] = k
            df.at[i, "canonical_city"] = city
            df.at[i, "canonical_country"] = country
    return df
In [18]:
df = pd.read_csv("data/cleaned_texts.tsv", sep="\t")
In [19]:
df["slug"] = ""
df["canonical_city"] = ""
df["canonical_country"] = ""
In [20]:
df = merge_slugs(df, new_slugs)
In [21]:
def to_int(x):
    try:
        return int(x)
    except (ValueError, TypeError):
        return 0

df['pub_year'] = df['pub_year'].apply(to_int)
In [22]:
len(df)
Out[22]:
In [23]:
df.head(10)
Out[23]:
In [24]:
df = df[df.pub_year < 2017]
In [25]:
len(df)
Out[25]:
In [26]:
df = df[df.pub_year > 1450]
In [27]:
len(df)
Out[27]:
In [28]:
# .copy() so the text_slug/full_text_slug columns added below don't
# trigger a SettingWithCopyWarning on a slice
country_df = df[df.canonical_country != ''].copy()
In [29]:
len(country_df) / 13000000.0
Out[29]:
In [30]:
def make_slug(row):
    title = str(row['title'])
    author = str(row['author'])
    slug = title + ',' + author
    return slug

country_df['text_slug'] = country_df.apply(make_slug, axis=1)
In [33]:
def make_full_slug(row):
    title = str(row['title'])
    author = str(row['author'])
    date = str(row['pub_year'])
    loc = str(row['canonical_city'])
    pub = str(row['publisher'])
    slug = '{},{},{},{},{}'.format(title, author, loc, date, pub)
    return slug

country_df['full_text_slug'] = country_df.apply(make_full_slug, axis=1)
In [35]:
country_df.to_csv("data/cleaned_texts_oct19_2016.tsv", sep="\t")
In [34]:
country_df.head(1)
Out[34]:
In [3]:
df = pd.read_csv("data/cleaned_texts_oct19_2016.tsv", sep="\t")
In [4]:
len(df)
Out[4]:
In [5]:
df = df.drop_duplicates(subset='full_text_slug')
In [6]:
len(df)
Out[6]:
In [7]:
df.to_csv("data/cleaned_texts_oct19_2016.tsv", sep="\t")