In [1]:
%matplotlib inline
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# These are the split location fields; each row corresponds by position to a row in the original dataset
locs = pd.read_csv("data/just_split_locs.csv")
len(locs)


Out[2]:
13188245

In [3]:
locs.head()


Out[3]:
   Id           City Division1 Division2
0   0         madrid       NaN       NaN
1   1         bogota       NaN       NaN
2   2       santiago     chile       NaN
3   3  ponta delgada       NaN       NaN
4   4   buenos aires       NaN       NaN

In [4]:
# Map each (City, Division1, Division2) combination to the row indices where it occurs
row_map = locs.groupby(["City", "Division1", "Division2"]).groups

In [5]:
row_map[('torello', np.nan, 'barcelona')]


Out[5]:
[5465924, 10780540]

In [6]:
# Convert the tuple keys of the row mapping to comma-separated slug strings
def map_to_slugs(row_map):
    new_map = {}
    for k, v in row_map.items():
        c, d1, d2 = k
        if pd.isnull(d1):
            d1 = ""
        if pd.isnull(d2):
            d2 = ""
        slug = '{},{},{}'.format(c, d1, d2)
        slug = slug.strip(',')  # drop trailing commas left by empty divisions
        new_map[slug] = v
    return new_map

In [7]:
slug_row_map = map_to_slugs(row_map)

In [8]:
list(slug_row_map.items())[0]


Out[8]:
('brasilia,df,2000', [8148542, 10253684])

In [9]:
# Sort the location-key combinations by the number of rows they cover, most common first
import operator
d = {k: len(v) for k, v in row_map.items()}
s = sorted(d.items(), key=operator.itemgetter(1), reverse=True)

In [10]:
s[0]


Out[10]:
(('madrid', nan, nan), 2114115)

In [11]:
# Find keys with more than one row
short = [x for x in s if x[1] > 1]

In [12]:
len(short)


Out[12]:
50179

In [13]:
(len(s) - len(short)) / 13000000.0
# ~0.6% of rows contain a location combination that occurs only once


Out[13]:
0.006198615384615385

DO NOT RUN THIS CODE: the cells below regenerate files that were subsequently cleaned by hand


In [64]:
# This function creates slugs by combining the 3 loc keys and writes the groups to file
def write_csv(outfile, groups):
    with open(outfile, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Id", "City", "Division1", "Division2", "Slug"])
        for i, tup in enumerate(groups):
            city, div1, div2 = tup[0]
            if pd.isnull(div1):
                div1 = ""
            if pd.isnull(div2):
                div2 = ""
            slug = "{},{},{}".format(city, div1, div2)
            writer.writerow([i, city, div1, div2, slug.strip(",")])

In [65]:
# write_csv("data/top_locs.csv", short)

In [67]:
# This file will be hand-cleaned in OpenRefine
top_locs_df = pd.read_csv("data/top_locs.csv")
top_locs_df.head()


Out[67]:
   Id          City Division1 Division2          Slug
0   0        madrid       NaN       NaN        madrid
1   1     barcelona       NaN       NaN     barcelona
2   2           NaN       NaN       NaN           NaN
3   3        mexico       NaN       NaN        mexico
4   4  buenos aires       NaN       NaN  buenos aires

In [12]:
# The first cleaning produces a file in which applicable "city" values
# have been rewritten as "city (country)"
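
For example (a hypothetical value, just to illustrate the format the parser below expects):

In [ ]:
# a "city (country)" entry splits into a city plus a new division value
city = 'santiago (chile)'
parts = city.split('(')
parts[0].strip(), parts[1].strip(')')  # -> ('santiago', 'chile')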

In [2]:
# Inspect the header of the hand-cleaned file
first_cleaning = pd.read_csv('data/top_locs_first_cleaning_oct_8_2016.csv')

In [3]:
first_cleaning.columns


Out[3]:
Index(['Id', 'City', 'Division1', 'Division2', 'Slug'], dtype='object')

In [13]:
# Parse each row, pushing "(country)" out of the city field and into
# the first empty division field (or appending it to the last one)
def parse_first_cleaning(infile, outfile):
    outrows = []
    with open(infile, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)
        for row in reader:
            id_, city, div1, div2, slug = row
            parts = city.split("(")
            if len(parts) == 1:
                city = parts[0]
            elif len(parts) == 2:
                # split off "(country)" before overwriting the city value
                city = parts[0].strip()
                new_div = parts[1].strip(')')
                if not div1:
                    div1 = new_div
                elif not div2:
                    div2 = new_div
                else:
                    div2 = div2 + new_div
            else:
                print(parts)
                raise Exception("this shouldn't happen")
            outrows.append([id_, city, div1, div2, slug])
    with open(outfile, 'w', newline='') as new_f:
        writer = csv.writer(new_f)
        writer.writerow(header)
        writer.writerows(outrows)

In [14]:
# parse_first_cleaning("data/top_locs_first_cleaning_oct_8_2016.csv", "data/top_locs_second_cleaning_oct_8_2016.csv")

In [4]:
# This creates new slugs from the cleaned loc fields;
# these slugs will then be cleaned and become the canonical placename source
def parse_second_cleaning(infile, outfile):
    outrows = []
    with open(infile, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)
        header.append('new_slug')
        for row in reader:
            id_, city, div1, div2, slug = row
            new_slug = '{},{},{}'.format(city, div1, div2)
            new_slug = new_slug.strip(',')
            outrows.append([id_, city, div1, div2, slug, new_slug])
    with open(outfile, 'w', newline='') as new_f:
        writer = csv.writer(new_f)
        writer.writerow(header)
        writer.writerows(outrows)

In [5]:
# parse_second_cleaning('data/top_locs_second_cleaning_oct_8_2016.csv', 'data/top_locs_third_cleaning_oct_8_2016.csv')

In [3]:
# Create a final slug using just the city and country names:
# split the slug and take its first and last items.
# The final slug will then be cleaned and split to produce
# the canonical city and country.
def parse_third_cleaning(infile, outfile):
    outrows = []
    with open(infile, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)
        header.append('final_slug')
        for row in reader:
            id_, city, div1, div2, slug, new_slug = row
            split_slug = slug.split(',')
            if len(split_slug) > 1:
                city = split_slug[0]
                country = split_slug[-1]
                final_slug = '{},{}'.format(city, country)
            else:
                final_slug = split_slug[0]
            outrows.append([id_, city, div1, div2, slug, new_slug, final_slug])
    with open(outfile, 'w', newline='') as new_f:
        writer = csv.writer(new_f)
        writer.writerow(header)
        writer.writerows(outrows)

In [4]:
# parse_third_cleaning('data/top_locs_third_cleaning_oct_8_2016.csv', 'data/final_cleaning_oct11_2016.csv')

A quick analysis of our cleaning:


In [9]:
# Map each final slug to all original-dataset row indices covered
# by its constituent original slugs
def get_new_slugs(infile, slug_map):
    new_slugs = {}
    with open(infile, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header
        for row in reader:
            _, _, _, _, slug, _, final_slug = row
            new_slugs.setdefault(final_slug, []).extend(slug_map[slug])
    return new_slugs

In [10]:
new_slugs = get_new_slugs('data/final_cleaning_oct11_2016.csv', slug_row_map)

In [11]:
len(new_slugs.keys())


Out[11]:
31409

In [12]:
# Sort the final slugs by the number of rows they cover, most common first
import operator
d = {k: len(v) for k, v in new_slugs.items()}
s = sorted(d.items(), key=operator.itemgetter(1), reverse=True)

In [13]:
len(s)


Out[13]:
31409

In [14]:
# Find final slugs that cover more than 1,000 rows
short = [x for x in s if x[1] > 1000]
len(short)


Out[14]:
420

In [15]:
# Row counts for the most common slugs (v > 999, i.e. at least 1,000 rows each)
short_d = {k: v for k, v in d.items() if v > 999}

In [16]:
sum(short_d.values())


Out[16]:
12289892

So, as of Oct. 19, our cleaning has reached the point where 420 place names cover 12,289,892 records.
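
A quick coverage check (a sketch; it assumes `short_d` and `locs` from the cells above are still in memory):

In [ ]:
# 12,289,892 of the 13,188,245 original rows -> ~93.2% coverage
sum(short_d.values()) / len(locs)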


In [17]:
# Write the canonical slug, city, and country onto every original
# row index covered by each final slug
def merge_slugs(df, slugs):
    for k, v in slugs.items():
        splt = k.split(",")
        city = splt[0]
        country = splt[1] if len(splt) > 1 else ""
        for i in v:
            df["slug"].iat[i] = k
            df["canonical_city"].iat[i] = city
            df["canonical_country"].iat[i] = country
    return df
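
Note: assignments like `df["slug"].iat[i] = k` use chained indexing, which newer versions of pandas may silently ignore. A minimal sketch of an equivalent bulk assignment, assuming `df` keeps its default RangeIndex so labels equal positions:

In [ ]:
# Sketch only: flatten the mapping once, then assign by label in bulk
rows, slug_vals = [], []
for k, v in new_slugs.items():
    rows.extend(v)
    slug_vals.extend([k] * len(v))
df.loc[rows, "slug"] = slug_vals
df.loc[rows, "canonical_city"] = [s.split(",")[0] for s in slug_vals]
df.loc[rows, "canonical_country"] = [s.split(",")[1] if "," in s else "" for s in slug_vals]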

In [18]:
df = pd.read_csv("data/cleaned_texts.tsv", sep="\t")

In [19]:
df["slug"] = ""
df["canonical_city"] = ""
df["canonical_country"] = ""

In [20]:
df = merge_slugs(df, new_slugs)

In [21]:
# Coerce publication years to int; unparseable values become 0
def to_int(x):
    try:
        return int(x)
    except (ValueError, TypeError):
        return 0

df['pub_year'] = df['pub_year'].apply(to_int)

In [22]:
len(df)


Out[22]:
13188245

In [23]:
df.head(10)


Out[23]:
control_number title uniform_title author publisher pub_location pub_year translation prev_language slug canonical_city canonical_country
0 1000686 chao NaN sauvajon, marc-gilbert escelicer madrid 1972 NaN fre madrid,spain madrid spain
1 10010674 el estado en los ochenta, un regimen policivo NaN rojas h., fernando centro de investigacion y educacion popular bogota 1978 NaN NaN bogota,colombia bogota colombia
2 10017901 animales literarios de chile NaN lafourcade, enrique ediciones de la fourcade santiago, chile 0 NaN NaN santiago,chile santiago chile
3 10043696 popular e popularizante NaN pavao junior, jose de almeida universidade dos acores ponta delgada 1981 NaN NaN ponta delgada ponta delgada
4 10046413 una propuesta justicialista NaN amadeo, eduardo fundacion para la democracia en argentina buenos aires 1982 NaN NaN buenos aires,argentina buenos aires argentina
5 1005636 participacion del trabajador en la gestion de ... NaN pina, rolando e ediciones depalma buenos aires 1968 NaN NaN buenos aires,argentina buenos aires argentina
6 10066466 corona diccionario ingles-espanol, espanol-ingles NaN NaN editorial everest leon 1967 NaN NaN leon leon
7 10067357 puerto rico enfermo NaN cartagena, rafael editorial cultural rio piedras, p.r 1983 NaN NaN rio piedras,puerto rico rio piedras puerto rico
8 10085177 contribuicao ao estudo do carcinoma primitivo ... NaN oliveira, candido de rodrigues rio de janeiro 1951 NaN NaN rio de janeiro,brazil rio de janeiro brazil
9 10085221 cuba NaN norton, francis l publisher not identified new york 1873 NaN NaN new york,usa new york usa

In [24]:
# Drop records dated after 2016 (the data was assembled in Oct. 2016)
df = df[df.pub_year < 2017]

In [25]:
len(df)


Out[25]:
13187290

In [26]:
# Drop records dated before the print era (this also drops the zeroed-out years)
df = df[df.pub_year > 1450]

In [27]:
len(df)


Out[27]:
10213098

In [28]:
country_df = df[df.canonical_country != '']

In [29]:
# Share of the roughly 13M original records that end up with a canonical country
len(country_df) / 13000000.0


Out[29]:
0.6663149230769231

In [30]:
# Title + author: a coarse key identifying a text
def make_slug(row):
    title = str(row['title'])
    author = str(row['author'])
    return title + ',' + author

country_df['text_slug'] = country_df.apply(make_slug, axis=1)


/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
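
The warning is harmless here, but it can be avoided by materializing the slice before assigning new columns to it, e.g.:

In [ ]:
# Sketch: an explicit copy avoids the SettingWithCopyWarning
country_df = df[df.canonical_country != ''].copy()
country_df['text_slug'] = country_df.apply(make_slug, axis=1)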

In [33]:
# Title, author, city, year, and publisher: a stricter de-duplication key
def make_full_slug(row):
    title = str(row['title'])
    author = str(row['author'])
    date = str(row['pub_year'])
    loc = str(row['canonical_city'])
    pub = str(row['publisher'])
    return '{},{},{},{},{}'.format(title, author, loc, date, pub)

country_df['full_text_slug'] = country_df.apply(make_full_slug, axis=1)


/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [35]:
country_df.to_csv("data/cleaned_texts_oct19_2016.tsv", sep="\t")

In [34]:
country_df.head(1)


Out[34]:
control_number title uniform_title author publisher pub_location pub_year translation prev_language slug canonical_city canonical_country text_slug full_text_slug
0 1000686 chao NaN sauvajon, marc-gilbert escelicer madrid 1972 NaN fre madrid,spain madrid spain chao,sauvajon, marc-gilbert chao,sauvajon, marc-gilbert,madrid,1972,escelicer

In [3]:
df = pd.read_csv("data/cleaned_texts_oct19_2016.tsv", sep="\t")

In [4]:
len(df)


Out[4]:
8662094

In [5]:
# Drop records whose full text slug repeats (same title, author, city, year, publisher)
df = df.drop_duplicates(subset='full_text_slug')

In [6]:
len(df)


Out[6]:
7240272

In [7]:
df.to_csv("data/cleaned_texts_oct19_2016.tsv", sep="\t")
