Extracting data from the FOA/SIPRI nuclear explosions report

Requirements: pdfplumber & pandas
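Both can be installed with pip (pip install pdfplumber pandas); the re module used below is part of the Python standard library.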


In [1]:
import pdfplumber
import pandas as pd
import re

Parse the PDF


In [2]:
path = "../documents/sipri-report-original.pdf"
pdf = pdfplumber.open(path, pages=range(23, 41))
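
As an optional sanity check: pdfplumber page numbers are 1-indexed, so range(23, 41) requests report pages 23 through 40 and the loaded PDF should expose exactly 18 pages.

len(pdf.pages)  # expected: 18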

In [3]:
columns = [
    "date", "origin_t", "id_no", "country", "region", "source",
    "latitude", "longitude", "mb", "Ms", "depth", "yield_1", "yield_u",
    "purpose", "name", "type"
]
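
(Here yield_1 and yield_u are the report's lower and upper yield estimates in kilotons, and mb and Ms are the body- and surface-wave magnitudes.)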

In [4]:
def fix_line(line):
    # Re-insert the cell delimiter that sometimes goes missing before a
    # purpose code, e.g. "... 20.0 WR|" -> "... 20.0|WR|"
    fix_1 = re.sub(r" (WR|WE|PNE|WE/SAM|WR/F/S|FMS|WR/SE|WR/SAM|"
                   r"WR/PNE|WR/FMS|WE/WR|SE|SAM|PNE:PLO|PNErPLO|PNE:V)\|", r"|\1|", line)
    fix_1_sp = fix_1.split("|")
    # If the purpose code landed in the yield_u slot (index 12), the blank
    # yield_u cell was dropped; put an empty field back in its place
    if fix_1_sp[12] in ("WR",):
        fix_2 = "|".join(fix_1_sp[:12] + [""] + fix_1_sp[12:])
    else:
        fix_2 = fix_1
    return fix_2
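
To see what fix_line does, here is a made-up row (not taken from the report) in which the purpose code got glued onto yield_1 and the blank yield_u cell disappeared entirely:

raw = "650101|120000.0|65001|USA|NTS|DOE|37.000|-116.000|5.0|0.0|0.0|20.0 WR|EXAMPLE|SHAFT"
fix_line(raw).split("|")
# -> 16 fields: "WR" is back in the purpose slot and an empty string stands in for yield_u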

In [5]:
def extract_table_from_page(page):
    # Drop stray apostrophes and lone spaces left over from the scan
    filtered = page.filter(lambda x: x.get("text", None) not in ("'", " "))

    # Chop off header if present
    page_words = filtered.extract_words(x_tolerance=1, y_tolerance=3)
    if page_words[0]["text"] == "Date":
        top = page_words[0]["bottom"]
    else:
        top = 0
        
    # Crop away the header (everything above `top`) and the region below y=770
    cropped = filtered.crop((0, top, page.width, 770), strict=True)
    
    # Extract table
    table = cropped.extract_table(
        v="gutters", h="gutters", 
        x_tolerance=1, y_tolerance=3,
        gutter_min_height=4, gutter_min_width=4)
    
    # Join each row's cells with "|", repair it with fix_line, split it back,
    # and convert empty cells to None
    fixed_table = [
        [z or None for z in fix_line("|".join(x or "" for x in line)).split("|")]
        for line in table
    ]

    df = pd.DataFrame(fixed_table)

    # Some pages come out with an extra 17th column; keep only the first 16
    if len(df.columns) == 17:
        df = df[df.columns[:16]]

    df.columns = columns
    return df

In [6]:
tables = [ extract_table_from_page(p) for p in pdf.pages ]
data = pd.concat(tables).reset_index(drop=True)
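
A quick look at the combined table (optional): it should have 16 columns and one row per explosion listed in the report.

data.shape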

Fix obvious OCR errors in the data


In [7]:
fixed = data.copy()

In [8]:
date_fixes = {
    "62101S": "621018",
    "8S0623": "880623"
}

In [9]:
fixed["date"] = data["date"].apply(date_fixes.get).fillna(data["date"]).astype(int)

In [10]:
fixed["date_long"] = (fixed["date"] + 19000000)
fixed["year"] = fixed["date_long"] // 10000

In [11]:
country_fixes = {
    "USER": "USSR",
    "USSP.": "USSR",
    "OSSR": "USSR",
    "USSE": "USSR",
    "FEANCE": "FRANCE",
    "CHIHA": "CHINA",
}

In [12]:
fixed["country"] = data["country"].apply(country_fixes.get).fillna(data["country"])

In [13]:
fixed["country"].value_counts()


Out[13]:
USA       1032
USSR       714
FRANCE     210
UK          45
CHINA       45
INDIA        3
PAKIST       2
Name: country, dtype: int64
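
The remaining oddity, PAKIST, looks like a truncation from the report's fixed-width country column rather than an OCR error, so it is left as-is.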

In [14]:
id_no_fixes = {
    "63 011": "63011"
}

fixed["id_no"] = data["id_no"].apply(id_no_fixes.get).fillna(data["id_no"]).astype(int)

In [15]:
origin_t_fixes = {
    "2S906.3": "25906.3",
    "75S57.8": "75957.8",
    "O.O": "0.0",
    "190000. 0": "190000.0"
}

fixed["origin_t"] = data["origin_t"].apply(origin_t_fixes.get).fillna(data["origin_t"]).astype(float)

In [16]:
latitude_fixes = {
    "3 7.100": "37.100"
}

fixed["latitude"] = data["latitude"].apply(latitude_fixes.get).fillna(data["latitude"]).astype(float)

In [17]:
longitude_fixes = {
    ": 0.000": "0.000",
    "-13 9.000": "-139.000",
    ". 47.000": "47.000",
    "11.ISO": "77.790",
    ": 78.910": "78.910",
    "5.03 0": "5.030",
    ":  0.000": "0.000"
}

fixed["longitude"] = data["longitude"].apply(longitude_fixes.get).fillna(data["longitude"]).astype(float)

In [18]:
mb_fixes = {
    "S.O": "5.0",
    "O.O": "0.0"
}

fixed["mb"] = data["mb"].apply(mb_fixes.get).fillna(data["mb"]).astype(float)

In [19]:
Ms_fixes = {
    "S.O": "5.0",
    "O.O": "0.0",
    ",0.0": "0.0"
}

fixed["Ms"] = data["Ms"].apply(Ms_fixes.get).fillna(data["Ms"]).astype(float)

In [20]:
depth_fixes = {
    "O.000": "0.000",
    "o.ooo": "0.000",
    "-6.80O": "-6.800",
    "-o.ooi": "-0.001",
    "-O.001": "-0.001",
    "-o.ioo": "-0.001",
    "-0.25O": "-0.250"
}

fixed["depth"] = data["depth"].apply(depth_fixes.get).fillna(data["depth"]).astype(float)

In [21]:
yield_1_fixes = {
    "O.O": "0.0",
    "O .O": "0.0",
    "0:0": "0.0",
    "Q,0": "0.0",
    "6:0": "0.0",
    "(JiO": "0.0",
    "20;0": "20.0",
    "5.3,": "5.3",
    "190. 0": "190.0",
    "286:o": "280.0",
    "120..0": "120.0",
    "51\\ 0": "51.0",
    "900SP": "900.0",
    "27..0": "27.0",
}

fixed["yield_1"] = data["yield_1"].apply(yield_1_fixes.get).fillna(data["yield_1"]).astype(float)

In [22]:
yield_u_fixes = {
    "S.O": "5.0",
    ": 20.0": "20.0",
    ". 20.0": "20.0",
    "20.O": "20.0",
    "20-0": "20.0"
}

fixed["yield_u"] = data["yield_u"].apply(yield_u_fixes.get).fillna(data["yield_u"]).astype(float)

In [23]:
purpose_fixes = {
    "Wt": "WE",
    "WP.": "WR",
    "KR": "WR",
    "HP.": "WR",
    "PHE": "PNE",
    "PNE: PLO": "PNE:PLO",
    "PNErPLO": "PNE:PLO",
    "PNE-.PLO": "PNE:PLO",
}

fixed["purpose"] = data["purpose"].apply(purpose_fixes.get).fillna(data["purpose"])
fixed["purpose"].value_counts()


Out[23]:
WR         1495
WE          181
PNE         153
SE           71
FMS          33
PNE:PLO      27
SAM          25
WR/SE        11
PNE:V         7
WR/FMS        6
WR/SAM        6
SB            5
TRANSP        4
WE/SAM        3
WE/WR         3
WR/PNE        3
WR/WE         2
COMBAT        2
PNE/WR        2
FMS/WR        2
SE/WR         2
WR/F/SA       2
WR/P/SA       1
WR/WE/S       1
WR/P/S        1
WR/F/S        1
ME            1
Name: purpose, dtype: int64

In [24]:
type_fixes = {
    "SHAFT/GE": "SHAFT/GR",
    "BAEGE": "BARGE",
}

fixed["type"] = data["type"].apply(type_fixes.get).fillna(data["type"])
fixed["type"].value_counts()


Out[24]:
SHAFT       1015
TUNNEL       310
ATMOSPH      185
SHAFT/GR      85
AIRDROP       78
TOWER         75
BALLOON       62
SURFACE       62
SHAFT/LG      56
BARGE         40
UG            32
ROCKET        13
GALLERY       13
CRATER         9
UW             8
SPACE          4
WATER SU       1
SHIP           1
WATERSUR       1
MINE           1
Name: type, dtype: int64

Write the CSV


In [25]:
fixed.to_csv("../data/sipri-report-explosions.csv", index=False)
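
As a final optional check, the CSV can be read back to confirm it round-trips:

pd.read_csv("../data/sipri-report-explosions.csv").shape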