In [1]:
import pdfplumber
import pandas as pd
import re
In [2]:
path = "../documents/sipri-report-original.pdf"
pdf = pdfplumber.open(path, pages=range(23, 41))
In [3]:
columns = [
    "date", "origin_t", "id_no", "country", "region", "source",
    "latitude", "longitude", "mb", "Ms", "depth", "yield_1", "yield_u",
    "purpose", "name", "type"
]
In [4]:
def fix_line(line):
    # Some OCR'd rows glue the purpose code onto the previous field with a
    # space instead of a column separator; re-insert the missing "|" before it.
    fix_1 = re.sub(r" (WR|WE|PNE|WE/SAM|WR/F/S|FMS|WR/SE|WR/SAM|"
                   r"WR/PNE|WR/FMS|WE/WR|SE|SAM|PNE:PLO|PNErPLO|PNE:V)\|", r"|\1|", line)
    fix_1_sp = fix_1.split("|")
    # If the purpose code sits in the yield_u slot (index 12), the row is
    # missing its yield_u value; insert an empty field so the columns line up.
    if fix_1_sp[12] in ("WR",):
        fix_2 = "|".join(fix_1_sp[:12] + [""] + fix_1_sp[12:])
    else:
        fix_2 = fix_1
    return fix_2
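A quick way to see what the second repair does. The row below is made up for illustration; none of its values come from the report. Its yield_u column is missing, so the purpose code "WR" sits at index 12 after splitting, and fix_line re-inserts an empty field so the code lands back in the purpose column (index 13).

# Hypothetical, invented row (not from the report): 15 fields, no yield_u value.
sample = "650608|40000.0|65037|USSR|KAZAKH|HFS|49.000|78.000|0.0|0.0|0.000|20.0|WR||SHAFT"
fix_line(sample).split("|")   # 16 fields, with an empty yield_u re-inserted at index 12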
In [5]:
def extract_table_from_page(page):
    # Remove stray apostrophes (from scan)
    filtered = page.filter(lambda x: x.get("text", None) not in ("'", " "))
    # Chop off header if present
    page_words = filtered.extract_words(x_tolerance=1, y_tolerance=3)
    if page_words[0]["text"] == "Date":
        top = page_words[0]["bottom"]
    else:
        top = 0
    cropped = filtered.crop((0, top, page.width, 770), strict=True)
    # Extract the table using gutter-based row/column detection
    table = cropped.extract_table(
        v="gutters", h="gutters",
        x_tolerance=1, y_tolerance=3,
        gutter_min_height=4, gutter_min_width=4)
    # Re-join each row, repair the shifted purpose column, split back out,
    # and map empty strings to None.
    fixed_table = [
        [z or None for z in fix_line("|".join(x or "" for x in line)).split("|")]
        for line in table
    ]
    df = pd.DataFrame(fixed_table)
    # Drop any spurious 17th column; the report has 16 columns per row.
    if len(df.columns) == 17:
        df = df[df.columns[:16]]
    df.columns = columns
    return df
In [6]:
tables = [ extract_table_from_page(p) for p in pdf.pages ]
data = pd.concat(tables).reset_index(drop=True)
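An optional sanity check at this point, not part of the original notebook: look at how many rows each page contributed and the total row count of the concatenated frame.

# Optional check (not in the original): rows per page, then rows overall.
[len(t) for t in tables], len(data)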
In [7]:
fixed = data.copy()
In [8]:
date_fixes = {
    "62101S": "621018",
    "8S0623": "880623"
}
In [9]:
fixed["date"] = data["date"].apply(date_fixes.get).fillna(data["date"]).astype(int)
In [10]:
fixed["date_long"] = (fixed["date"] + 19000000)
fixed["year"] = fixed["date_long"] // 10000
In [11]:
country_fixes = {
    "USER": "USSR",
    "USSP.": "USSR",
    "OSSR": "USSR",
    "USSE": "USSR",
    "FEANCE": "FRANCE",
    "CHIHA": "CHINA",
}
In [12]:
fixed["country"] = data["country"].apply(country_fixes.get).fillna(data["country"])
In [13]:
fixed["country"].value_counts()
In [14]:
id_no_fixes = {
    "63 011": "63011"
}
fixed["id_no"] = data["id_no"].apply(id_no_fixes.get).fillna(data["id_no"]).astype(int)
In [15]:
origin_t_fixes = {
    "2S906.3": "25906.3",
    "75S57.8": "75957.8",
    "O.O": "0.0",
    "190000. 0": "190000.0"
}
fixed["origin_t"] = data["origin_t"].apply(origin_t_fixes.get).fillna(data["origin_t"]).astype(float)
In [16]:
latitude_fixes = {
    "3 7.100": "37.100"
}
fixed["latitude"] = data["latitude"].apply(latitude_fixes.get).fillna(data["latitude"]).astype(float)
In [17]:
longitude_fixes = {
    ": 0.000": "0.000",
    "-13 9.000": "-139.000",
    ". 47.000": "47.000",
    "11.ISO": "77.790",
    ": 78.910": "78.910",
    "5.03 0": "5.030"
}
fixed["longitude"] = data["longitude"].apply(longitude_fixes.get).fillna(data["longitude"]).astype(float)
In [18]:
mb_fixes = {
    "S.O": "5.0",
    "O.O": "0.0"
}
fixed["mb"] = data["mb"].apply(mb_fixes.get).fillna(data["mb"]).astype(float)
In [19]:
Ms_fixes = {
    "S.O": "5.0",
    "O.O": "0.0",
    ",0.0": "0.0"
}
fixed["Ms"] = data["Ms"].apply(Ms_fixes.get).fillna(data["Ms"]).astype(float)
In [20]:
depth_fixes = {
    "O.000": "0.000",
    "o.ooo": "0.000",
    "-6.80O": "-6.800",
    "-o.ooi": "-0.001",
    "-O.001": "-0.001",
    "-o.ioo": "-0.001",
    "-0.25O": "-0.250"
}
fixed["depth"] = data["depth"].apply(depth_fixes.get).fillna(data["depth"]).astype(float)
In [21]:
yield_1_fixes = {
    "O.O": "0.0",
    "O .O": "0.0",
    "0:0": "0.0",
    "Q,0": "0.0",
    "6:0": "0.0",
    "(JiO": "0.0",
    "20;0": "20.0",
    "5.3,": "5.3",
    "190. 0": "190.0",
    "286:o": "280.0",
    "120..0": "120.0",
    "51\\ 0": "51.0",
    "900SP": "900.0",
    "27..0": "27.0",
}
fixed["yield_1"] = data["yield_1"].apply(yield_1_fixes.get).fillna(data["yield_1"]).astype(float)
In [22]:
yield_u_fixes = {
    "S.O": "5.0",
    ": 20.0": "20.0",
    ". 20.0": "20.0",
    "20.O": "20.0",
    "20-0": "20.0"
}
fixed["yield_u"] = data["yield_u"].apply(yield_u_fixes.get).fillna(data["yield_u"]).astype(float)
In [23]:
purpose_fixes = {
    "Wt": "WE",
    "WP.": "WR",
    "KR": "WR",
    "HP.": "WR",
    "PHE": "PNE",
    "PNE: PLO": "PNE:PLO",
    "PNErPLO": "PNE:PLO",
    "PNE-.PLO": "PNE:PLO",
}
fixed["purpose"] = data["purpose"].apply(purpose_fixes.get).fillna(data["purpose"])
fixed["purpose"].value_counts()
In [24]:
type_fixes = {
    "SHAFT/GE": "SHAFT/GR",
    "BAEGE": "BARGE",
}
fixed["type"] = data["type"].apply(type_fixes.get).fillna(data["type"])
fixed["type"].value_counts()
In [25]:
fixed.to_csv("../data/sipri-report-explosions.csv", index=False)
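A final round-trip check, not in the original notebook, to confirm the CSV was written as expected:

# Optional: re-read the exported CSV and compare its shape against the frame.
pd.read_csv("../data/sipri-report-explosions.csv").shape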