This notebook requires tabula-py and Java to be installed.
It uses tabula to extract the RV precision from the appendix table of the previous paper docs/Figueira_etal_2016.pdf.
In [ ]:
import os
import sys

import pandas as pd
import tabula
from tabula import read_pdf
# Tabula needs Java 6 or 7 (author's note; the system Java here is version 9).
# The `java` executable is looked up through the PATH environment variable,
# not through sys.path (which only controls where Python finds modules), so
# the older JRE is placed first on PATH.
# This is a hack and may not work everywhere: point JAVA_BIN at your own
# Java location, or export PATH before launching Jupyter instead.
JAVA_BIN = "/opt/java/jre1.7.0_79/bin"
# Drop empty entries and any earlier copy of JAVA_BIN so that re-running the
# cell leaves PATH unchanged.
other_entries = [
    entry
    for entry in os.environ.get("PATH", "").split(os.pathsep)
    if entry and entry != JAVA_BIN
]
os.environ["PATH"] = os.pathsep.join([JAVA_BIN] + other_entries)
In [ ]:
# Pages of the PDF to read, and the path to the paper itself
pages = list(range(15, 18))  # 15, 16 and 17
paper = "../Figueira_etal_2016.pdf"
In [ ]:
# Read in the table from the pdf as one table covering all requested pages.
# multiple_tables=False is passed explicitly because its default differs
# between tabula-py releases; newer releases also return a list of
# DataFrames, so unwrap it to keep `df` a single DataFrame.
tables = read_pdf(paper, pages=pages, guess=True, multiple_tables=False)
df = tables[0] if isinstance(tables, list) else tables
In [ ]:
# Preview the raw extraction.
# There is an extra line of headings which needs to be removed, and a couple
# more further down in the data, from the top of each table, because the
# table spans 3 pages.
df.head(n=5)
In [ ]:
# Remove the mistakenly added title rows.
# Easily done because they do not start with "M".
# na=False counts a missing Simulation label as "no match" so the mask stays
# purely boolean; .copy() makes the result an independent frame so that later
# column assignments do not raise SettingWithCopyWarning.
is_data_row = df["Simulation"].str.startswith("M", na=False)
df = df[is_data_row].copy()
df.head()
In [ ]:
# Format the column names: spaces become underscores, then the sigma symbol,
# parentheses and full stops are removed.
# regex=False forces literal matching: "(", ")" and "." are regex
# metacharacters, and the default of `regex` differs between pandas versions.
print(df.columns)
df.columns = df.columns.str.replace(" ", "_", regex=False)
for unwanted in ("σ", "(", ")", "."):
    df.columns = df.columns.str.replace(unwanted, "", regex=False)
df.columns
In [ ]:
# Turn the RV precision values into floats, showing dtypes before and after
print("Before:\n", df.dtypes)
for rv_column in ("RV_Cond_1", "RV_Cond_2", "RV_Cond_3"):
    df[rv_column] = df[rv_column].astype(float)
print("\nAfter:\n", df.dtypes)
In [ ]:
# Header used when saving: the first column name is kept as is, and the
# next three get a "[m/s]" unit suffix.
hdr = df.columns
new_header = [hdr[0]] + [hdr[i] + "[m/s]" for i in (1, 2, 3)]
new_header
In [ ]:
# Write the results to a tab-separated file, using the header with units
f = "../../data/precision_figueira_2016.dat"
df.to_csv(
    f,
    sep="\t",
    header=new_header,
    index=False,
    float_format="%6.2f",
    mode="w",
)
In [ ]:
# Read the saved file back in and preview it as a check
newdf = pd.read_csv(f, delimiter="\t")
newdf.head(n=5)
This has successfully imported the precision values from the Figueira et al. (2016) appendix.