In [10]:
import pandas as pd # Beautiful tool for data wrangling! e.g. '!pip install pandas' from a Notebook
# See https://mariadb.com/blog/how-connect-python-programs-mariadb e.g. '!pip install mysql' from Notebook
import MySQLdb
import re
from collections import Counter
import os
from collections import OrderedDict
import random
import pickle
# Cap DataFrame display at 35 rows; the frames used in this notebook are large.
pd.options.display.max_rows = 35
In [11]:
# Open the database connection through MySQLdb; `conn` is reused by later cells.
conn = MySQLdb.connect(
    db='monuments_db',
    user='mos',
    passwd='',
    use_unicode=True,
    charset='utf8',
)
# A cursor for ad-hoc statements issued directly from the notebook.
cursor = conn.cursor()
# Tell the server which character set this session uses.
cursor.execute("SET NAMES utf8")
In [15]:
# NOTE: unpickling can execute arbitrary code; only load a pickle file you trust.
# The `with` block closes the file handle as soon as the data has been read.
with open("./final_tables.pickle", "rb") as pickle_file:
    final_tables = pickle.load(pickle_file)  # monuments_all and admin etc removed
final_tables
Out[15]:
In [22]:
# Columns assumed to be standardized already; no example table is produced for them.
_STANDARDIZED_FIELDS = ("country", "lang", "project", "changed",
                        "lat", "lon", "lat_int", "lon_int")
# Number of random example values drawn from every remaining column.
_SAMPLE_SIZE = 10


def _load_final_tables(path="./final_tables.pickle"):
    """Load the pickled collection of table names that should be processed."""
    # NOTE: unpickling can execute arbitrary code; only load a file you trust.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _column_wikitable(table, column, examples):
    """Return [heading, wikitable] wikitext for the sampled values of one column."""
    heading = "== " + str(_SAMPLE_SIZE) + " random samples from field " + column + " ==\n"
    # The style attribute is closed here; the unterminated quote was malformed markup.
    table_header = '{| class="wikitable" style="width: 675px;"\n'
    table_name = '|+ ' + table + "-" + column + " " + str(_SAMPLE_SIZE) + " random samples\n"
    table_columns = ("! scope='col' style='width: 225px;' |" + str(column) + "\n"
                     "! scope='col' style='width: 225px; height: 20px;'|" + "Conversion \n"
                     "! scope='col' style='width: 225px;' | Comment\n|-\n")
    # One row per sampled value; the two trailing cells are left empty for manual notes.
    table_rows = "".join(
        "| style='height: 20px;'| " + str(example) + "\n|\n|\n|-\n"
        for example in examples
    )
    # Drop the last newline so the closing "|}" follows the final row separator directly.
    return [heading, table_header + table_name + table_columns + table_rows[:-1] + "\n|}"]


def _build_wikipage(table, df):
    """Return (lang, wikitext) for one table, or None when it cannot be sampled."""
    try:
        # Every row of a table is expected to share one language code, so a single
        # sampled value is enough -- TODO confirm against the database schema.
        lang = str(df["lang"].sample(n=1).iloc[0])
        sampled = [
            (column, df[column].sample(n=_SAMPLE_SIZE).values)
            for column in df.columns
            if column not in _STANDARDIZED_FIELDS
        ]
    except ValueError as e:
        # Series.sample raises ValueError when the table holds fewer rows than requested.
        print("country: {} cannot be sampled.\n Error: {}\n Total objects is {}. Skipping country.".format(table, e, len(df)))
        return None
    sections = []
    for column, examples in sampled:
        sections.extend(_column_wikitable(table, column, examples))
    h1 = "= Non-standardized fields from table " + table + "=\n"
    return lang, h1 + "\n".join(sections)


def _write_examples_file(directory, stem, wikipage):
    """Write wikipage to <directory>/<stem>.examples, creating the directory if needed."""
    existed = os.path.isdir(directory)
    if not existed:
        os.makedirs(directory)
    with open(directory + "/" + stem + ".examples", "w") as out:
        out.write(wikipage)
    if existed:
        print("Directory {} exists. Wrote file {}".format(directory, out.name))
    else:
        print("{} doesn't exist. Wrote file {}".format(directory, out.name))


def create_source_table_value_examples(tables=None, connection=None, output_root="."):
    """Write wikitext pages with random example values for every source table.

    For each table, all rows are read into a DataFrame and 10 random values are
    sampled from every column that is not listed as standardized. The samples are
    rendered as one wikitable per column and the resulting page is written twice:
    to <output_root>/langfiles_source/<lang>.examples and to
    <output_root>/countryfiles_source/<table>.examples.

    Tables with fewer rows than the sample size are reported and skipped; no file
    is written for them.

    Parameters:
        tables: iterable of table names. Defaults to the content of
            ./final_tables.pickle.
        connection: DB-API connection usable by pandas.read_sql. Defaults to the
            notebook-level `conn`.
        output_root: directory under which the two output directories are created.
            Defaults to the current directory.

    Returns:
        None. Progress is reported with print().
    """
    if tables is None:
        tables = _load_final_tables()
    if connection is None:
        connection = conn  # connection opened in an earlier notebook cell
    for table in tables:
        # Query the table being processed (not a fixed one) so each page matches its
        # file name. Backticks quote the identifier; embedded backticks are doubled.
        sql = "SELECT * FROM `{}`".format(table.replace("`", "``"))
        df = pd.read_sql(sql, connection)
        page = _build_wikipage(table, df)
        if page is None:
            continue
        lang, wikipage = page
        # Tables sharing a language code write to the same lang file; the last one wins.
        _write_examples_file(os.path.join(output_root, "langfiles_source"), lang, wikipage)
        _write_examples_file(os.path.join(output_root, "countryfiles_source"), table, wikipage)
In [23]:
# Generate the example wikitables; files are written under ./langfiles_source and ./countryfiles_source.
create_source_table_value_examples()
In [ ]: