In [13]:
# native python
import concurrent.futures
import datetime
import glob
import io
import logging
import os.path
import os
import shlex
import urllib.parse
import json
# numeric stuff
import numpy as np
import scipy.stats
import pandas # for tables
# converting things
import pathlib
import pydash # for functional stuff
import dateutil.parser # garbage date/times
# image processing
import skimage.io
import skimage.transform
from PIL import Image
# plotting
import matplotlib.pyplot as plt # plotting
import matplotlib.dates # date axes
import seaborn
# web requests
import ssl
import requests # urls
import mako.template # html formatting
import IPython.display # notebook
import tqdm # progress bar
%matplotlib inline
# replace default logging
#del logging.root.handlers[0]
#logging.basicConfig(level=logging.INFO)
In [2]:
# Create a SPARQL query to get all the oil paintings from Wikidata
query = """
SELECT ?item ?itemLabel ?_image ?_inception ?_creator ?_creatorLabel WHERE {
# select all paintings
?item wdt:P31 wd:Q3305213.
# made with oil
?item wdt:P186 wd:Q296955.
# fetch labels in English
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
# store the image
OPTIONAL { ?item wdt:P18 ?_image. }
# store the creation date
OPTIONAL { ?item wdt:P571 ?_inception. }
# store the creator
OPTIONAL { ?item wdt:P170 ?_creator. }
}
"""
In [3]:
url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
resp = requests.get(url,
                    params=dict(query=query),
                    headers={'Accept': 'application/json'})
data = resp.json()
In [4]:
# optionally take a random sample; set sample to an integer sample size
sample = False
if sample:
    data['results']['bindings'] = np.random.choice(data['results']['bindings'], replace=False, size=sample)
len(data['results']['bindings'])
In [5]:
# parse the dates
for row in data['results']['bindings']:
    # try to parse the inception date
    if '_inception' in row and row['_inception']['type'] == 'literal':
        try:
            date = row['_inception']['value']
            row['date'] = dateutil.parser.parse(date)
        except ValueError:
            logging.info("not parsed %s\n%s", date, row)
    # keep only the entity id (the last part of the item URI)
    if pydash.has(row, 'item.value'):
        entity = row['item']['value'].split("/")[-1]
        row['entity'] = entity
In [6]:
rows = []
for row in data['results']['bindings']:
    creator = pydash.get(row, '_creatorLabel.value')
    name = pydash.get(row, 'itemLabel.value')
    datestr = pydash.get(row, '_inception.value')
    datetype = pydash.get(row, '_inception.type')
    img = pydash.get(row, '_image.value')
    entity = pydash.get(row, 'entity')
    # default to no date; only overwrite when parsing succeeds
    date = None
    if datestr and datetype == 'literal':
        try:
            date = dateutil.parser.parse(datestr)
        except Exception:
            logging.exception("error %s\n%s", datestr, row)
    rows.append(
        dict(
            creator=creator,
            name=name,
            date=date,
            img=img,
            entity=entity
        )
    )
In [7]:
df = pandas.DataFrame(data=rows)
# drop anonymous paintings
# df.loc[df.creator == 'anonymous'] = None
# drop missing values and duplicates
df = df.dropna().drop_duplicates()
# calculate a new index
df = df.reset_index()
def url2name(url):
    """convert a Special:FilePath url to a File: page title"""
    return urllib.parse.unquote(url.split(':')[-1].replace('FilePath/', 'File:'))
df['wiki_tag'] = df['img'].apply(url2name)
len(df)
Out[7]:
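For reference, url2name turns the Special:FilePath image URLs returned by the SPARQL query into File: page titles that the Commons API accepts; a quick check (the filename below is made up for illustration):

url2name('http://commons.wikimedia.org/wiki/Special:FilePath/Some%20Painting.jpg')
# -> 'File:Some Painting.jpg'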
In [8]:
df.set_index('entity').head()
df.to_json('paintings.json')
In [9]:
imgs = df.img
imgs.to_csv('urls.txt', index=False)
In [84]:
paintings_df = pandas.read_json('paintings.json')
paintings_df = paintings_df.reset_index().drop('level_0', axis=1)
# split the paintings into chunks of 50 titles (the Commons API limit per query)
paintings_df['chunk'] = paintings_df.index // 50
def special2normal(url):
    '''convert a Special:FilePath url to a normal File: page url'''
    return (
        urllib.parse.unquote(url)
        .replace('/Special:FilePath/', '/File:')
        .replace('http://', 'https://')
        .replace(' ', '_')
    )
paintings_df['descriptionurl'] = paintings_df['img'].apply(special2normal)
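A quick check of the conversion (again with a made-up filename): the result should match the descriptionurl field that the Commons API returns, which is the key used for the merge further down.

special2normal('http://commons.wikimedia.org/wiki/Special:FilePath/Some%20Painting.jpg')
# -> 'https://commons.wikimedia.org/wiki/File:Some_Painting.jpg'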
In [19]:
# note: a CSRF token can be fetched from /w/api.php?action=query&format=json&meta=tokens&type=csrf%7Clogin
# (not needed for these read-only imageinfo queries)
api_urls = []
groups = paintings_df.groupby('chunk')
for chunk_id, chunk in tqdm.tqdm_notebook(groups, desc='groups'):
    names = "|".join(chunk.img.apply(url2name))
    url = 'https://commons.wikimedia.org/w/api.php'
    params = {
        "action": "query",
        "titles": names,
        "prop": "imageinfo",
        "iiprop": "timestamp|thumbmime|bitdepth|dimensions|sha1|url|mediatype|metadata|extmetadata",
        "format": "json"
    }
    headers = {
        'Accept': 'application/json',
        'user-agent': 'Painting Database'
    }
    api_urls.append({
        "url": url,
        "params": params,
        "headers": headers
    })
with open('api_urls.json', 'w') as f:
    json.dump(api_urls, f)
# run scrapy spider for urls
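The spider itself is not part of this notebook. As a rough stand-in, assuming wiki_batches.json is simply the list of raw JSON responses that the next cell expects, the queued requests could also be replayed directly with requests:

# minimal sketch of what the spider is assumed to do: replay the queued requests
# and save the JSON responses as wiki_batches.json (file name and response layout
# are assumptions based on the cell below)
import time

batches = []
for req in tqdm.tqdm_notebook(api_urls, desc='api'):
    resp = requests.get(req['url'], params=req['params'], headers=req['headers'])
    resp.raise_for_status()
    batches.append(resp.json())
    time.sleep(1)  # arbitrary politeness delay
with open('wiki_batches.json', 'w') as f:
    json.dump(batches, f)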
In [31]:
with open('./wiki_batches.json') as f:
    batches = json.load(f)
rows = []
for batch in batches:
    for wiki_id, page in batch['query']['pages'].items():
        row = {}
        assert len(pydash.get(page, 'imageinfo', [])) == 1
        row['wiki_page_id'] = pydash.get(page, 'pageid')
        row['ns'] = pydash.get(page, 'ns')
        row['title'] = pydash.get(page, 'title')
        row['url'] = pydash.get(page, 'imageinfo.0.url')
        row['height'] = pydash.get(page, 'imageinfo.0.height')
        row['width'] = pydash.get(page, 'imageinfo.0.width')
        row['descriptionurl'] = pydash.get(page, 'imageinfo.0.descriptionurl')
        row['descriptionshorturl'] = pydash.get(page, 'imageinfo.0.descriptionshorturl')
        row['sha1'] = pydash.get(page, 'imageinfo.0.sha1').upper()
        row['metadata'] = pydash.get(page, 'imageinfo.0.metadata')
        rows.append(row)
In [32]:
wikimedia_df = pandas.DataFrame.from_dict(rows)
wikimedia_df.to_json('wikimedia.json')
In [154]:
'''
rows = []
for fname in tqdm.tqdm_notebook(os.listdir('/Users/fedor/data/highres/'), desc='files'):
    fullpath = os.path.join('/Users/fedor/data/highres/', fname)
    quoted_path = shlex.quote(fullpath)
    sha1 = !shasum $quoted_path | cut -f 1 -d ' '
    row = {
        "filename": fname,
        "sha1": sha1[0].strip().upper(),
        "size": os.stat(fullpath).st_size
    }
    rows.append(row)
files_df = pandas.DataFrame(rows)
files_df.to_json('files.json')
'''
Out[154]:
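The disabled cell above shells out to shasum for the separately downloaded image files. A pure-Python sketch of the same index, using hashlib instead of the shell (same directory and output file as in the snippet above):

import hashlib

def file_sha1(path, chunk_size=1 << 20):
    """hex sha1 of a file, read in 1 MiB chunks"""
    h = hashlib.sha1()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest().upper()

rows = []
for fname in os.listdir('/Users/fedor/data/highres/'):
    fullpath = os.path.join('/Users/fedor/data/highres/', fname)
    rows.append({
        "filename": fname,
        "sha1": file_sha1(fullpath),
        "size": os.stat(fullpath).st_size,
    })
files_df = pandas.DataFrame(rows)
files_df.to_json('files.json')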
In [ ]:
wikimedia_df = pandas.read_json('wikimedia.json')
wikimedia_df = wikimedia_df.reset_index().drop('index', axis=1)
In [36]:
with open('headers.json') as f:
    records = json.load(f)
# header values come as lists of one or more items; unwrap the singletons
for record in records:
    for key, item in record.items():
        if isinstance(item, list) and len(item) == 1:
            record[key] = item[0]
headers_df = pandas.DataFrame.from_records(records)
# the headers store the sha1 in base 36; convert it to base 16 (40 hex digits)
headers_df['sha1'] = headers_df['X-Object-Meta-Sha1Base36'].apply(
    lambda x: np.base_repr(int(x, base=36), 16).zfill(40)
)
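A quick sanity check of the conversion, using a toy value rather than anything from the data: the SHA-1 of an empty string should survive a round trip through base 36.

import hashlib
hex_sha1 = hashlib.sha1(b'').hexdigest().upper()
as_base36 = np.base_repr(int(hex_sha1, 16), 36)
assert np.base_repr(int(as_base36, 36), 16).zfill(40) == hex_sha1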
In [155]:
files_df = pandas.read_json('files.json')
# drop index
files_df = files_df.reset_index().drop('index', axis=1)
In [156]:
# pick a sha1 to spot-check the joins below
sha1 = wikimedia_df.iloc[1]['sha1']
# use a fixed example value instead
sha1 = '711F33544E5014D037955009D12B82BBE06C0CEE'
In [157]:
W = wikimedia_df.loc[wikimedia_df.sha1 == sha1].iloc[0]
H = headers_df.loc[headers_df.sha1 == sha1].iloc[0]
F = files_df.loc[files_df.sha1 == sha1].iloc[0]
P = paintings_df.loc[paintings_df.descriptionurl == W.descriptionurl]
W, H, F, P
Out[157]:
In [158]:
wiki_headers = pandas.merge(wikimedia_df, headers_df, how='outer', on='sha1')
wiki_headers_files = pandas.merge(wiki_headers, files_df, how='outer', on='sha1')
In [159]:
merged = pandas.merge(wiki_headers_files, paintings_df, how='outer', on='descriptionurl')
In [160]:
# paintings that were downloaded but are no longer on the wiki
len(merged.loc[pandas.isnull(merged.descriptionurl)])
Out[160]:
In [164]:
# an example of a painting whose file is missing locally
merged.loc[pandas.isnull(merged.filename)].iloc[0].descriptionurl
Out[164]: