In [4]:
import pymysql
import os
import csv
In [5]:
ALL_WIKI_AGGREGATION_QUERY = """
SELECT
timestamp AS month,
SUM(weighted_sum) AS weighted_sum,
SUM(LOG(weighted_sum)) AS weighted_log_sum,
SUM(prediction = "Stub") AS stub_n,
SUM(prediction = "Start") AS start_n,
SUM(prediction = "C") AS c_n,
SUM(prediction = "B") AS b_n,
SUM(prediction = "GA") AS ga_n,
SUM(prediction = "FA") AS fa_n,
COUNT(*) AS n
FROM {datasets_db_name}.monthly_wp10_enwiki
GROUP BY month;
"""
WIKIPROJECT_AGGREGATION_QUERY = """
SELECT
monthly_aq.timestamp AS month,
SUM(weighted_sum) AS weighted_sum,
SUM(LOG(weighted_sum)) AS weighted_log_sum,
SUM(prediction = "Stub") AS stub_n,
SUM(prediction = "Start") AS start_n,
SUM(prediction = "C") AS c_n,
SUM(prediction = "B") AS b_n,
SUM(prediction = "GA") AS ga_n,
SUM(prediction = "FA") AS fa_n,
COUNT(*) AS n
FROM {enwiki_db_name}.page AS talk
INNER JOIN {enwiki_db_name}.page AS article ON
talk.page_title = article.page_title AND
article.page_namespace = 0
INNER JOIN {enwiki_db_name}.templatelinks USE INDEX (tl_namespace) ON
tl_from = talk.page_id
INNER JOIN {datasets_db_name}.monthly_wp10_enwiki AS monthly_aq ON
article.page_id = monthly_aq.page_id
WHERE
talk.page_namespace = 1 AND
tl_namespace = 10 AND
(
tl_title = %(project_template)s OR
tl_title IN (
SELECT page.page_title
FROM {enwiki_db_name}.pagelinks
INNER JOIN {enwiki_db_name}.page ON page_id = pl_from
WHERE
pl_namespace = 10 AND
pl_title = %(project_template)s AND
pl_from_namespace = 10 AND
page_is_redirect
)
)
GROUP BY month;
"""
In [6]:
class DBMonthlyStats:
def __init__(self, config):
self.conn = pymysql.connect(
host=config.get('database', 'replica_host'),
read_default_file=config.get('database', 'read_default_file'))
self.all_wiki_aggregation_query = ALL_WIKI_AGGREGATION_QUERY.format(
datasets_db_name=config.get('database', 'datasets_db_name'))
self.wikiproject_aggregation_query = WIKIPROJECT_AGGREGATION_QUERY.format(
datasets_db_name=config.get('database', 'datasets_db_name'),
enwiki_db_name=config.get('database', 'enwiki_db_name'))
def all_wiki_aggregation(self):
"""
Generate a cross-wiki monthly-aggregate dataset.
Returns a cursor that iterates over tuples (rows of the result set).
"""
with self.conn.cursor() as cursor:
cursor.execute(self.all_wiki_aggregation_query)
return cursor
def wikiproject_aggregation(self, project_template):
"""
Genrerate a wikiproject-specific monthly-aggregate dataset.
Returns a cursor that iterates over tuples (rows of the result set).
"""
with self.conn.cursor() as cursor:
cursor.execute(self.wikiproject_aggregation_query,
{'project_template': project_template})
return cursor
def dump_aggregation(cursor, file):
headers = [i[0] for i in cursor.description]
writer = csv.writer(file, delimiter='\t', quoting=csv.QUOTE_NONE, fieldnames=headers)
writer.writeheader()
for row in cursor:
writer.writerow(row)
In [ ]: