Database monthly stats

This notebook contains a basic set of methods for querying monthly article quality statistics.


In [4]:
import pymysql 
import os
import csv

Queries

These two queries do the basic database extraction. The all_wiki_aggregation query aggregates article quality statistics across the whole wiki by month, while the wikiproject_aggregation query restricts the aggregation to articles whose talk pages carry a specific WikiProject template (or a redirect to it).


In [5]:
# Whole-wiki monthly aggregate: for each month, the total (and log-total)
# weighted quality score plus a count of articles at each predicted quality
# class (Stub..FA) and the overall row count.
# {datasets_db_name} is substituted via str.format() in DBMonthlyStats.__init__.
ALL_WIKI_AGGREGATION_QUERY = """
SELECT
  timestamp AS month,
  SUM(weighted_sum) AS weighted_sum,
  SUM(LOG(weighted_sum)) AS weighted_log_sum,
  SUM(prediction = "Stub") AS stub_n,
  SUM(prediction = "Start") AS start_n,
  SUM(prediction = "C") AS c_n,
  SUM(prediction = "B") AS b_n,
  SUM(prediction = "GA") AS ga_n,
  SUM(prediction = "FA") AS fa_n,
  COUNT(*) AS n
FROM {datasets_db_name}.monthly_wp10_enwiki
GROUP BY month;
"""

# Per-WikiProject monthly aggregate.  Same measures as the whole-wiki query,
# but restricted to articles (namespace 0) whose talk page (namespace 1,
# matched by page_title) transcludes the project's banner template -- either
# the template named by the %(project_template)s parameter directly, or any
# template-namespace redirect pointing at it (the subquery on pagelinks).
# {datasets_db_name}/{enwiki_db_name} are substituted via str.format() in
# DBMonthlyStats.__init__; %(project_template)s is a pymysql query parameter.
# NOTE(review): the USE INDEX (tl_namespace) hint presumably works around a
# bad query plan on the replicas -- confirm it is still needed.
WIKIPROJECT_AGGREGATION_QUERY = """
SELECT
  monthly_aq.timestamp AS month,
  SUM(weighted_sum) AS weighted_sum,
  SUM(LOG(weighted_sum)) AS weighted_log_sum,
  SUM(prediction = "Stub") AS stub_n,
  SUM(prediction = "Start") AS start_n,
  SUM(prediction = "C") AS c_n,
  SUM(prediction = "B") AS b_n,
  SUM(prediction = "GA") AS ga_n,
  SUM(prediction = "FA") AS fa_n,
  COUNT(*) AS n
FROM {enwiki_db_name}.page AS talk
INNER JOIN {enwiki_db_name}.page AS article ON
  talk.page_title = article.page_title AND
  article.page_namespace = 0
INNER JOIN {enwiki_db_name}.templatelinks USE INDEX (tl_namespace) ON
  tl_from = talk.page_id
INNER JOIN {datasets_db_name}.monthly_wp10_enwiki AS monthly_aq ON
  article.page_id = monthly_aq.page_id
WHERE
  talk.page_namespace = 1 AND
  tl_namespace = 10 AND
  (
    tl_title = %(project_template)s OR 
    tl_title IN (
      SELECT page.page_title 
      FROM {enwiki_db_name}.pagelinks 
      INNER JOIN {enwiki_db_name}.page ON page_id = pl_from 
      WHERE 
        pl_namespace = 10 AND 
        pl_title = %(project_template)s AND 
        pl_from_namespace = 10 AND 
        page_is_redirect
    )
  )
GROUP BY month;
"""

Database connection management object


In [6]:
class DBMonthlyStats:
    """
    Wraps a pymysql connection and the two aggregation queries, with the
    database names from `config` substituted into the query templates.
    """

    def __init__(self, config):
        """
        :param config: a configparser-style object providing a 'database'
                       section with 'replica_host', 'read_default_file',
                       'datasets_db_name' and 'enwiki_db_name' options.
        """
        self.conn = pymysql.connect(
            host=config.get('database', 'replica_host'),
            read_default_file=config.get('database', 'read_default_file'))

        self.all_wiki_aggregation_query = ALL_WIKI_AGGREGATION_QUERY.format(
            datasets_db_name=config.get('database', 'datasets_db_name'))
        self.wikiproject_aggregation_query = WIKIPROJECT_AGGREGATION_QUERY.format(
            datasets_db_name=config.get('database', 'datasets_db_name'),
            enwiki_db_name=config.get('database', 'enwiki_db_name'))

    def all_wiki_aggregation(self):
        """
        Generate a cross-wiki monthly-aggregate dataset.
        Returns a cursor that iterates over tuples (rows of the result set).
        The caller is responsible for closing the cursor when done.
        """
        # Deliberately NOT a `with` block: pymysql's Cursor.__exit__ closes
        # the cursor, so `with ...: return cursor` would hand the caller a
        # closed cursor with no rows left to iterate.
        cursor = self.conn.cursor()
        cursor.execute(self.all_wiki_aggregation_query)
        return cursor

    def wikiproject_aggregation(self, project_template):
        """
        Generate a wikiproject-specific monthly-aggregate dataset.

        :param project_template: name of the WikiProject banner template,
                                 bound to %(project_template)s in the query.
        Returns a cursor that iterates over tuples (rows of the result set).
        The caller is responsible for closing the cursor when done.
        """
        # Same as above: keep the cursor open for the caller.
        cursor = self.conn.cursor()
        cursor.execute(self.wikiproject_aggregation_query,
                       {'project_template': project_template})
        return cursor
    

def dump_aggregation(cursor, file):
    """
    Write a query result set to `file` as unquoted, tab-separated values,
    with a header row taken from the cursor's column names.

    :param cursor: a DB-API cursor (or any object with a `.description`
                   of (name, ...) tuples that iterates over row tuples)
    :param file: a writable text file object
    """
    headers = [col[0] for col in cursor.description]
    # csv.writer takes no `fieldnames` and has no writeheader() -- those are
    # DictWriter features, and the rows here are plain tuples.  Write the
    # header as an ordinary row instead.
    writer = csv.writer(file, delimiter='\t', quoting=csv.QUOTE_NONE)
    writer.writerow(headers)
    for row in cursor:
        writer.writerow(row)

In [ ]: