In [ ]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 100
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sb
import json
import networkx as nx
import statsmodels.formula.api as smf
In [ ]:
state_codings = {1: "Alabama", 2: "Alaska", 4: "Arizona", 5: "Arkansas", 6: "California",
8: "Colorado", 9: "Connecticut", 10: "Delaware", 11: "District Of Columbia",
12: "Florida", 13: "Georgia", 15: "Hawaii", 16: "Idaho", 17: "Illinois",
18: "Indiana", 19: "Iowa", 20: "Kansas", 21: "Kentucky", 22: "Louisiana",
23: "Maine", 24: "Maryland", 25: "Massachusetts", 26: "Michigan",
27: "Minnesota", 28: "Mississippi", 29: "Missouri", 30: "Montana",
31: "Nebraska", 32: "Nevada", 33: "New Hampshire", 34: "New Jersey",
35: "New Mexico", 36: "New York", 37: "North Carolina", 38: "North Dakota",
39: "Ohio", 40: "Oklahoma", 41: "Oregon", 42: "Pennsylvania",
43: "Puerto Rico", 44: "Rhode Island", 45: "South Carolina",
46: "South Dakota", 47: "Tennessee", 48: "Texas", 49: "Utah",
50: "Vermont", 52: "Virgin Islands", 51: "Virginia", 53: "Washington",
54: "West Virginia", 55: "Wisconsin", 56: "Wyoming"}
In [ ]:
counts_df = pd.read_csv('accident_counts.csv',encoding='utf8')
counts_df['WEEKDAY'] = pd.to_datetime(counts_df[['YEAR','MONTH','DAY']]).apply(lambda x:x.weekday())
print('{0:,} rows'.format(len(counts_df)))
counts_df.head()
population_estimates = pd.read_csv('population_estimates.csv',encoding='utf8',index_col=0)
In [ ]:
sb.factorplot(x='HOUR',y='ST_CASE',hue='WEEKDAY',data=counts_df,
aspect=2,order=range(24),palette='nipy_spectral',dodge=.5)
In [ ]:
sb.factorplot(x='MONTH',y='ST_CASE',hue='WEEKDAY',data=counts_df,
aspect=2,order=range(1,13),palette='nipy_spectral',dodge=.5)
One interesting source of exogenous variation is Colorado's and Washington's legalization of cannabis in 2014. If cannabis usage increased following legalization and this translated into more impaired driving, then there should be an increase in the number of fatal auto accidents in these states after 2014.
In [ ]:
annual_state_counts_df = counts_df.groupby(['STATE','YEAR']).agg({'ST_CASE':np.sum,'DRUNK_DR':np.sum,'FATALS':np.sum}).reset_index()
annual_state_counts_df = pd.merge(annual_state_counts_df,population_estimates,
left_on=['STATE','YEAR'],right_on=['State','Year']
)
annual_state_counts_df = annual_state_counts_df[['STATE','YEAR','ST_CASE','DRUNK_DR','FATALS','Population']]
annual_state_counts_df.head()
Visualize accident statistics for Colorado, Washington, New Mexico (similar to Colorado), and Oregon (similar to Washington).
In [ ]:
_cols = ['ST_CASE','DRUNK_DR','FATALS']
annual_co_counts = annual_state_counts_df[(annual_state_counts_df['STATE'] == "Colorado") & (annual_state_counts_df['YEAR'] > 2010)].set_index('YEAR')[_cols]
annual_wa_counts = annual_state_counts_df[(annual_state_counts_df['STATE'] == "Washington") & (annual_state_counts_df['YEAR'] > 2010)].set_index('YEAR')[_cols]
annual_nm_counts = annual_state_counts_df[(annual_state_counts_df['STATE'] == "New Mexico") & (annual_state_counts_df['YEAR'] > 2010)].set_index('YEAR')[_cols]
annual_or_counts = annual_state_counts_df[(annual_state_counts_df['STATE'] == "Oregon") & (annual_state_counts_df['YEAR'] > 2010)].set_index('YEAR')[_cols]
# Make the figures
f,axs = plt.subplots(3,1,figsize=(10,6),sharex=True)
# Plot the cases
annual_co_counts.plot.line(y='ST_CASE',c='blue',ax=axs[0],legend=False,lw=3)
annual_wa_counts.plot.line(y='ST_CASE',c='green',ax=axs[0],legend=False,lw=3)
annual_nm_counts.plot.line(y='ST_CASE',c='red',ls='--',ax=axs[0],legend=False,lw=3)
annual_or_counts.plot.line(y='ST_CASE',c='orange',ls='--',ax=axs[0],legend=False,lw=3)
axs[0].set_ylabel('Fatal Incidents')
# Plot the drunk driving cases
annual_co_counts.plot.line(y='DRUNK_DR',c='blue',ax=axs[1],legend=False,lw=3)
annual_wa_counts.plot.line(y='DRUNK_DR',c='green',ax=axs[1],legend=False,lw=3)
annual_nm_counts.plot.line(y='DRUNK_DR',c='red',ls='--',ax=axs[1],legend=False,lw=3)
annual_or_counts.plot.line(y='DRUNK_DR',c='orange',ls='--',ax=axs[1],legend=False,lw=3)
axs[1].set_ylabel('Drunk Driving')
# Plot the fatalities
annual_co_counts.plot.line(y='FATALS',c='blue',ax=axs[2],legend=False,lw=3)
annual_wa_counts.plot.line(y='FATALS',c='green',ax=axs[2],legend=False,lw=3)
annual_nm_counts.plot.line(y='FATALS',c='red',ls='--',ax=axs[2],legend=False,lw=3)
annual_or_counts.plot.line(y='FATALS',c='orange',ls='--',ax=axs[2],legend=False,lw=3)
axs[2].set_ylabel('Total Fatalities')
# Plot 2014 legalization
for ax in axs:
ax.axvline(x=2014,c='r')
# Stuff for legend
b = mlines.Line2D([],[],color='blue',label='Colorado',linewidth=3)
g = mlines.Line2D([],[],color='green',label='Washington',linewidth=3)
r = mlines.Line2D([],[],color='red',linestyle='--',label='New Mexico',linewidth=3)
o = mlines.Line2D([],[],color='orange',linestyle='--',label='Oregon',linewidth=3)
axs[2].legend(loc='lower center',ncol=4,handles=[b,g,r,o],fontsize=12,bbox_to_anchor=(.5,-.75))
f.tight_layout()
In [ ]:
annual_state_counts_df['Treated'] = np.where(annual_state_counts_df['STATE'].isin(['Colorado','Washington']),1,0)
annual_state_counts_df['Time'] = np.where(annual_state_counts_df['YEAR'] >= 2014,1,0)
annual_state_counts_df = annual_state_counts_df[annual_state_counts_df['YEAR'] >= 2010]
annual_state_counts_df.query('STATE == "Colorado"')
We'll specify a difference-in-differences design with "Treated" states (those that legalized) and "Time" (the post-legalization years), while controlling for differences in population. The Treated:Time interaction is the difference-in-differences estimate, which is not statistically significant. This suggests legalization did not increase the risk of fatal auto accidents in these states.
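As a quick check before the regressions, the same comparison can be made by hand from the four Treated-by-Time group means (a sketch, not part of the original analysis; it ignores the Population control, so it will not match the regression coefficient exactly).
In [ ]:
# Sketch: unadjusted 2x2 difference-in-differences for fatal incidents,
# computed directly from the Treated x Time group means.
group_means = annual_state_counts_df.groupby(['Treated','Time'])['ST_CASE'].mean()
did_estimate = ((group_means.loc[(1,1)] - group_means.loc[(1,0)])
                - (group_means.loc[(0,1)] - group_means.loc[(0,0)]))
print('Unadjusted DiD estimate: {0:.2f}'.format(did_estimate))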
In [ ]:
m_cases = smf.ols(formula = 'ST_CASE ~ Treated*Time + Population',
data = annual_state_counts_df).fit()
print(m_cases.summary())
In [ ]:
m_cases = smf.ols(formula = 'FATALS ~ Treated*Time + Population',
data = annual_state_counts_df).fit()
print(m_cases.summary())
In [ ]:
m_cases = smf.ols(formula = 'DRUNK_DR ~ Treated*Time + Population',
data = annual_state_counts_df).fit()
print(m_cases.summary())
There's another exogenous source of variation in this car crash data we can exploit: the unofficial cannabis enthusiast holiday of April 20. If consumption increases on this day compared to a week before or after (April 13 and 27), does this explain changes in fatal auto accidents?
In [ ]:
counts_df.head()
In [ ]:
population_estimates.head()
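As a quick first pass, the mean number of fatal incidents on April 13, 20, and 27 can be compared directly in the raw counts (a sketch; it skips the re-indexing and population merge done below).
In [ ]:
# Sketch: mean fatal incidents per state-hour record on April 13, 20, and 27,
# pooled across all years in counts_df (missing state-hour combinations are
# simply absent here rather than filled with zeros).
april_quick = counts_df.query('MONTH == 4 & DAY in [13, 20, 27]')
print(april_quick.groupby('DAY')['ST_CASE'].mean())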
In [ ]:
# Select only data after 2004 in the month of April
april_df = counts_df.query('MONTH == 4 & YEAR > 2004').set_index(['STATE','HOUR','MONTH','DAY','YEAR'])
# Re-index the data to fill in missing dates
ix = pd.MultiIndex.from_product([state_codings.values(),range(0,24),[4],range(1,31),range(2005,2017)],
names = ['STATE','HOUR','MONTH','DAY','YEAR'])
april_df = april_df.reindex(ix).fillna(0)
april_df.reset_index(inplace=True)
# Add in population data
april_df = pd.merge(april_df,population_estimates,
left_on=['STATE','YEAR'],right_on=['State','Year'])
april_df = april_df[[i for i in april_df.columns if i not in ['Year','State']]]
# Inspect
april_df.head()
In [ ]:
# Calculate whether day is a Friday, Saturday, or Sunday
april_df['Weekday'] = pd.to_datetime(april_df[['YEAR','MONTH','DAY']]).apply(lambda x:x.weekday())
april_df['Weekend'] = np.where(april_df['Weekday'] >= 4,1,0)
# Treated days are on April 20
april_df['Fourtwenty'] = np.where(april_df['DAY'] == 20,1,0)
april_df['Legal'] = np.where((april_df['STATE'].isin(['Colorado','Washington'])) & (april_df['YEAR'] >= 2014),1,0)
# Examine data for a week before and after April 20
april_df = april_df[april_df['DAY'].isin([13,20,27])]
# Inspect Colorado data
april_df.query('STATE == "Colorado"').sort_values(['YEAR','DAY'])
Inspect how the number of cases on April 20 compares to the week before and after.
In [ ]:
sb.factorplot(x='DAY',y='ST_CASE',data=april_df,kind='bar',palette=['grey','green','grey'])
Estimate the models. The Fourtwenty and Legal dummy variables (and their interaction) capture whether fatal accidents increased on April 20 compared to the week beforehand and afterwards, while controlling for state legality, year, whether it's a weekend, and state population. We do not observe a statistically significant increase in the number of incidents, alcohol-involved incidents, or total fatalities on April 20.
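Each model below fits the same specification, shown here for the incident counts with $i$ indexing a state-day observation and $t$ the year (the other two outcomes just swap the left-hand side); the coefficients of interest are $\beta_1$ on Fourtwenty and $\beta_3$ on the interaction:

$$
\mathrm{ST\_CASE}_{it} = \beta_0 + \beta_1\,\mathrm{Fourtwenty}_{it} + \beta_2\,\mathrm{Legal}_{it} + \beta_3\,(\mathrm{Fourtwenty} \times \mathrm{Legal})_{it} + \beta_4\,\mathrm{YEAR}_t + \beta_5\,\mathrm{Weekend}_{it} + \beta_6\,\mathrm{Population}_{it} + \varepsilon_{it}
$$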
In [ ]:
m_cases_420 = smf.ols(formula = 'ST_CASE ~ Fourtwenty*Legal + YEAR + Weekend + Population',
data = april_df).fit()
print(m_cases_420.summary())
In [ ]:
m_drunk_420 = smf.ols(formula = 'DRUNK_DR ~ Fourtwenty*Legal + YEAR + Weekend + Population',
data = april_df).fit()
print(m_drunk_420.summary())
In [ ]:
m_fatal_420 = smf.ols(formula = 'FATALS ~ Fourtwenty*Legal + YEAR + Weekend + Population',
data = april_df).fit()
print(m_fatal_420.summary())
On November 8, 2016, California voters passed Proposition 64, legalizing recreational use of cannabis. On January 1, 2018, recreational sales began. The following two files capture the daily pageviews for the article Cannabis in California as well as the daily pageviews for all the other pages it links to.
In [ ]:
ca2016 = pd.read_csv('wikipv_ca_2016.csv',encoding='utf8',parse_dates=['timestamp']).set_index('timestamp')
ca2018 = pd.read_csv('wikipv_ca_2018.csv',encoding='utf8',parse_dates=['timestamp']).set_index('timestamp')
ca2016.head()
Here are two plots of the pageview dynamics for the seed "Cannabis in California" article.
In [ ]:
f,axs = plt.subplots(2,1,figsize=(10,5))
ca2016['Cannabis in California'].plot(ax=axs[0],color='red',lw=3)
ca2018['Cannabis in California'].plot(ax=axs[1],color='blue',lw=3)
axs[0].axvline(pd.Timestamp('2016-11-08'),c='k',ls='--')
axs[1].axvline(pd.Timestamp('2018-01-01'),c='k',ls='--')
for ax in axs:
ax.set_ylabel('Pageviews')
f.tight_layout()
Here is a visualization of the local hyperlink network around "Cannabis in California."
In [ ]:
g = nx.read_gexf('wikilinks_cannabis_in_california.gexf')
f,ax = plt.subplots(1,1,figsize=(10,10))
g_pos = nx.layout.kamada_kawai_layout(g)
nx.draw(G = g,
ax = ax,
pos = g_pos,
with_labels = True,
node_size = [dc*(len(g) - 1)*10 for dc in nx.degree_centrality(g).values()],
font_size = 7.5,
font_weight = 'bold',
node_color = 'tomato',
edge_color = 'grey'
)
What kinds of causal arguments could you make from these pageview data and the hyperlink networks?
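One illustrative possibility (a sketch, not part of the original analysis) is an interrupted time series on the seed article's pageviews around the November 8, 2016 ballot date, regressing daily views on a post-election indicator and a linear time trend.
In [ ]:
# Sketch: simple interrupted time series for the "Cannabis in California"
# article's daily pageviews around the Proposition 64 vote.
its_df = ca2016['Cannabis in California'].rename('pv').reset_index()
its_df['post'] = np.where(its_df['timestamp'] >= pd.Timestamp('2016-11-08'), 1, 0)
its_df['t'] = np.arange(len(its_df))
m_its = smf.ols(formula='pv ~ post + t', data=its_df).fit()
print(m_its.summary())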
In [ ]:
"accidents.csv" is a ~450MB file after concatenating the raw annual data from NHTSA FARS.
In [ ]:
all_accident_df = pd.read_csv('accidents.csv',encoding='utf8',index_col=0)
all_accident_df.head()
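If memory is tight, the same file can also be read in chunks restricted to the columns used in the aggregation below (a sketch; it assumes those column names are present in the concatenated FARS file).
In [ ]:
# Sketch: chunked, column-restricted read of the ~450MB accidents file.
_usecols = ['STATE','HOUR','DAY','MONTH','YEAR','ST_CASE','DRUNK_DR','FATALS']
_chunks = pd.read_csv('accidents.csv',encoding='utf8',usecols=_usecols,chunksize=100000)
all_accident_df = pd.concat(_chunks)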
The state population estimates for 2010-2017 come from the U.S. Census Bureau.
In [ ]:
population_estimates = pd.read_csv('census_pop_estimates.csv')
_cols = [i for i in population_estimates.columns if "POPESTIMATE" in i] + ['NAME']
population_estimates_stacked = population_estimates[_cols].set_index('NAME').unstack().reset_index()
population_estimates_stacked.rename(columns={'level_0':'Year','NAME':'State',0:'Population'},inplace=True)
population_estimates_stacked['Year'] = population_estimates_stacked['Year'].str.slice(-4).astype(int)
population_estimates_stacked = population_estimates_stacked[population_estimates_stacked['State'].isin(state_codings.values())]
population_estimates_stacked.dropna(subset=['Population'],inplace=True)
population_estimates_stacked.to_csv('population_estimates.csv',encoding='utf8')
Groupby-aggregate the data by state, hour, day, month, and year, counting the number of cases, drunk drivers involved, and total fatalities. Save the data as "accident_counts.csv".
In [ ]:
gb_vars = ['STATE','HOUR','DAY','MONTH','YEAR']
agg_dict = {'ST_CASE':len,'DRUNK_DR':np.sum,'FATALS':np.sum}
counts_df = all_accident_df.groupby(gb_vars).agg(agg_dict).reset_index()
counts_df['STATE'] = counts_df['STATE'].map(state_codings)
counts_df = counts_df.query('YEAR > 1999')
counts_df.to_csv('accident_counts.csv',encoding='utf8',index=False)
counts_df.head()
In [ ]:
from datetime import datetime
import requests, json
from bs4 import BeautifulSoup
from urllib.parse import urlparse, quote, unquote
import networkx as nx
def get_page_outlinks(page_title,lang='en',redirects=1):
"""Takes a page title and returns a list of wiki-links on the page. The
list may contain duplicates and the position in the list is approximately
where the links occurred.
page_title - a string with the title of the page on Wikipedia
lang - a string (typically two letter ISO 639-1 code) for the language
edition, defaults to "en"
redirects - 1 or 0 for whether to follow page redirects, defaults to 1
Returns:
outlinks_list - a list of the page titles linked from the page
"""
# Replace spaces with underscores
page_title = page_title.replace(' ','_')
bad_titles = ['Special:','Wikipedia:','Help:','Template:','Category:','International Standard','Portal:','s:','File:','Digital object identifier','(page does not exist)']
# Get the response from the API for a query
# After passing a page title, the API returns the HTML markup of the current article version within a JSON payload
req = requests.get('https://{2}.wikipedia.org/w/api.php?action=parse&format=json&page={0}&redirects={1}&prop=text&disableeditsection=1&disabletoc=1'.format(page_title,redirects,lang))
# Read the response into JSON to parse and extract the HTML
json_string = json.loads(req.text)
# Initialize an empty list to store the links
outlinks_list = []
if 'parse' in json_string.keys():
page_html = json_string['parse']['text']['*']
# Parse the HTML into Beautiful Soup
soup = BeautifulSoup(page_html,'lxml')
# Remove sections at end
bad_sections = ['See_also','Notes','References','Bibliography','External_links']
sections = soup.find_all('h2')
for section in sections:
if section.span['id'] in bad_sections:
# Clean out the divs
div_siblings = section.find_next_siblings('div')
for sibling in div_siblings:
sibling.clear()
# Clean out the ULs
ul_siblings = section.find_next_siblings('ul')
for sibling in ul_siblings:
sibling.clear()
# Delete tags associated with templates
for tag in soup.find_all('tr'):
tag.replace_with('')
# For each paragraph tag, extract the titles within the links
for para in soup.find_all('p'):
for link in para.find_all('a'):
if link.has_attr('title'):
title = link['title']
# Ignore links that aren't interesting or are redlinks
if all(bad not in title for bad in bad_titles) and 'redlink' not in link['href']:
outlinks_list.append(title)
# For each unordered list, extract the titles within the child links
for unordered_list in soup.find_all('ul'):
for item in unordered_list.find_all('li'):
for link in item.find_all('a'):
if link.has_attr('title'):
title = link['title']
# Ignore links that aren't interesting or are redlinks
if all(bad not in title for bad in bad_titles) and 'redlink' not in link['href']:
outlinks_list.append(title)
return outlinks_list
def get_pageviews(page_title,lang='en',date_from='20150701',date_to=str(datetime.today().date()).replace('-','')):
"""Takes Wikipedia page title and returns a all the various pageview records
page_title - a string with the title of the page on Wikipedia
lang - a string (typically two letter ISO 639-1 code) for the language edition,
defaults to "en"
date_from - a date string in a YYYYMMDD format, defaults to 20150701
date_to - a date string in a YYYYMMDD format, defaults to today
Returns:
concat_df - a DataFrame indexed by date and multi-columned by agent and access type
"""
quoted_page_title = quote(page_title, safe='')
df_list = []
#for access in ['all-access','desktop','mobile-app','mobile-web']:
#for agent in ['all-agents','user','spider','bot']:
s = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/{1}.wikipedia.org/{2}/{3}/{0}/daily/{4}/{5}".format(quoted_page_title,lang,'all-access','user',date_from,date_to)
json_response = requests.get(s).json()
if 'items' in json_response:
df = pd.DataFrame(json_response['items'])
df_list.append(df)
concat_df = pd.concat(df_list)
concat_df['timestamp'] = pd.to_datetime(concat_df['timestamp'],format='%Y%m%d%H')
concat_df = concat_df.set_index(['timestamp','agent','access'])['views'].unstack([1,2]).sort_index(axis=1)
concat_df[('page','page')] = page_title
return concat_df
else:
print("Error on {0}".format(page_title))
pass
Get all of the links from "Cannabis in California", and add the seed article itself.
In [ ]:
ca_links = get_page_outlinks('Cannabis in California') + ['Cannabis in California']
link_d = {}
for l in list(set(ca_links)):
link_d[l] = get_page_outlinks(l)
Make a network object of these hyperlinks among articles.
In [ ]:
g = nx.DiGraph()
seed_edges = [('Cannabis in California',l) for l in link_d['Cannabis in California']]
#g.add_edges_from(seed_edges)
for page,links in link_d.items():
for link in links:
if link in link_d['Cannabis in California']:
g.add_edge(page,link)
print("There are {0:,} nodes and {1:,} edges in the network.".format(g.number_of_nodes(),g.number_of_edges()))
nx.write_gexf(g,'wikilinks_cannabis_in_california.gexf')
Get the pageviews for the articles linked from "Cannabis in California".
In [ ]:
pvs_2016 = {}
pvs_2018 = {}
pvs_2016['Cannabis in California'] = get_pageviews('Cannabis in California',
date_from='20160801',date_to='20170201')
pvs_2018['Cannabis in California'] = get_pageviews('Cannabis in California',
date_from='20171001',date_to='20180301')
for page in list(set(ca_links)):
pvs_2016[page] = get_pageviews(page,date_from='20160801',date_to='20170201')
pvs_2018[page] = get_pageviews(page,date_from='20171001',date_to='20180301')
Define a function cleanup_pageviews to make a rectangular DataFrame with dates as index, pages as columns, and pageviews as values.
In [ ]:
def cleanup_pageviews(pv_dict):
_df = pd.concat(pv_dict.values())
_df.reset_index(inplace=True)
_df.columns = _df.columns.droplevel(0)
_df.columns = ['timestamp','pageviews','page']
_df = _df.set_index(['timestamp','page']).unstack('page')['pageviews']
return _df
cleanup_pageviews(pvs_2016).to_csv('wikipv_ca_2016.csv',encoding='utf8')
cleanup_pageviews(pvs_2018).to_csv('wikipv_ca_2018.csv',encoding='utf8')