In [2]:
# federal all domains
# retrieved 7 May, 2017
import pandas as pd
# Load the May 7 snapshot; the path is relative, so it resolves against the
# directory the kernel was started from.
may = pd.read_csv(filepath_or_buffer='../datasets/may-7-domains-30-days.csv')
In [3]:
# Load the two earlier snapshots that the comparison below is built on.
# Note the April file name uses underscores where the others use hyphens.
jan = pd.read_csv(filepath_or_buffer='../datasets/jan-29-domains-30-days.csv')
apr = pd.read_csv(filepath_or_buffer='../datasets/apr_29_domains_30_days.csv')
In [4]:
# Preview the first ten rows of the May snapshot.
may.head(n=10)
Out[4]:
In [5]:
# Preview the first ten rows of the January snapshot.
jan.head(n=10)
Out[5]:
In [6]:
# Last five rows of the January snapshot (five is the pandas default).
jan.tail(n=5)
Out[6]:
In [7]:
# Preview the first ten rows of the April snapshot.
apr.head(n=10)
Out[7]:
In [8]:
def _rank_lookup(snapshot):
    """Map each domain to ``[rank, visits, pageviews]``.

    ``rank`` is the 1-based row position inside ``snapshot``, so it is only a
    popularity rank if the rows are already ordered that way (assumed here,
    not checked). Rows are walked positionally, so the result does not depend
    on the frame carrying a default 0..n-1 index. If a domain appears more
    than once, the values of its last row win.
    """
    rows = zip(snapshot['domain'], snapshot['visits'], snapshot['pageviews'])
    return {
        domain: [position, visits, pageviews]
        for position, (domain, visits, pageviews) in enumerate(rows, start=1)
    }


def _rank_frame(lookup, prefix):
    """Turn a ``_rank_lookup`` dict into a domain-indexed DataFrame.

    Columns are named ``<prefix>_rank``, ``<prefix>_visits`` and
    ``<prefix>_pageviews``.
    """
    frame = pd.DataFrame.from_dict(lookup, orient='index')
    frame.columns = [prefix + '_rank', prefix + '_visits', prefix + '_pageviews']
    return frame


# One lookup dict and one domain-indexed frame per snapshot.
jan_rank = _rank_lookup(jan)
apr_rank = _rank_lookup(apr)
may_rank = _rank_lookup(may)

jan_df = _rank_frame(jan_rank, 'jan')
apr_df = _rank_frame(apr_rank, 'apr')
may_df = _rank_frame(may_rank, 'may')
In [9]:
# Sanity-check the April rank table (first five rows).
apr_df.head(n=5)
Out[9]:
In [10]:
# Right join on the domain index: every April domain is kept, and domains
# that are missing from the January table get NaN in the jan_* columns.
usa = pd.merge(jan_df, apr_df, how='right', left_index=True, right_index=True)
In [ ]:
In [11]:
# First five rows of the combined January/April table.
usa.head(n=5)
Out[11]:
In [12]:
# Absolute change between the January and April snapshots, per domain.
# rank_diff is January rank minus April rank, so a positive value means the
# domain holds a smaller (better) rank number in April.
usa['rank_diff'] = usa['jan_rank'].sub(usa['apr_rank'])
usa['page_diff'] = usa['apr_pageviews'].sub(usa['jan_pageviews'])
usa['visit_diff'] = usa['apr_visits'].sub(usa['jan_visits'])

# Relative change as a fraction of the January value (0.25 means +25%).
usa['pct_page_diff'] = usa['apr_pageviews'].div(usa['jan_pageviews']).sub(1)
usa['pct_visit_diff'] = usa['apr_visits'].div(usa['jan_visits']).sub(1)
In [13]:
# Largest increases in number of pageviews (top 20)
usa.sort_values('page_diff', ascending=False).iloc[:20]
Out[13]:
In [14]:
# Largest drops in number of pageviews (top 20); ascending is the default order
usa.sort_values('page_diff').iloc[:20]
Out[14]:
In [15]:
# Highest percentage increases in pageviews (top 20)
usa.sort_values('pct_page_diff', ascending=False).iloc[:20]
Out[15]:
In [16]:
# Greatest percentage declines in pageviews (top 20); ascending is the default order
usa.sort_values('pct_page_diff').iloc[:20]
Out[16]:
In [17]:
# Largest increases in total visits (top 20)
usa.sort_values('visit_diff', ascending=False).iloc[:20]
Out[17]:
In [18]:
# Largest decreases in total visits (top 20); ascending is the default order
usa.sort_values('visit_diff').iloc[:20]
Out[18]:
In [19]:
# Highest percentage increases in visits (top 20)
usa.sort_values('pct_visit_diff', ascending=False).iloc[:20]
Out[19]:
In [20]:
# Greatest percentage decreases in visits (top 20); ascending is the default order
usa.sort_values('pct_visit_diff').iloc[:20]
Out[20]:
In [21]:
# Pull every column for one domain of interest, looked up by index label.
search = 'search.stopbullying.gov'
usa.loc[search]
Out[21]:
In [22]:
# scatter plot of each domain's January rank against its April rank
In [29]:
# taken from http://stackoverflow.com/questions/7404116/defining-the-midpoint-of-a-colormap-in-matplotlib
from mpl_toolkits.axes_grid1 import AxesGrid
def shiftedColorMap(cmap, start=0, midpoint=0.5, stop=1.0, name='shiftedcmap'):
    '''
    Function to offset the "center" of a colormap. Useful for
    data with a negative min and positive max and you want the
    middle of the colormap's dynamic range to be at zero.

    The new colormap is also registered with matplotlib under `name`,
    replacing any previously registered user colormap of that name, so
    re-running the calling cell does not fail.

    Input
    -----
      cmap : The matplotlib colormap to be altered. It is called with a
          single float and must return an (r, g, b, a) tuple.
      start : Offset from lowest point in the colormap's range.
          Defaults to 0.0 (no lower offset). Should be between
          0.0 and `midpoint`.
      midpoint : The new center of the colormap. Defaults to
          0.5 (no shift). Should be between 0.0 and 1.0. In
          general, this should be  1 - vmax/(vmax + abs(vmin))
          For example if your data range from -15.0 to +5.0 and
          you want the center of the colormap at 0.0, `midpoint`
          should be set to  1 - 5/(5 + 15)) or 0.75
      stop : Offset from highest point in the colormap's range.
          Defaults to 1.0 (no upper offset). Should be between
          `midpoint` and 1.0.
      name : Name given to the new colormap and used to register it.

    Returns
    -------
      The new matplotlib.colors.LinearSegmentedColormap.
    '''
    # Imported inside the function so the helper works no matter which
    # notebook cells have already been executed.
    import numpy as np
    import matplotlib
    from matplotlib.colors import LinearSegmentedColormap

    cdict = {
        'red': [],
        'green': [],
        'blue': [],
        'alpha': []
    }

    # regular index to compute the colors: 257 evenly spaced samples of the
    # source colormap between `start` and `stop`
    reg_index = np.linspace(start, stop, 257)

    # shifted index to match the data: the first 128 samples are squeezed into
    # [0, midpoint) and the remaining 129 are stretched over [midpoint, 1]
    shift_index = np.hstack([
        np.linspace(0.0, midpoint, 128, endpoint=False),
        np.linspace(midpoint, 1.0, 129, endpoint=True)
    ])

    for ri, si in zip(reg_index, shift_index):
        r, g, b, a = cmap(ri)

        cdict['red'].append((si, r, r))
        cdict['green'].append((si, g, g))
        cdict['blue'].append((si, b, b))
        cdict['alpha'].append((si, a, a))

    newcmap = LinearSegmentedColormap(name, cdict)

    # Prefer the colormap registry; pyplot.register_cmap is deprecated and
    # has been removed from recent matplotlib releases.
    registry = getattr(matplotlib, 'colormaps', None)
    if registry is not None:
        registry.register(newcmap, force=True)
    else:
        # Older matplotlib without the registry object.
        import matplotlib.pyplot as plt
        plt.register_cmap(cmap=newcmap)

    return newcmap
In [55]:
# Import everything the colormap step may need up front, so this cell does
# not depend on imports made in later cells when run on a fresh kernel.
import numpy as np
import matplotlib
import matplotlib.cm
import matplotlib.pyplot as plt

orig_cmap = matplotlib.cm.PuOr
# NOTE(review): midpoint=0 squeezes the lower half of PuOr onto a single
# point, so only the upper half of the palette is actually used. Confirm this
# is intended rather than 1 - vmax/(vmax + abs(vmin)) computed from rank_diff.
shifted_cmap = shiftedColorMap(orig_cmap, midpoint=0, name='shifted')
In [60]:
import numpy as np
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 10))

# One point per domain: January rank on x, April rank on y, coloured by the
# change in rank.
x = usa['jan_rank']
y = usa['apr_rank']
ax.scatter(x, y, s=25, zorder=0, alpha=.3, c=usa['rank_diff'], cmap=shifted_cmap)

# Restrict the view to ranks 0-2000 on both axes.
ax.set_xlim(0, 2000)
ax.set_ylim(0, 2000)

# Shared lower/upper bound across both axes, used for the diagonal and to
# keep the plot square.
axis_bounds = np.array([ax.get_xlim(), ax.get_ylim()])
lims = [axis_bounds.min(), axis_bounds.max()]

# Diagonal reference line: points on it have the same rank in both months.
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)

ax.set_xlabel('Rank of domain in January', fontsize=14)
ax.set_ylabel('Rank of domain in April', fontsize=14)
ax.set_title('How pages changed in popularity over the first 100 days', fontsize=18)

# Flip the y axis so that small April rank numbers sit at the top.
ax.invert_yaxis()

# Optional export, left disabled:
# plt.savefig('../austinbrian.github.io/assets/pagerank.png')

# Text labels for the two halves of the plot (no arrow is drawn).
ax.annotate('Became More Popular', xy=(1450, 250), xytext=(1250, 200),
            fontsize=16, style='italic')
ax.annotate('Became Less Popular', xy=(1450, 250), xytext=(100, 1800),
            fontsize=16, style='italic')
Out[60]: