In [26]:
%%html
<script type="text/javascript">
// Visibility flag for the notebook's code-input areas; they start out visible.
show = true;

// Hide the inputs when they are shown, show them when they are hidden,
// then flip the flag. Assumes the page provides jQuery's `$` -- TODO confirm.
function toggle() {
    var inputs = $('div.input');
    if (show) {
        inputs.hide();
    } else {
        inputs.show();
    }
    show = !show;
}
</script>
<h2><a href="javascript:toggle()" target="_self">Click to toggle code input</a></h2>
In [1]:
%pylab inline
import numpy as np
from numpy.random import randn
import pandas as pd
from scipy import stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import Series, DataFrame
Beautiful Soup, I simply copied the data to the clipboard. But for reproducibility purposes, I immediately saved the data to a .csv file with the code below.
# One-off capture step: parse the table currently on the clipboard (fields
# separated by runs of two or more whitespace characters) and save it to CSV.
# The separator is a raw string so that "\s" reaches the regex engine verbatim
# instead of being treated as an (invalid) Python string escape.
institutes = pd.read_clipboard(
    header=None,
    sep=r'\s{2,}',
    index_col='Institute',
    names=['Institute', 'Users', 'Votes', 'Up', 'Down',
           'Comments', 'Posts', 'DateStarted'],
)
institutes.to_csv('data/institutes.csv')
In [3]:
# Reload the saved table: '-' entries are read as missing values and the
# DateStarted column is parsed into datetimes.
institutes = pd.read_csv(
    'data/institutes.csv',
    parse_dates=['DateStarted'],
    na_values='-',
)
# Last rows of the table are the cell's displayed output.
institutes.tail()
Out[3]:
In [4]:
# Rank institutes by vote count, largest first; method='first' breaks ties
# by order of appearance so every row gets a distinct rank.
institutes['Rank'] = institutes['Votes'].rank(ascending=False, method='first')
In [5]:
# Running share of all votes, accumulated in the table's row order
# (the cumulative sum is computed once and reused for the normalisation).
votes_cumsum = institutes.Votes.cumsum()
cumsum_normalized = votes_cumsum.div(votes_cumsum.max())
cumsum_normalized.plot(title='Cumulative distribution of VoxCharta votes')

# Mark UT Austin on the curve.
UT_id = institutes.Institute == 'UT Austin'
UT_rank = institutes['Rank'][UT_id]
UT_share = cumsum_normalized[UT_id]
plt.scatter(UT_rank, UT_share, s=50, c='r', marker='o')

# annotate() needs scalar coordinates, so take the single matching row
# (raises IndexError if 'UT Austin' is absent from the table).
# With textcoords='offset points' the label position must be given explicitly
# via xytext; otherwise the data coordinates are reused as the offset and the
# arrow collapses onto the marker.
plt.annotate('UTexas',
             xy=(UT_rank.iloc[0], UT_share.iloc[0]),
             xytext=(40, -40),
             textcoords='offset points',
             fontsize=16.0,
             arrowprops=dict(arrowstyle="fancy",
                             color="0.5",
                             shrinkB=9,
                             connectionstyle="arc3,rad=0.3",
                             ),
             )
Out[5]:
In [6]:
# Rows whose running vote share is still below one half.
ids = (cumsum_normalized < 0.5)
count = len(cumsum_normalized)
statement = "{0} institutions are responsible for half of all votes out of {1} institutions"
# print(...) with a single argument behaves identically on Python 2 and
# Python 3, whereas the bare print statement is a syntax error on Python 3.
print(statement.format(np.sum(ids), count))
print("They are: ")
# Names of the selected institutes are the cell's displayed output.
institutes.Institute[ids]
Out[6]:
In [7]:
# Load the per-user table, indexed by user name, with DateUser parsed
# into datetimes.
users = pd.read_csv(
    'data/users.csv',
    index_col='User',
    parse_dates=['DateUser'],
)
# Last rows of the table are the cell's displayed output.
users.tail()
Out[7]:
In [28]:
# Non-null entries per column -- 5169, that's a lot of users!
users.count()
Out[28]:
In [8]:
# Rank users by vote count, largest first; method='first' breaks ties by
# order of appearance so every user gets a distinct rank.
users['Rank'] = users['Votes'].rank(ascending=False, method='first')
In [9]:
# Running share of all votes, accumulated in the table's row order
# (the cumulative sum is computed once and reused for the normalisation).
user_votes_cumsum = users.Votes.cumsum()
cumsum_norm_users = user_votes_cumsum.div(user_votes_cumsum.max())
cumsum_norm_users.plot(title='Cumulative distribution of VoxCharta votes by user')

# Mark the user 'gully' on the curve.
gully_id = (users.index == 'gully')
gully_rank = users['Rank'][gully_id]
gully_share = cumsum_norm_users[gully_id]
plt.scatter(gully_rank, gully_share, s=50, c='r', marker='o')

# annotate() needs scalar coordinates, so take the single matching row
# (raises IndexError if 'gully' is absent from the table).
# With textcoords='offset points' the label position must be given explicitly
# via xytext; otherwise the data coordinates are reused as the offset and the
# arrow collapses onto the marker.
plt.annotate('gully',
             xy=(gully_rank.iloc[0], gully_share.iloc[0]),
             xytext=(40, -40),
             textcoords='offset points',
             fontsize=16.0,
             arrowprops=dict(arrowstyle="fancy",
                             color="0.5",
                             shrinkB=9,
                             connectionstyle="arc3,rad=0.3",
                             ))
Out[9]:
I want to emphasize that it is not necessarily virtuous to have voted for a paper. This analysis and the assignment of 'rank' have no implied significance for the user's scientific merits; the rank is simply a reflection of how many times he or she has voted for something on VoxCharta, that's it. Indeed, it might be annoying for someone to vote all the time for no reason whatsoever, having never read the paper.
In [10]:
# Boolean mask selecting the users whose affiliation is UT Austin.
UTexas = (users.Affiliation == 'UT Austin')
# print(...) with a single argument behaves identically on Python 2 and
# Python 3, whereas the bare print statement is a syntax error on Python 3.
print("There are {} users at UT Austin".format(np.sum(UTexas)))
In [11]:
# Take an explicit copy of the UT Austin rows: a column is assigned to this
# frame later in the notebook, and assigning to a bare boolean-mask slice of
# `users` triggers pandas' SettingWithCopyWarning.
UTexas_users = users[UTexas].copy()
# First rows of the subset are the cell's displayed output.
UTexas_users.head()
Out[11]:
In [12]:
# Running share of the UT Austin votes, accumulated in row order.
UT_votes_cumsum = UTexas_users.Votes.cumsum()
UT_votes_total = UTexas_users.Votes.cumsum().max()
cumsum_norm_UT = UT_votes_cumsum.div(UT_votes_total)
cumsum_norm_UT.plot(title='Cumulative distribution of VoxCharta votes at UTexas')
# Keep the running share alongside the per-user rows.
UTexas_users['Cumulative'] = cumsum_norm_UT
In [13]:
# Boolean mask of UT Austin users whose vote count is zero.
never_voted = (UTexas_users.Votes == 0)
# print(...) with a single argument behaves identically on Python 2 and
# Python 3, whereas the bare print statement is a syntax error on Python 3.
print("{} users out of {} at UT Austin have never voted".format(np.sum(never_voted), np.sum(UTexas)))
I'm curious to see how the UTexas CDF compares to other institutions. This is a bit hard to compare since there are different total numbers of users, so we might have to consider normalizing the number of voters (i.e. the $x$-axis) too. What we might do instead is use a coarse estimator, like the number of votes of the highest-voting user of each institution. Let's come back to that.
The usual clipboard gimmick.
# One-off capture step: parse the two-column (Date, Title) table currently on
# the clipboard and save it to CSV. The separator is a raw string so that "\s"
# reaches the regex engine verbatim instead of being treated as an (invalid)
# Python string escape.
mgs = pd.read_clipboard(
    header=None,
    sep=r'\s{2,}',
    index_col='Date',
    names=['Date', 'Title'],
)
mgs.to_csv('data/mgs.csv')
In [20]:
# Reload the saved voting record with a parsed datetime index.
mgs = pd.read_csv(
    'data/mgs.csv',
    index_col='Date',
    parse_dates=['Date'],
)
# Per-row tally of non-null cells, stored as a new 'count' column.
entries_per_row = mgs.count(axis=1)
mgs['count'] = entries_per_row
# First rows of the table are the cell's displayed output.
mgs.head()
Out[20]:
In [22]:
# Group rows that share the same value of the first index level and
# sum each group's columns.
grouped = mgs.groupby(level=0)
gsum = grouped.sum()
In [23]:
# Re-bin the per-date sums onto regular daily ('D') and monthly ('M') grids.
# The `how=` keyword of resample() has been removed from pandas; calling
# .sum() on the resampler is the supported spelling of the same aggregation.
# Bins with no data are filled with 0; fillna is chained instead of using
# inplace=True so the cell simply rebinds both names on every run.
mgs_byday = gsum.resample('D').sum().fillna(0)
mgs_bymonth = gsum.resample('M').sum().fillna(0)
In [24]:
# Plot the monthly totals.
monthly_title = 'MGS voting record by month'
mgs_bymonth.plot(title=monthly_title)
Out[24]:
In [25]:
# Running total of the daily values, plotted against time.
mgs_cumulative = mgs_byday.cumsum()
mgs_cumulative.plot(title='MGS cumulative votes as a function of time')
Out[25]:
Insights: My voting frequency is irregular. Notably there were big patches of time when I did not vote. Lately I have been voting a lot.
In [ ]: