In [1]:
import pandas as pd

In [2]:
questions = pd.DataFrame.from_csv('question_simple.csv', index_col=None)
questions.head()


Out[2]:
QuestionId QuestionCreation QuestionLastActivity AcceptedAnsId AcceptedDate QScore QVotes
0 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 25 50
1 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 25 50
2 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 25 50
3 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 25 50
4 2 2011-10-25T19:44:21.647 2014-12-16T20:44:35.923 11 2011-10-26T00:00:00.000 18 40

In [3]:
a_questions = pd.DataFrame.from_csv('question_votes.csv', index_col=None)
a_questions[a_questions.AcceptedAnsId.notnull()].head()


Out[3]:
QuestionId QuestionCreation QuestionLastActivity AcceptedAnsId AcceptedDate VoteType QVoteCreation
0 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 2 2011-10-25T19:44:21.590
1 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 2 2011-10-25T19:44:21.590
2 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 2 2011-10-25T19:44:21.590
3 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 2 2011-10-25T19:44:21.590
4 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 2 2011-10-25T19:44:21.590

In [4]:
get_votes_qv = lambda df: pd.Series((df.VoteType==2).cumsum() + (df.VoteType==3).cumsum(),name='QVotes')
get_score_qv = lambda df: pd.Series((df.VoteType==2).cumsum() - (df.VoteType==3).cumsum(),name='QScore')

predictors_qvotes = ['QuestionId','QuestionCreation','QuestionLastActivity','AcceptedAnsId','AcceptedDate','QVoteCreation']
f_q = lambda df: pd.concat([df[cname] for cname in df.columns.values.tolist() if cname in predictors_qvotes]+[get_score_qv(df),get_votes_qv(df)],axis=1)
a_questions = a_questions.sort_values(by='QVoteCreation').groupby(['QuestionId']).apply(f_q)
a_questions.head(10)


Out[4]:
QuestionId QuestionCreation QuestionLastActivity AcceptedAnsId AcceptedDate QVoteCreation QScore QVotes
0 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 2011-10-25T19:44:21.590 1 1
60045 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 2011-10-25T19:44:21.590 2 2
60044 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 2011-10-25T19:44:21.590 3 3
60043 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 2011-10-25T19:44:21.590 4 4
60042 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 2011-10-25T19:44:21.590 5 5
60041 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 2011-10-25T19:44:21.590 6 6
60040 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 2011-10-25T19:44:21.590 7 7
60039 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 2011-10-25T19:44:21.590 8 8
60038 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 2011-10-25T19:44:21.590 9 9
60037 1 2011-10-25T19:44:21.590 2015-07-13T23:45:46.583 8 2011-10-28T00:00:00.000 2011-10-25T19:44:21.590 10 10

In [5]:
a_votes = pd.DataFrame.from_csv('votes-answers.csv', index_col=None)
a_votes = pd.merge(a_votes, a_questions, how='inner', on=['QuestionId'],suffixes=['_v', '_q'])
a_votes.head()


Out[5]:
VoteId VoteCreation AnsCreation VoteType AnsId QuestionId AnsWordCount QuestionCreation QuestionLastActivity AcceptedAnsId AcceptedDate QVoteCreation QScore QVotes
0 3 2011-10-25T00:00:00.000 2011-10-25T19:47:31.947 2 5 3 74 2011-10-25T19:44:55.973 2015-12-30T05:21:40.827 5 2011-10-25T00:00:00.000 2011-10-25T19:44:55.973 1 1
1 3 2011-10-25T00:00:00.000 2011-10-25T19:47:31.947 2 5 3 74 2011-10-25T19:44:55.973 2015-12-30T05:21:40.827 5 2011-10-25T00:00:00.000 2011-10-25T19:44:55.973 2 2
2 3 2011-10-25T00:00:00.000 2011-10-25T19:47:31.947 2 5 3 74 2011-10-25T19:44:55.973 2015-12-30T05:21:40.827 5 2011-10-25T00:00:00.000 2011-10-25T19:44:55.973 3 3
3 3 2011-10-25T00:00:00.000 2011-10-25T19:47:31.947 2 5 3 74 2011-10-25T19:44:55.973 2015-12-30T05:21:40.827 5 2011-10-25T00:00:00.000 2011-10-25T19:44:55.973 4 4
4 3 2011-10-25T00:00:00.000 2011-10-25T19:47:31.947 2 5 3 74 2011-10-25T19:44:55.973 2015-12-30T05:21:40.827 5 2011-10-25T00:00:00.000 2011-10-25T19:44:55.973 5 5

In [6]:
predictors_raw_votans =['VoteId','VoteCreation','AnsCreation','VoteType','AnsId','QuestionId','AnsWordCount','QuestionCreation','AcceptedAnsId','AcceptedDate']

valid_qavotes = lambda df: df[df.VoteCreation>=df.QVoteCreation]
#Use twice valid_qavotes, could use once to improve efficiency, but check correctness of index selection
get_max_qv = lambda df: valid_qavotes(df).loc[valid_qavotes(df).QVotes.idxmax(),['QScore','QVotes']].squeeze()
get_latest_qv = lambda df : pd.Series([0,0],index=['QScore','QVotes']) if not (df.VoteCreation>=df.QVoteCreation).any() else get_max_qv(df)
get_head = lambda df: [df[cname].iloc[0] for cname in df.columns.values.tolist() if cname in predictors_raw_votans]
get_qv = lambda df : pd.Series(get_head(df),index=predictors_raw_votans).append(get_latest_qv(df)).to_frame()

a_votes = a_votes.sort_values(by='VoteCreation').groupby(['VoteId']).apply(get_qv).unstack(level=-1).reset_index(level=[0],drop=True)
a_votes.drop(a_votes.columns[[0]], axis=1, inplace=True)
a_votes.columns = a_votes.columns.droplevel()
a_votes.head()


Out[6]:
VoteCreation AnsCreation VoteType AnsId QuestionId AnsWordCount QuestionCreation AcceptedAnsId AcceptedDate QScore QVotes
0 2011-10-25T00:00:00.000 2011-10-25T19:47:31.947 2 5 3 74 2011-10-25T19:44:55.973 5 2011-10-25T00:00:00.000 0 0
1 2011-10-25T00:00:00.000 2011-10-25T19:47:31.947 2 5 3 74 2011-10-25T19:44:55.973 5 2011-10-25T00:00:00.000 0 0
2 2011-10-25T00:00:00.000 2011-10-25T19:47:31.947 2 5 3 74 2011-10-25T19:44:55.973 5 2011-10-25T00:00:00.000 0 0
3 2011-10-25T00:00:00.000 2011-10-25T19:47:31.947 2 5 3 74 2011-10-25T19:44:55.973 5 2011-10-25T00:00:00.000 0 0
4 2011-10-25T00:00:00.000 2011-10-25T19:47:31.947 2 5 3 74 2011-10-25T19:44:55.973 5 2011-10-25T00:00:00.000 0 0

In [7]:
date_placeholder = '2016-07-20T00:00:00.000' #Date After Data Set Collection
#a_votes.loc[a_votes.AcceptedDate == 'None','AcceptedDate'] = pd.to_datetime(date_placeholder)
a_votes['AcceptedDate'].fillna(pd.to_datetime(date_placeholder),inplace=True) 
a_votes['AcceptedAge'] = (pd.to_datetime(a_votes.AcceptedDate,format='%Y-%m-%d %H:%M:%S.%f')
                  -pd.to_datetime(a_votes.QuestionCreation,format='%Y-%m-%d %H:%M:%S.%f')).apply(lambda x: x.astype('timedelta64[D]').item().days)
a_votes['AcceptedAge'] = a_votes['AcceptedAge'] + 1

a_votes.loc[a_votes.AcceptedDate == pd.to_datetime(date_placeholder), 'AcceptedAge'] = -1
a_votes['Age'] = (pd.to_datetime(a_votes.VoteCreation,format='%Y-%m-%d %H:%M:%S.%f')
                  -pd.to_datetime(a_votes.QuestionCreation,format='%Y-%m-%d %H:%M:%S.%f')).apply(lambda x: x.astype('timedelta64[D]').item().days)
a_votes['Age'] = a_votes['Age'] + 1
a_votes.head()


Out[7]:
VoteCreation AnsCreation VoteType AnsId QuestionId AnsWordCount QuestionCreation AcceptedAnsId AcceptedDate QScore QVotes AcceptedAge Age
0 2011-10-25T00:00:00.000 2011-10-25T19:47:31.947 2 5 3 74 2011-10-25T19:44:55.973 5 2011-10-25T00:00:00.000 0 0 0 0
1 2011-10-25T00:00:00.000 2011-10-25T19:47:31.947 2 5 3 74 2011-10-25T19:44:55.973 5 2011-10-25T00:00:00.000 0 0 0 0
2 2011-10-25T00:00:00.000 2011-10-25T19:47:31.947 2 5 3 74 2011-10-25T19:44:55.973 5 2011-10-25T00:00:00.000 0 0 0 0
3 2011-10-25T00:00:00.000 2011-10-25T19:47:31.947 2 5 3 74 2011-10-25T19:44:55.973 5 2011-10-25T00:00:00.000 0 0 0 0
4 2011-10-25T00:00:00.000 2011-10-25T19:47:31.947 2 5 3 74 2011-10-25T19:44:55.973 5 2011-10-25T00:00:00.000 0 0 0 0

In [8]:
a_votes.drop(a_votes.columns[[0, 1, 6, 8]], axis=1, inplace=True)

In [9]:
get_score = lambda df: sum(df.VoteType==2) - sum(df.VoteType==3)
get_votes = lambda df: sum(df.VoteType==2) + sum(df.VoteType==3)

predictors = ['QuestionId','AnsWordCount','AcceptedAnsId','AcceptedAge','QScore',
              'QVotes','Score','Votes','Upvotes','Downvotes']
f = lambda df: pd.Series([df.QuestionId.iloc[0],df.AnsWordCount.iloc[0],df.AcceptedAnsId.iloc[0],df.AcceptedAge.iloc[0],
                          df.QScore.iloc[0],df.QVotes.iloc[0],get_score(df),get_votes(df),sum(df.VoteType==2),sum(df.VoteType==3)],index = predictors)
a_groups = a_votes.sort_values(by='Age').groupby(['AnsId','Age']).apply(f)
a_groups = a_groups.reset_index(level=[0,1],drop=False)
a_groups.head()


Out[9]:
AnsId Age QuestionId AnsWordCount AcceptedAnsId AcceptedAge QScore QVotes Score Votes Upvotes Downvotes
0 5 0 3 74 5 0 0 0 10 10 10 0
1 5 1 3 74 5 0 480 480 1 1 1 0
2 5 2 3 74 5 0 480 480 1 1 1 0
3 5 9 3 74 5 0 480 480 1 1 1 0
4 5 16 3 74 5 0 480 480 1 1 1 0

In [10]:
cum_votes = lambda df: pd.Series(df['Votes'].cumsum(),name='CumVotes')
cum_score = lambda df: pd.Series(df['Score'].cumsum(),name='CumScore')

get_cumulative =lambda df: pd.concat([df[cname] for cname in df.columns.values.tolist()] + [cum_votes(df),cum_score(df)],axis=1)
ff = lambda df: get_cumulative(df.sort_values(by='Age'))
a_groups_c = a_groups.groupby(['AnsId']).apply(ff).reset_index(level=[0],drop=True)
a_groups_c.head()


Out[10]:
AnsId Age QuestionId AnsWordCount AcceptedAnsId AcceptedAge QScore QVotes Score Votes Upvotes Downvotes CumVotes CumScore
0 5 0 3 74 5 0 0 0 10 10 10 0 10 10
1 5 1 3 74 5 0 480 480 1 1 1 0 11 11
2 5 2 3 74 5 0 480 480 1 1 1 0 12 12
3 5 9 3 74 5 0 480 480 1 1 1 0 13 13
4 5 16 3 74 5 0 480 480 1 1 1 0 14 14

In [11]:
prior_quality = float(a_groups_c['Upvotes'].sum())/(a_groups_c['Upvotes'].sum() + a_groups_c['Downvotes'].sum())
a_groups_c['ReScore'] = (a_groups_c['CumScore']+prior_quality)/(a_groups_c['CumVotes']+1.0)
a_groups_c['QReScore'] = a_groups_c['QScore']/(a_groups_c['QVotes']+1.0)
a_groups_c.head()


Out[11]:
AnsId Age QuestionId AnsWordCount AcceptedAnsId AcceptedAge QScore QVotes Score Votes Upvotes Downvotes CumVotes CumScore ReScore QReScore
0 5 0 3 74 5 0 0 0 10 10 10 0 10 10 0.998846 0.000000
1 5 1 3 74 5 0 480 480 1 1 1 0 11 11 0.998942 0.997921
2 5 2 3 74 5 0 480 480 1 1 1 0 12 12 0.999024 0.997921
3 5 9 3 74 5 0 480 480 1 1 1 0 13 13 0.999093 0.997921
4 5 16 3 74 5 0 480 480 1 1 1 0 14 14 0.999154 0.997921

In [12]:
votes_com_f = a_groups_c

In [13]:
from itertools import izip
def rank_ans(df,score_only,re_score):
    rk_name = "ReScore_rank" if re_score else "AnsRank"
    def rank_iter():
        cache = {}
        accepted = 0
        for row in df.itertuples():
            if re_score:
                cache[row.AnsId] = row.ReScore
            else :
                cache[row.AnsId] = row.Score
            # rank, nb_ans
            if (not score_only) and row.AcceptedAge>-1 and (row.AnsId == row.AcceptedAnsId) and row.Age >=row.AcceptedAge:
                accepted = 1
                if row.AnsId in cache:
                    del cache[row.AnsId]
                yield (1,len(cache)+accepted,row.Index)
            else :
                rank = sorted(cache, key= lambda k:cache[k],reverse=True).index(row.AnsId) + 1 + accepted
                yield (rank,len(cache)+accepted,row.Index)
            
    ranks, ans_counts, indices = izip(*list(rank_iter())) #TODO: optimize for the future
    return [pd.Series(ranks,name=rk_name, index=indices), pd.Series(ans_counts,name="Ans_count", index=indices)]

predictors = ['QuestionId','AnsId','AnsWordCount','AcceptedAnsId','Age',
              'Score','Votes','Upvotes','Downvotes','CumScore','CumVotes','QScore'
              ,'QVotes','ReScore','QReScore','AnsRank','ReScore_rank']
get_ranks = lambda df,score_only=False,re_score=False: pd.concat(
    [df[cname] for cname in df.columns.values.tolist() if cname in predictors] + rank_ans(df,score_only,re_score),axis=1)
sort_age_score = lambda df: df.sort_values(by=['Age','Score'],ascending=[True,False])

In [14]:
votes_com_f = votes_com_f.groupby(['QuestionId']).apply(
    lambda df: get_ranks(sort_age_score(df))).reset_index(drop=True)
votes_com_f = votes_com_f.groupby(['QuestionId']).apply(
    lambda df: get_ranks(sort_age_score(df),score_only=True,re_score=True)).reset_index(drop=True)

votes_com_f['Pbias'] = 1.0/votes_com_f['AnsRank']
votes_com_f['DRank'] = votes_com_f['AnsRank'] - votes_com_f['ReScore_rank']
votes_com_f.head()


Out[14]:
AnsId Age QuestionId AnsWordCount AcceptedAnsId QScore QVotes Score Votes Upvotes Downvotes CumVotes CumScore ReScore QReScore AnsRank ReScore_rank Ans_count Pbias DRank
0 22 0 1 76 8 0 0 4 4 4 0 4 4 0.997462 0 1 1 1 1.000000 0
1 8 0 1 108 8 0 0 2 2 2 0 2 2 0.995769 0 2 2 2 0.500000 0
2 29 0 1 106 8 0 0 2 2 2 0 2 2 0.995769 0 3 3 3 0.333333 0
3 31 0 1 130 8 0 0 1 1 1 0 1 1 0.993654 0 4 4 4 0.250000 0
4 37 0 1 82 8 0 0 1 1 1 0 1 1 0.993654 0 4 4 5 0.250000 0

In [15]:
#AnsRank and Ans_count define unique EPbias
sum_by_rank = lambda df: df.groupby('AnsRank').apply(
    lambda df: pd.Series([df.Votes.sum()],name='EPbias').to_frame()).unstack(level=-1).reset_index(level=0,drop=False)
get_ratio = lambda df: sum_by_rank(df).EPbias/(sum_by_rank(df).EPbias.sum())
ratio_per_rank = lambda df: pd.concat([sum_by_rank(df).AnsRank, get_ratio(df)],axis=1)
get_position_bias = lambda df: pd.merge(df,ratio_per_rank(df),how='inner',on=['AnsRank'])

votes = votes_com_f.groupby(['Ans_count']).apply(get_position_bias).reset_index(level=[0,1],drop=True)
votes.columns.values[-1] = "EPbias"
votes.head()


Out[15]:
AnsId Age QuestionId AnsWordCount AcceptedAnsId QScore QVotes Score Votes Upvotes ... CumVotes CumScore ReScore QReScore AnsRank ReScore_rank Ans_count Pbias DRank EPbias
0 22 0 1 76 8 0 0 4 4 4 ... 4 4 0.997462 0.000000 1 1 1 1 0 1
1 10 0 2 24 11 0 0 11 11 11 ... 11 11 0.998942 0.000000 1 1 1 1 0 1
2 5 0 3 74 5 0 0 10 10 10 ... 10 10 0.998846 0.000000 1 1 1 1 0 1
3 5 1 3 74 5 480 480 1 1 1 ... 11 11 0.998942 0.997921 1 1 1 1 0 1
4 5 2 3 74 5 480 480 1 1 1 ... 12 12 0.999024 0.997921 1 1 1 1 0 1

5 rows × 21 columns


In [16]:
test_epbias = votes.groupby(['Ans_count','AnsRank']).first().reset_index(
    level=[0,1],drop=False)[['Ans_count','AnsRank','EPbias']]
test_epbias.to_csv('EPbiasbyAnsCountRank.csv')

In [17]:
#GET NULL say yay!!!
tt = votes.groupby(['AnsId','Age']).Votes.count().reset_index(level=[0,1],drop=False)
tt[tt.Votes>2].head()


Out[17]:
AnsId Age Votes

In [18]:
votes.to_csv(path_or_buf='AnsVotes_TSeries.csv')

In [ ]:


In [ ]:


In [ ]:


In [ ]: