In [13]:
import pandas as pd
import numpy as np

In [2]:
epbias_idx = pd.read_csv('EPbiasbyAnsCountRank.csv', index_col=0)

In [3]:
mx_count = int(epbias_idx.Ans_count.max())
# pad the bias index with zero-bias placeholder rows (AnsRank 0) for each answer count
epbias_idx = epbias_idx.append(pd.DataFrame(
    {'Ans_count': range(mx_count), 'AnsRank': [0]*mx_count, 'EPbias': [0]*mx_count}))
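
These (AnsRank, Ans_count) rows are consumed later as a plain key join against each question's current ranking. A minimal sketch of that lookup, on hypothetical bias values:

toy_idx = pd.DataFrame({'AnsRank': [1, 2], 'Ans_count': [2, 2], 'EPbias': [0.6, 0.4]})  # hypothetical biases
ranks = pd.DataFrame({'AnsRank': [1, 2], 'Ans_count': [2, 2]})
print(pd.merge(ranks, toy_idx, how='left', on=['AnsRank','Ans_count'])['EPbias'].tolist())  # [0.6, 0.4]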

In [4]:
# There are deleted users among these votes. To take only active users into account, refer to ExploratoryPlotsByGroups.ipynb
votes = pd.read_csv('AnsVotes_TSeries.csv')
votes.head()


Out[4]:
Unnamed: 0 AnsId Age QuestionId AnsWordCount AcceptedAnsId QScore QVotes Score Votes ... CumVotes CumScore ReScore QReScore AnsRank ReScore_rank Ans_count Pbias DRank EPbias
0 0 22 0 1 76 8 0 0 4 4 ... 4 4 0.997462 0.000000 1 1 1 1 0 1
1 1 10 0 2 24 11 0 0 11 11 ... 11 11 0.998942 0.000000 1 1 1 1 0 1
2 2 5 0 3 74 5 0 0 10 10 ... 10 10 0.998846 0.000000 1 1 1 1 0 1
3 3 5 1 3 74 5 480 480 1 1 ... 11 11 0.998942 0.997921 1 1 1 1 0 1
4 4 5 2 3 74 5 480 480 1 1 ... 12 12 0.999024 0.997921 1 1 1 1 0 1

5 rows × 22 columns


In [5]:
votes['Norm_DRank'] = votes['DRank']/votes['Ans_count']
votes['Norm_Pos'] = votes['AnsRank']/votes['Ans_count']

In [6]:
import itertools
import sys

def idx_gen(votes):
    new_idx_start = votes.shape[0]
    return itertools.count(new_idx_start)

def merge_two_dicts(x, y):
    z = x.copy()
    z.update(y)
    return z

def break_ties(df):
    # Break ties in AnsRank by vote count; every rank after a tie is pushed down.
    s_df = df.sort_values(by=['AnsRank','Votes'], ascending=[True, False])
    if (s_df.groupby(['AnsRank'])['Votes'].count() > 1).any():
        tot = df.shape[0]
        s_df.index = range(tot)
        # first member(s) of each tied run: rows whose AnsRank equals the next row's
        fst_mn = s_df[s_df['AnsRank'] == s_df['AnsRank'].shift(-1)].index
        # each tie contributes a +1 penalty to every row positioned after it
        penalties = pd.Series(np.array(
            [np.concatenate([np.zeros(int(e)+1), np.ones(tot-(int(e)+1))]) for e in fst_mn]).sum(axis=0))
        s_df.loc[:,'AnsRank'] = s_df.loc[:,'AnsRank'] + penalties
        s_df.index = df.sort_values(by=['AnsRank','Score'], ascending=[True, False]).index
    return s_df
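
# Toy check of break_ties (hypothetical rows): two answers tied at rank 1
# are separated by Votes, and the rank after the tie is pushed down.
demo = pd.DataFrame({'AnsRank': [1, 1, 2], 'Votes': [3, 5, 7], 'Score': [3, 5, 7]})
assert break_ties(demo).AnsRank.tolist() == [1.0, 2.0, 3.0]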
    
## Attributes that can be filled later

def pad_votes(df,a,q,aid_idx,index_gen):
    assert (df.AcceptedAnsId==int(df.AcceptedAnsId.iloc[0])).all(), 'There can be only one accepted answer per question-day'
    if aid_idx[q].size==0:
        n_ans = df.shape[0]       
        aid_idx[q] = aid_idx[q].append(df)
        aid_idx[q].loc[:,'Accepted'] = aid_idx[q].AnsId==aid_idx[q].AcceptedAnsId
        aid_idx[q] = aid_idx[q].sort_values(by=['Accepted','Score'],ascending=[False,False])
        aid_idx[q].loc[:,'AnsRank'] = range(1,n_ans+1)
        aid_idx[q].loc[:,'Ans_count'] = n_ans
        
        by_ReScore = aid_idx[q].sort_values(by='ReScore',ascending=False)
        by_ReScore.loc[:,'ReRank'] = range(1,int(by_ReScore.shape[0])+1)
        by_SE = aid_idx[q].sort_values(by=['Accepted','Score'],ascending=[False,False])
        by_SE.loc[:,'SeRank'] = range(1,int(by_SE.shape[0])+1)
        aid_idx[q].loc[:,'DRank'] = by_SE['SeRank'] - by_ReScore['ReRank']
        # look up EPbias by (AnsRank, Ans_count); assumes each pair occurs once in epbias_idx
        aid_idx[q].loc[:,'EPbias'] = pd.merge(
            aid_idx[q][['AnsRank','Ans_count']], epbias_idx, how='left',
            on=['AnsRank','Ans_count']).set_index(aid_idx[q].index)['EPbias']
                                              
    else:    
        padding_aids = set(aid_idx[q].AnsId).difference(set(df.AnsId))
        df_padding_aids = aid_idx[q][aid_idx[q].AnsId.isin(padding_aids)]
        tot_ans = len(padding_aids) + len(df.AnsId)
        #True ranking (best reconstruction)
        aid_idx[q] = df.append(df_padding_aids)
        aid_idx[q].loc[:,'AcceptedAnsId'] = int(df.AcceptedAnsId.iloc[0])
        aid_idx[q].loc[:,'Accepted'] = aid_idx[q].AnsId==aid_idx[q].AcceptedAnsId
        assert (aid_idx[q].AcceptedAnsId==int(aid_idx[q].AcceptedAnsId.iloc[0])).all(), 'There can be only one accepted answer per question-day'
        aid_idx[q] = aid_idx[q].sort_values(by=['Accepted','Score'],ascending=[False,False])
        #Calculating D-rank
        by_ReScore = aid_idx[q].sort_values(by='ReScore',ascending=False)
        by_ReScore.loc[:,'ReRank'] = range(1,int(by_ReScore.shape[0])+1)
        by_SE = aid_idx[q].sort_values(by=['Accepted','Score'],ascending=[False,False])
        by_SE.loc[:,'SeRank'] = range(1,int(by_SE.shape[0])+1)
        aid_idx[q].loc[:,'DRank'] = by_SE['SeRank'] - by_ReScore['ReRank']
        
        aid_idx[q].loc[:,'AnsRank'] = range(1,tot_ans+1)
        aid_idx[q].loc[:,'Ans_count'] = tot_ans
        pad_mask = aid_idx[q].AnsId.isin(padding_aids)
        aid_idx[q].loc[pad_mask,'Votes'] = 0
        aid_idx[q].loc[pad_mask,'Age'] = a
        # give the carried-forward (padded) rows fresh, unique index labels
        aid_idx[q].index = [next(index_gen) if padded else i
                            for i, padded in zip(aid_idx[q].index, pad_mask)]
        # same EPbias lookup as above; assumes unique (AnsRank, Ans_count) pairs in epbias_idx
        aid_idx[q].loc[:,'EPbias'] = pd.merge(
            aid_idx[q][['AnsRank','Ans_count']], epbias_idx, how='left',
            on=['AnsRank','Ans_count']).set_index(aid_idx[q].index)['EPbias']
    return aid_idx[q]
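
D-rank, computed twice inside pad_votes, is the displacement between the site's ordering (accepted answer first, then by Score) and the ordering implied by ReScore. A standalone sketch on hypothetical scores, using the same index-alignment trick:

toy = pd.DataFrame({'Accepted': [False, True, False],
                    'Score':   [10, 2, 5],
                    'ReScore': [0.9, 0.2, 0.5]})   # hypothetical values
by_se = toy.sort_values(by=['Accepted','Score'], ascending=[False, False])
by_se['SeRank'] = range(1, len(by_se)+1)   # site order: accepted first, then score
by_re = toy.sort_values(by='ReScore', ascending=False)
by_re['ReRank'] = range(1, len(by_re)+1)   # order by reconstructed score
print((by_se['SeRank'] - by_re['ReRank']).tolist())   # [1, -2, 1]: subtraction aligns on the original index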

In [7]:
get_idx = idx_gen(votes)
get_null_row = lambda df,ans_c,age,a_id,q_id: pd.DataFrame(
    {'QuestionId':[q_id],'AnsId':[a_id],'Age':[age-1],'Norm_Pos':[1],'Norm_DRank':[0.0],'Ans_count':[ans_c-1],
     'ReScore':[0.0],'AnsRank':[ans_c-1],'Votes':[0],'EPbias':[0],
     'Score':[0],'Upvotes':[0],'Downvotes':[0]}, index=[next(get_idx)])
# Votes: [0] etc. are placeholders; they get overwritten by the shift below anyway
append_null_day = lambda df: get_null_row(df, df.sort_values(by='Age').Ans_count.iloc[0],
                                          int(df['Age'].min()), df.AnsId.iloc[0], df.QuestionId.iloc[0]).append(df)
votes = votes.groupby(['AnsId']).apply(append_null_day).reset_index(level=[0],drop=True)
votes.loc[:,'Age'] = votes['Age'] + 1
votes.head()


Out[7]:
AcceptedAnsId Age AnsId AnsRank AnsWordCount Ans_count CumScore CumVotes DRank Downvotes ... QReScore QScore QVotes QuestionId ReScore ReScore_rank Score Unnamed: 0 Upvotes Votes
10564 NaN 0 5 0 NaN 0 NaN NaN NaN 0 ... NaN NaN NaN 3 0.000000 NaN 0 NaN 0 0
2 5 1 5 1 74 1 10 10 0 0 ... 0.000000 0 0 3 0.998846 1 10 2 10 10
3 5 2 5 1 74 1 11 11 0 0 ... 0.997921 480 480 3 0.998942 1 1 3 1 1
4 5 3 5 1 74 1 12 12 0 0 ... 0.997921 480 480 3 0.999024 1 1 4 1 1
5473 5 10 5 1 74 2 13 13 0 0 ... 0.997921 480 480 3 0.999093 1 1 5473 1 1

5 rows × 24 columns
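
Replaying the prepend-then-shift pattern on a toy two-day answer (hypothetical ages; assumes the cells above have run) shows where the synthetic day lands:

toy = pd.DataFrame({'AnsId': [5, 5], 'Age': [3, 4]})
prepend_day = lambda df: pd.DataFrame({'AnsId': df.AnsId.iloc[0], 'Age': [df.Age.min()-1]}).append(df)
toy = toy.groupby('AnsId').apply(prepend_day).reset_index(level=0, drop=True)
toy.loc[:,'Age'] = toy['Age'] + 1
print(toy.Age.tolist())  # [3, 4, 5]: the null day takes the old first age, real days shift up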


In [8]:
atts_sft = ['Score','EPbias','QuestionId','Age','Norm_Pos','Norm_DRank','Ans_count','ReScore','AnsRank','AcceptedAnsId']
select = lambda df,one_vote=True: df[atts_sft + ['Votes','Upvotes','Downvotes']] if one_vote else df[atts_sft]
# pair each day's features with the *next* day's votes by shifting the vote columns back one step
shift = lambda df,att: df[att].shift(-1)#.ffill()
shift_select = lambda df: select(df) if df.shape[0]==1 else pd.concat(
    [select(df,one_vote=False), shift(df,'Votes'), shift(df,'Upvotes'), shift(df,'Downvotes')], axis=1)
shift_votes = lambda df: shift_select(df.sort_values(by='Age'))

votes = votes.groupby(['AnsId']).apply(shift_votes).reset_index(level=[0],drop=False)
votes = votes[(votes.Upvotes.notnull())|(votes.Downvotes.notnull())|(votes.Votes.notnull())] #drop last registered day for each ans
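
To see what the shift produces, take a toy single-answer series (hypothetical counts): each day keeps its own features but is paired with the following day's votes, and the last day, whose target does not exist yet, comes out null and is removed by the filter above:

toy = pd.DataFrame({'Age': [0, 1, 2], 'Score': [1, 3, 6], 'Votes': [2, 3, 1]})
print(toy['Votes'].shift(-1).tolist())  # [3.0, 1.0, nan]: the Age-2 row has no next-day target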

In [9]:
votes.loc[:,'AcceptedAnsId']=votes['AcceptedAnsId'].fillna(-1)
def clean_acc(df):
    # Within a (QuestionId, Age) group, fill rows that lost their AcceptedAnsId
    # (set to -1 above) from any sibling row that still carries it.
    if df[df.AcceptedAnsId!=-1].size==0 or df[df.AcceptedAnsId==-1].size==0:
        return df
    df.loc[df.AcceptedAnsId==-1,'AcceptedAnsId'] = df[df.AcceptedAnsId!=-1].AcceptedAnsId.iloc[0]
    return df
votes = votes.groupby(['QuestionId','Age']).apply(clean_acc).reset_index(drop=True)
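
On a toy (QuestionId, Age) group (hypothetical ids; assumes the cell above has run), the row whose AcceptedAnsId was filled with -1 inherits the id from its sibling:

grp = pd.DataFrame({'QuestionId': [3, 3], 'Age': [1, 1], 'AcceptedAnsId': [-1, 8]})
print(clean_acc(grp).AcceptedAnsId.tolist())  # [8, 8]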

In [10]:
# Check that AcceptedAnsId is now homogeneous within each (QuestionId, Age) group
homogeneous = votes.groupby(['QuestionId',
                             'Age']).apply(
    lambda df: (int(df.AcceptedAnsId.iloc[0])==df.AcceptedAnsId).all()).reset_index(drop=True)

homogeneous.all()


Out[10]:
True

In [11]:
from collections import defaultdict
#VERY expensive transformation
padded_votes = pd.DataFrame()
genidx = idx_gen(votes)
ans_index = defaultdict(lambda: pd.DataFrame()) #Qid,AnsId,Position
for q,g_q in votes.groupby(['QuestionId']):
    for a,g_a in g_q.sort_values(by='Age').groupby(['Age']):
        padded_votes = padded_votes.append(pad_votes(g_a,a,q,ans_index,genidx))

padded_votes.head(10)


Out[11]:
Accepted AcceptedAnsId Age AnsId AnsRank Ans_count DRank Downvotes EPbias Norm_DRank Norm_Pos QuestionId ReScore Score Upvotes Votes
22 False -1 0 8 1 5 0 0 0.237991 0 1.0 1 0.000000 0 2 2
125 False -1 0 29 2 5 0 0 0.207424 0 1.0 1 0.000000 0 2 2
171 False -1 0 37 3 5 0 0 0.170306 0 1.0 1 0.000000 0 1 1
99 False -1 0 22 4 5 0 0 0.205240 0 1.0 1 0.000000 0 4 4
141 False -1 0 31 5 5 0 0 0.179039 0 1.0 1 0.000000 0 1 1
23 True 8 1 8 1 5 -1 0 0.237991 0 1.0 1 0.995769 2 3 3
100 False 8 1 22 2 5 1 0 0.207424 0 1.0 1 0.997462 4 2 2
126 False 8 1 29 3 5 0 0 0.170306 0 1.0 1 0.995769 2 1 1
142 False 8 1 31 4 5 0 0 0.205240 0 1.0 1 0.993654 1 1 1
172 False 8 1 37 5 5 0 0 0.179039 0 0.8 1 0.993654 1 3 3
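
Appending to padded_votes inside the double loop copies the accumulated frame on every iteration. A cheaper equivalent (a sketch, assuming pad_votes and the generators above are left unchanged) collects the pieces and concatenates once:

pieces = []
for q, g_q in votes.groupby(['QuestionId']):
    for a, g_a in g_q.sort_values(by='Age').groupby(['Age']):
        pieces.append(pad_votes(g_a, a, q, ans_index, genidx))
padded_votes = pd.concat(pieces)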

In [14]:
votes = padded_votes
votes.loc[:,"Norm_DRank"] = votes['DRank']/votes['Ans_count']
votes.loc[:,"Norm_Pos"] = votes['AnsRank']/votes['Ans_count']
votes.loc[:,'Norm_Pos_2'] = np.square(votes['Norm_Pos'])

In [15]:
## Sanity check: count groups with a duplicated (QuestionId, Age, AnsRank) cell; if it is zero you can call it a day
get_comp = lambda df: int(df.shape[0])>1 
tt = votes.groupby(['QuestionId','Age','AnsRank']).apply(get_comp).reset_index(drop=True)
sum(tt)


Out[15]:
0

In [16]:
votes.to_csv('VotesRaw.csv')

Prepare for nnet


In [16]:
qt_idx = votes.groupby(['QuestionId','Age']).count().reset_index(level=[0,1],drop=False)[['QuestionId','Age']]
qt_idx.to_csv('Qt_idx.csv')
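
qt_idx enumerates the (QuestionId, Age) cells; after the merge in the next cell, its row number (the 'index' column) serves as the choice-set identifier for the nnet. A toy illustration with hypothetical ids:

toy_votes = pd.DataFrame({'QuestionId': [1, 1, 2], 'Age': [0, 0, 5], 'AnsId': [8, 9, 4]})
toy_qt = toy_votes.groupby(['QuestionId','Age']).count().reset_index()[['QuestionId','Age']]
print(pd.merge(toy_votes, toy_qt.reset_index(), how='left', on=['QuestionId','Age']))
#    QuestionId  Age  AnsId  index
# 0           1    0      8      0
# 1           1    0      9      0
# 2           2    5      4      1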

In [17]:
# nnet input: one file per fixed number of answer choices (2 to 6)
for nb_choices_nnet in range(2,7):
    for_nnet = pd.DataFrame()
    for k,g in votes[votes.Ans_count==nb_choices_nnet].groupby(['Votes']):
        if k>1:
            for_nnet = for_nnet.append([g]*(int(k)),ignore_index=True)
        else:
            for_nnet = for_nnet.append(g,ignore_index=True)
    for_nnet.loc[for_nnet.Votes>1,'Votes']=1
    for_nnet = for_nnet[['QuestionId','AnsRank','Ans_count','AnsId','Age','Votes','ReScore','Norm_Pos','Norm_Pos_2','Norm_DRank','EPbias']]
    for_nnet = for_nnet[for_nnet.Votes>0]
    for_nnet = pd.merge(for_nnet,qt_idx.reset_index(drop=False),how='left',on=['QuestionId','Age'])
    for_nnet.to_csv('Qtnnet%d.csv'%nb_choices_nnet)
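
The replication above turns a row that received k votes on a given day into k identical unit-vote rows, i.e. one observation per cast vote for the choice model. On a toy frame with hypothetical counts:

toy = pd.DataFrame({'AnsId': [8, 9], 'Votes': [3, 1]})
out = pd.DataFrame()
for k, g in toy.groupby('Votes'):
    out = out.append([g]*int(k) if k > 1 else g, ignore_index=True)
out.loc[out.Votes > 1, 'Votes'] = 1
print(out.AnsId.tolist())  # [9, 8, 8, 8]: three unit votes for answer 8, one for answer 9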

In [18]:
for_nnet = pd.DataFrame()
for k,g in votes.groupby(['Votes']):
    if k>1:
        for_nnet = for_nnet.append([g]*(int(k)),ignore_index=True)
    else:
        for_nnet = for_nnet.append(g,ignore_index=True)
# post-process once, over the fully replicated frame
for_nnet.loc[for_nnet.Votes>1,'Votes']=1
for_nnet = for_nnet[['QuestionId','AnsRank','Ans_count','AnsId','Age','Votes','ReScore','Norm_Pos','Norm_Pos_2','Norm_DRank','EPbias']]
for_nnet = for_nnet[for_nnet.Votes>0]
for_nnet = pd.merge(for_nnet,qt_idx.reset_index(drop=False),how='left',on=['QuestionId','Age'])
for_nnet.to_csv('QtnnetAll.csv')

In [ ]: