In [1]:
import numpy as np
np.random.seed(2345)
import pandas as pd
In [2]:
questions = pd.read_csv("./input/Questions.csv", encoding='latin1')
answers = pd.read_csv("./input/Answers.csv", encoding='latin1')
tags = pd.read_csv("./input/Tags.csv", encoding='latin1')
In [3]:
tags.head()
Out[3]:
In [4]:
answers.head()
Out[4]:
In [5]:
questions.head()
Out[5]:
In [6]:
questions.info()  # Id, Title, and Body are used for constructing the dataset
In [7]:
answers.info()  # OwnerUserId, ParentId, and IsAcceptedAnswer are used for constructing the dataset; Score may be useful in the future
In [8]:
tags.info() # Id and Tag are useful
First, process questions: data cleaning
In [9]:
# extract all the code part
temp_code = questions['Body'].str.extractall(r'(<code>[^<]+</code>)')
In [10]:
temp_code.head()
Out[10]:
In [11]:
# unstack and convert into a single column for cleaning
test = temp_code.unstack('match')
test.columns = test.columns.droplevel()
# put all columns together
code = pd.DataFrame(test.apply(lambda x: x.str.cat(), axis=1))
# rename
code.columns = ['CodeBody']
# remove the html tags finally
code['CodeBody'] = code['CodeBody'].str.replace(r'<[^>]+>|\n|\r', ' ', regex=True)
In [12]:
# remove the code part from questions
body = questions['Body'].str.replace(r'<code>[^<]+</code>', ' ', regex=True)
# strip the remaining HTML tags and newlines to build the question text
questions['QuestionBody'] = body.str.replace(r"<[^>]+>|\n|\r", " ", regex=True)
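As a quick check of the two regexes above, the sketch below runs the same extract-and-strip logic on a single made-up Body string (the sample HTML is hypothetical, not from the dataset):
In [ ]:
# hypothetical sample question body, for illustration only
sample = pd.Series(['<p>How do I print a value?</p><pre><code>print("hi")\n</code></pre>'])
# extract the <code> blocks with the same pattern used above
print(sample.str.extractall(r'(<code>[^<]+</code>)'))
# drop the code blocks, then strip the remaining HTML tags and newlines
cleaned = sample.str.replace(r'<code>[^<]+</code>', ' ', regex=True)
cleaned = cleaned.str.replace(r'<[^>]+>|\n|\r', ' ', regex=True)
print(cleaned[0])  # plain question text with tags and code removed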
In [13]:
# Join the codebody by index
questions = questions.join(code)
# final cleaned dataset
questions_final = questions.drop('Body',axis=1)
In [14]:
questions_final.head()
Out[14]:
In [15]:
questions_final.info()
Next, process tags and join them onto the questions
In [16]:
tags = tags[tags.Tag.notnull()]
In [17]:
tagsByquestion = tags.groupby('Id',as_index=False).agg(lambda x: ' '.join(x))
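On a toy frame, this groupby/agg collapses the one-row-per-tag layout into a single space-separated tag string per question (the example frame is made up):
In [ ]:
demo = pd.DataFrame({'Id': [1, 1, 2], 'Tag': ['python', 'pandas', 'flask']})
# each question Id now gets one row with its tags joined by spaces
print(demo.groupby('Id', as_index=False).agg(lambda x: ' '.join(x)))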
In [18]:
tagsByquestion.head()
Out[18]:
In [19]:
tagsByquestion.info()
In [20]:
questions_tags = questions_final.merge(tagsByquestion,on='Id',how='left')
In [21]:
questions_tags.head()
Out[21]:
In [22]:
questions_tags.info()
At this point, Questions and Tags have been processed and merged.
In [23]:
questions_tags = questions_tags.drop(['OwnerUserId','CreationDate','Score'], axis=1)
In [24]:
questions_tags.head()
Out[24]:
In [25]:
questions_tags.info()
Next, process answers to find the questions that have an accepted answer
In [26]:
accepted_answers = answers[answers.IsAcceptedAnswer == True]
In [27]:
accepted_answers.head()
Out[27]:
In [28]:
accepted_answers.info()
In [29]:
%matplotlib inline
In [30]:
# Let's compute the number of best answers the experts have proposed:
accepted_answers["OwnerUserId"].value_counts().head(10).plot(kind="barh")
Out[30]:
In [31]:
accepted_answers["OwnerUserId"].value_counts().head(10)
Out[31]:
In [32]:
accepted_answers = accepted_answers.drop(['Id','CreationDate','Score','IsAcceptedAnswer' ,'Body'], axis=1)
In [33]:
col_mapping = {'OwnerUserId' : 'ExpertId',
'ParentId' : 'Id'}
accepted_answers = accepted_answers.rename(columns=col_mapping, copy = False)
In [34]:
accepted_answers.head()
Out[34]:
In [35]:
accepted_answers.info()
In [36]:
accepted_answers = accepted_answers.dropna()
In [37]:
accepted_answers.info()
In [38]:
unique_expert = accepted_answers.ExpertId.unique()
unique_expert.shape
Out[38]:
In [39]:
count = accepted_answers['ExpertId'].value_counts()
In [40]:
count_df = pd.DataFrame(count)
In [41]:
count_df = count_df.reset_index()
In [42]:
col_mapping2 = {'ExpertId' : 'Count',
'index' : 'ExpertId'}
count_df = count_df.rename(columns=col_mapping2, copy = False)
In [43]:
count_df.head()
Out[43]:
In [44]:
count_df.info()
Combine the datasets
In [45]:
questions_answers = questions_tags.merge(accepted_answers,on='Id',how='right')
In [46]:
questions_answers.head()
Out[46]:
In [47]:
questions_answers.info()
In [48]:
experts_count = questions_answers.merge(count_df, on='ExpertId', how='left')
In [49]:
experts_count.head()
Out[49]:
In [50]:
experts_count.info()
In [51]:
experts_count.columns
Out[51]:
In [52]:
experts_count = experts_count.reindex(columns=[u'Id', u'Title', u'QuestionBody', u'CodeBody', u'Tag', u'ExpertId',
u'Count', u'Label'])
In [53]:
from sklearn.preprocessing import LabelEncoder
label=LabelEncoder()
experts_count['Label']=label.fit_transform(experts_count['ExpertId'])
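LabelEncoder assigns each ExpertId a dense integer code in 0..N-1; label.inverse_transform maps encoded labels back to the original ExpertId, for example:
In [ ]:
# sanity check: decoding the first few labels should reproduce the original ExpertId values
print(label.inverse_transform(experts_count['Label'].values[:5]))
print(experts_count['ExpertId'].values[:5])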
In [54]:
experts_count.head()
Out[54]:
In [55]:
max_label = np.max(experts_count.Label)
min_label = np.min(experts_count.Label)
print(max_label)
print(min_label)
In [56]:
experts_count.info()
In [57]:
import pickle
pickle.dump(experts_count,open('experts_count.pkl','wb'))
In [1]:
import numpy as np
import pandas as pd
experts_count=pd.read_pickle('experts_count.pkl')
experts_count=experts_count.fillna('none')
In [2]:
experts_count.columns
Out[2]:
In [3]:
train1 = experts_count[:80000]
train1 = train1[train1.Count > 10]
test1 = experts_count[80000:]
In [4]:
train1_unique_expert = train1.ExpertId.unique()
print ("number of experts in train set: %r " % train1_unique_expert.shape)
test1_unique_expert = test1.ExpertId.unique()
print ("number of experts in test set: %r" % test1_unique_expert.shape)
print ("type : %r" % type(test1_unique_expert))
l = np.intersect1d(train1_unique_expert,test1_unique_expert)
print ("the number of experts both in train set and test set: %r" % l.shape)
In [5]:
title_train1 = train1.drop(['Id','QuestionBody','CodeBody','Tag', \
'ExpertId','Count',],axis=1)
title_test1 = test1.drop(['Id','QuestionBody','CodeBody','Tag', \
'ExpertId','Count',],axis=1)
In [6]:
title_train1 = title_train1.set_index('Label')
title_test1 = title_test1.set_index('Label')
In [7]:
title_train1.info()
In [8]:
title_test1.info()
In [9]:
# From fastText's test labels we find that 5176 of the 7404 test-set questions belong to experts that also appear in the training set
5176/7404.0
Out[9]:
In [10]:
title_train1.to_csv('title_train1', encoding='utf-8')
title_test1.to_csv('title_test1', encoding='utf-8')
At this point, the title_set1 dataset (experts with Count > 10) is complete! It is used for the classification model.
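Since the classification step relies on fastText, here is a minimal sketch of converting the CSVs written above into fastText's __label__ format; the output file names and the fasttext package calls are assumptions, not part of this notebook:
In [ ]:
import pandas as pd

def to_fasttext(src, dst):
    # one line per question: "__label__<expert-label> <title text>"
    df = pd.read_csv(src)  # columns: Label, Title
    with open(dst, 'w', encoding='utf-8') as f:
        for lab, title in zip(df['Label'], df['Title']):
            f.write('__label__{} {}\n'.format(lab, str(title).replace('\n', ' ')))

to_fasttext('title_train1', 'title_train1.ft')
to_fasttext('title_test1', 'title_test1.ft')

# hypothetical training/evaluation calls (require the `fasttext` package):
# import fasttext
# model = fasttext.train_supervised('title_train1.ft')
# print(model.test('title_test1.ft'))  # (N, precision@1, recall@1)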
In [1]:
import numpy as np
import pandas as pd
experts_count=pd.read_pickle('experts_count.pkl')
experts_count=experts_count.fillna('none')
In [2]:
train = experts_count[:80000]
test = experts_count[80000:]
title_train = train.drop(['Id','QuestionBody','CodeBody','Tag', \
'ExpertId','Count',],axis=1)
title_test = test.drop(['Id','QuestionBody','CodeBody','Tag', \
'ExpertId','Count',],axis=1)
In [3]:
title_train = title_train.groupby('Label',as_index=True).agg(lambda x: ' '.join(x))
title_test = title_test.set_index('Label')
In [4]:
title_train.info()
In [5]:
title_test.info()
In [7]:
train_unique_expert = train.ExpertId.unique()
print ("number of experts in train set: %r " % train_unique_expert.shape)
test_unique_expert = test.ExpertId.unique()
print ("number of experts in test set: %r" % test_unique_expert.shape)
print ("type : %r" % type(test_unique_expert))
l = np.intersect1d(train_unique_expert,test_unique_expert)
print ("the number of experts both in train set and test set: %r" % l.shape)
# 1417 - 698 = 719, 719 + 8285 = 9004: 719 experts appear in the test set but not in the training set.
In [8]:
title_train.to_csv('title_train_similarity',encoding='utf-8')
title_test.to_csv('title_test_similarity',encoding='utf-8')
At this point, the title_set dataset is complete! It is used for the similarity model.
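For the similarity model itself (not shown in this notebook), one plausible approach is a TF-IDF cosine-similarity ranking of experts for each test title; the sketch below assumes scikit-learn and the CSV layout written above:
In [ ]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

train_sim = pd.read_csv('title_train_similarity')  # one row per expert Label, titles concatenated
test_sim = pd.read_csv('title_test_similarity')    # one row per test question

vectorizer = TfidfVectorizer(stop_words='english')
expert_vecs = vectorizer.fit_transform(train_sim['Title'])
test_vecs = vectorizer.transform(test_sim['Title'].fillna('none'))

# for each test title, pick the expert whose concatenated titles are most similar
sims = cosine_similarity(test_vecs, expert_vecs)
best_expert = train_sim['Label'].values[sims.argmax(axis=1)]
print(best_expert[:10])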
Remove stop words
In [ ]:
import nltk
stopset = set(nltk.corpus.stopwords.words('english'))
In [ ]:
texts = list(experts_count.Title)
# Tokenize the titles
texts = [nltk.word_tokenize(text) for text in texts]
# pos tag the tokens
txtpos = [nltk.pos_tag(tokens) for tokens in texts]
# for titles we only care about verbs and nouns
txtpos = [[w for w in s if (w[1][0] == 'N' or w[1][0] == 'V') and \
w[0].lower() not in stopset]
for s in txtpos]
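For example (a made-up title, assuming the NLTK tokenizer and tagger models are available), the filter above keeps only nouns and verbs that are not stop words:
In [ ]:
sample_title = "How do I merge two dictionaries in Python?"
tokens = nltk.word_tokenize(sample_title)
tagged = nltk.pos_tag(tokens)
kept = [w for w in tagged
        if (w[1][0] == 'N' or w[1][0] == 'V') and w[0].lower() not in stopset]
print(kept)  # e.g. [('merge', 'VB'), ('dictionaries', 'NNS'), ('Python', 'NNP')]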
In [ ]:
qbodys = list(experts_count.QuestionBody)
#break into sentences
qsents = [nltk.sent_tokenize(text) for text in qbodys]
# Tokenize the question body
qbodys = [nltk.word_tokenize(text) for text in qbodys]
# attach tags to the body
qpos = [nltk.pos_tag(tokens) for tokens in qbodys]
In [ ]:
In [59]:
import pickle
pickle.dump([train1, test1], open('data1.pkl', 'wb'))
In [ ]: