In [1]:
import numpy as np
np.random.seed(2345)
import pandas as pd
In [2]:
questions = pd.read_csv("./Questions.csv", encoding='latin1')
answers = pd.read_csv("./Answers.csv", encoding='latin1')
tags = pd.read_csv("./Tags.csv", encoding='latin1')
In [3]:
tags.head()
Out[3]:
In [4]:
answers.head()
Out[4]:
In [5]:
questions.head()
Out[5]:
In [6]:
questions.info()  # Id, Title and Body are used to construct the dataset
In [7]:
answers.info()  # OwnerUserId, ParentId and IsAcceptedAnswer are used to construct the dataset; Score may be useful later
In [8]:
tags.info()  # Id and Tag are the useful columns
First, process questions: data cleaning
In [9]:
# extract every <code> block from the question bodies
temp_code = questions['Body'].str.extractall(r'(<code>[^<]+</code>)')
In [10]:
temp_code.head()
Out[10]:
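For readers unfamiliar with str.extractall: it returns one row per regex hit, indexed by (original row, match number), which is why the next cell unstacks the match level. A minimal sketch on invented bodies:

import pandas as pd

sample = pd.Series([
    '<p>How do I loop?</p><code>for i in x:</code><code>print(i)</code>',
    '<p>No code here.</p>',
])
print(sample.str.extractall(r'(<code>[^<]+</code>)'))
# the result carries a MultiIndex (row, match):
#                                  0
#   match
# 0 0      <code>for i in x:</code>
#   1         <code>print(i)</code>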
In [11]:
# unstack so each match becomes its own column, then drop the extra column level
test = temp_code.unstack('match')
test.columns = test.columns.droplevel()
# concatenate all code snippets of a question into a single string
code = test.apply(lambda x: x.str.cat(), axis=1).to_frame()
# rename
code.columns = ['CodeBody']
# finally strip the HTML tags and line breaks
code['CodeBody'] = code['CodeBody'].str.replace(r'<[^>]+>|\n|\r', ' ', regex=True)
In [12]:
# remove the code part from the question bodies
body = questions['Body'].str.replace(r'<code>[^<]+</code>', ' ', regex=True)
# strip the remaining HTML tags and line breaks to get the plain-text question
questions['QuestionBody'] = body.str.replace(r'<[^>]+>|\n|\r', ' ', regex=True)
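Regexes over HTML are brittle (a literal < inside a snippet breaks the [^<]+ pattern). A more robust sketch of the same text/code split using BeautifulSoup, assuming bs4 is available (it is not used elsewhere in this notebook):

from bs4 import BeautifulSoup

def split_body(html):
    """Return (plain_text, code_text) for one question body."""
    soup = BeautifulSoup(html, 'html.parser')
    # collect the text of every <code> element
    code = ' '.join(tag.get_text(' ', strip=True) for tag in soup.find_all('code'))
    # drop the <code> elements, then take whatever text remains
    for tag in soup.find_all('code'):
        tag.decompose()
    return soup.get_text(' ', strip=True), code

text, code = split_body('<p>Why does <code>1/2 == 0</code> in Python 2?</p>')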
In [13]:
# Join the codebody by index
questions = questions.join(code)
# final cleaned dataset
questions_final = questions.drop('Body',axis=1)
In [14]:
questions_final.head()
Out[14]:
In [15]:
questions_final.info()
Next, process tags and join them onto the questions
In [16]:
tags = tags[tags.Tag.notnull()]
In [17]:
tagsByquestion = tags.groupby('Id',as_index=False).agg(lambda x: ' '.join(x))
In [18]:
tagsByquestion.head()
Out[18]:
In [19]:
tagsByquestion.info()
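On a toy frame with invented Ids, the groupby above collapses the one-row-per-tag layout into one space-separated tag string per question:

import pandas as pd

toy = pd.DataFrame({'Id': [1, 1, 2], 'Tag': ['python', 'pandas', 'numpy']})
print(toy.groupby('Id', as_index=False).agg(lambda x: ' '.join(x)))
#    Id            Tag
# 0   1  python pandas
# 1   2          numpy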
In [20]:
questions_tags = questions_final.merge(tagsByquestion,on='Id',how='left')
In [21]:
questions_tags.head()
Out[21]:
In [22]:
questions_tags.info()
In [23]:
questions_tags = questions_tags.drop(['OwnerUserId','CreationDate','Score'], axis=1)
In [24]:
questions_tags.head()
Out[24]:
In [25]:
questions_tags.info()
Then process answers to find the questions that have an accepted answer
In [26]:
accepted_answers = answers[answers.IsAcceptedAnswer == True]
In [27]:
accepted_answers.head()
Out[27]:
In [28]:
accepted_answers.info()
In [29]:
%matplotlib inline
In [30]:
# Count how many accepted answers each of the top experts has provided:
accepted_answers["OwnerUserId"].value_counts().head(10).plot(kind="barh")
Out[30]:
In [31]:
accepted_answers["OwnerUserId"].value_counts().head(10)
Out[31]:
In [32]:
accepted_answers = accepted_answers.drop(['Id', 'CreationDate', 'Score', 'IsAcceptedAnswer', 'Body'], axis=1)
In [33]:
col_mapping = {'OwnerUserId': 'ExpertId',
               'ParentId': 'Id'}
accepted_answers = accepted_answers.rename(columns=col_mapping)
In [34]:
accepted_answers.head()
Out[34]:
In [35]:
accepted_answers.info()
In [36]:
accepted_answers = accepted_answers.dropna()
In [37]:
accepted_answers.info()
In [38]:
unique_expert = accepted_answers.ExpertId.unique()
In [39]:
unique_expert.shape
Out[39]:
In [54]:
count = accepted_answers['ExpertId'].value_counts()
In [55]:
count_df = pd.DataFrame(count)
In [56]:
count_df = count_df.reset_index()
In [57]:
count_df
Out[57]:
In [58]:
col_mapping2 = {'ExpertId': 'Count',
                'index': 'ExpertId'}
count_df = count_df.rename(columns=col_mapping2)
In [59]:
count_df
Out[59]:
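The rename dance works because reset_index() on a Series names the columns index and ExpertId in this pandas version. A more self-documenting equivalent, assuming a pandas version that supports reset_index(name=...):

count_df = (accepted_answers['ExpertId']
            .value_counts()
            .rename_axis('ExpertId')    # name the index before it becomes a column
            .reset_index(name='Count'))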
Combine the datasets
In [60]:
# right join: keep only the questions that have an accepted answer
questions_answers = questions_tags.merge(accepted_answers, on='Id', how='right')
In [61]:
type(questions_answers)
Out[61]:
In [62]:
questions_answers.head()
Out[62]:
In [63]:
questions_answers.info()
In [64]:
experts_count = questions_answers.merge(count_df, on='ExpertId', how='left')
In [65]:
experts_count.head()
Out[65]:
In [139]:
final = experts_count.iloc[:70000]  # keep the first 70,000 rows as the working dataset
In [70]:
final.info()
In [71]:
final_unique_expert = final.ExpertId.unique()
final_unique_expert.shape
Out[71]:
In [162]:
import pickle
# persist the working dataset for the modeling stage
with open('qa.pkl', 'wb') as f:
    pickle.dump(final, f)
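For reference, pandas can do the same round trip natively; a one-line alternative that writes an equivalent file:

final.to_pickle('qa.pkl')   # loaded back below with pd.read_pickle('qa.pkl')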
In [74]:
df['ExpertId'] = df['ExpertId'].astype(np.int32)
In [47]:
db=df['ExpertId'].value_counts()
In [75]:
import sys
sys.path.append('d:/miniconda/lib/site-packages')
import jieba
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB
import pickle
In [76]:
df=pd.read_pickle('qa.pkl')
In [175]:
df=df.fillna('none')
In [133]:
tfv = TfidfVectorizer(min_df=3, max_df=0.95, sublinear_tf=True)
# note: each fit_transform refits the vectorizer, so the three matrices
# below are built from three different vocabularies
x_title = tfv.fit_transform(df['Title'])
x_q = tfv.fit_transform(df['QuestionBody'])
x_tag = tfv.fit_transform(df['Tag'])
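Since the single tfv instance is refit three times, only the last vocabulary survives on the object, and the three matrices are not directly comparable. To feed title, body and tag features to one model, a sketch with one vectorizer per field (variable names are illustrative):

from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer

tfv_title = TfidfVectorizer(min_df=3, max_df=0.95, sublinear_tf=True)
tfv_body = TfidfVectorizer(min_df=3, max_df=0.95, sublinear_tf=True)
tfv_tag = TfidfVectorizer(min_df=3, max_df=0.95, sublinear_tf=True)

# column-stack the three sparse feature blocks into one matrix
x_all = hstack([tfv_title.fit_transform(df['Title']),
                tfv_body.fit_transform(df['QuestionBody']),
                tfv_tag.fit_transform(df['Tag'])]).tocsr()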
In [204]:
df
Out[204]:
In [205]:
# boolean masks: questions whose expert has more than N accepted answers
k1 = df.Count > 10
k2 = df.Count > 20
k3 = df.Count > 30
k4 = df.Count > 40
In [206]:
# fit TF-IDF on the filtered training tags plus the held-out tags
# (rows past 80,000 serve as the test set)
x_tag1 = tfv.fit_transform(list(df[:80000][k1]['Tag']) + list(df[80000:]['Tag']))
x_tag2 = tfv.fit_transform(list(df[:80000][k2]['Tag']) + list(df[80000:]['Tag']))
x_tag3 = tfv.fit_transform(list(df[:80000][k3]['Tag']) + list(df[80000:]['Tag']))
x_tag4 = tfv.fit_transform(list(df[:80000][k4]['Tag']) + list(df[80000:]['Tag']))
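Fitting the vectorizer on training plus held-out rows lets the test set influence the IDF weights. A leakage-free variant, keeping the same 80,000 split, fits on the training tags only:

tfv = TfidfVectorizer(min_df=3, max_df=0.95, sublinear_tf=True)
x_train1 = tfv.fit_transform(df[:80000][k1]['Tag'])   # learn vocabulary and IDF from training rows
x_test = tfv.transform(df[80000:]['Tag'])             # reuse them on the held-out rows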
In [211]:
# training subsets corresponding to each threshold mask
final1 = df[:80000][k1]
final2 = df[:80000][k2]
final3 = df[:80000][k3]
final4 = df[:80000][k4]
In [144]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df['ExpertId'])
le.classes_
a = le.transform(df['ExpertId'])
df['cate'] = a  # encoded expert id; used as the classification target below
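A quick round trip on invented ids shows what the encoder does:

from sklearn import preprocessing

le_demo = preprocessing.LabelEncoder()
codes = le_demo.fit_transform([42, 7, 42, 99])   # classes_ is sorted: [7, 42, 99] -> [1, 0, 1, 2]
print(le_demo.inverse_transform(codes))          # -> [42  7 42 99]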
In [212]:
import sklearn.metrics
lr = LogisticRegression(C=2)
print('threshold 1, tag features')
lr.fit(x_tag1[:final1.shape[0]], df['cate'][:80000][k1])
y = lr.predict(x_tag1[final1.shape[0]:])
sklearn.metrics.accuracy_score(y, df['cate'][80000:])
Out[212]:
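For an expert-recommendation task, top-k accuracy (is the true expert among the k highest-scoring candidates?) is often more informative than plain accuracy. A version-agnostic sketch built on predict_proba; the commented call reuses the names from the cell above:

import numpy as np

def top_k_accuracy(model, X, y_true, k=5):
    """Fraction of rows whose true label is among the k most probable classes."""
    proba = model.predict_proba(X)
    top_k = np.argsort(proba, axis=1)[:, -k:]   # column indices of the k largest probabilities
    classes = model.classes_[top_k]             # map column indices back to class labels
    return np.mean([y in row for y, row in zip(np.asarray(y_true), classes)])

# top_k_accuracy(lr, x_tag1[final1.shape[0]:], df['cate'][80000:], k=5)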
In [195]:
final2.shape[0]
Out[195]:
In [199]:
print('threshold 2, tag features')
lr.fit(x_tag2[:final2.shape[0]], df['cate'][:80000][k2])
y = lr.predict(x_tag2[final2.shape[0]:])
sklearn.metrics.accuracy_score(y, df['cate'][80000:])
Out[199]:
In [200]:
print('threshold 3, tag features')
lr.fit(x_tag3[:final3.shape[0]], df['cate'][:80000][k3])
y = lr.predict(x_tag3[final3.shape[0]:])
sklearn.metrics.accuracy_score(y, df['cate'][80000:])
Out[200]:
In [202]:
print('threshold 4, tag features')
lr.fit(x_tag4[:final4.shape[0]], df['cate'][:80000][k4])
y = lr.predict(x_tag4[final4.shape[0]:])
sklearn.metrics.accuracy_score(y, df['cate'][80000:])
Out[202]:
In [107]:
# earlier experiment: 60,000-row split on the question-body features
y = lr.predict(x_q[60000:])
In [108]:
import sklearn.metrics
sklearn.metrics.accuracy_score(y,a[60000:])
Out[108]:
In [111]:
from sklearn.naive_bayes import BernoulliNB as BNL
bnl = BNL(alpha=0.2, binarize=0, class_prior=None, fit_prior=True)
bnl.fit(x_tag[:60000], a[:60000])
Out[111]:
In [112]:
y = bnl.predict(x_tag[60000:])
sklearn.metrics.accuracy_score(y, a[60000:])
Out[112]:
Data consolidation is done; now construct the experimental dataset
In [36]:
# build each expert's profile by concatenating the text of every question they answered
expert_profile = questions_answers.groupby('ExpertId', as_index=True).agg(lambda x: ' '.join(x))
In [37]:
expert_profile.head()
Out[37]:
In [38]:
expert_profile.info()
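On invented data, the aggregation concatenates every text column per expert, which is why all remaining columns must be strings:

import pandas as pd

toy = pd.DataFrame({'ExpertId': [1, 1, 2],
                    'Title': ['loop question', 'dict question', 'numpy question'],
                    'Tag': ['python', 'python dict', 'numpy']})
print(toy.groupby('ExpertId').agg(lambda x: ' '.join(x)))
#                                   Title                 Tag
# ExpertId
# 1         loop question dict question  python python dict
# 2                      numpy question               numpy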
In [39]:
expert_profile2 = questions_answers.set_index(['ExpertId'], drop=True)
In [56]:
expert_profile2.info()