In [1]:
import numpy as np
np.random.seed(2345)

import pandas as pd

In [2]:
questions = pd.read_csv("./Questions.csv", encoding='latin1')
answers = pd.read_csv("./Answers.csv", encoding='latin1')
tags = pd.read_csv("./Tags.csv", encoding='latin1')

In [3]:
tags.head()


Out[3]:
Id Tag
0 77434 vector
1 79709 memory
2 79709 function
3 79709 global-variables
4 79709 side-effects

In [4]:
answers.head()


Out[4]:
Id OwnerUserId CreationDate ParentId Score IsAcceptedAnswer Body
0 79741 3259.0 2008-09-17T03:43:22Z 79709 -1 False <p>It's tough to say definitively without know...
1 79768 6043.0 2008-09-17T03:48:29Z 79709 5 False <p>use variables in the outer function instead...
2 79779 8002.0 2008-09-17T03:49:36Z 79709 0 False <p>Third approach: inner function returns a re...
3 79788 NaN 2008-09-17T03:51:30Z 79709 3 False <p>It's not going to make much difference to m...
4 79827 14257.0 2008-09-17T03:58:26Z 79709 1 False <p>I'm not sure I understand the question, but...

In [5]:
questions.head()


Out[5]:
Id OwnerUserId CreationDate Score Title Body
0 77434 14008.0 2008-09-16T21:40:29Z 134 How to access the last value in a vector? <p>Suppose I have a vector that is nested in a...
1 79709 NaN 2008-09-17T03:39:16Z 1 Worse sin: side effects or passing massive obj... <p>I have a function inside a loop inside a fu...
2 95007 15842.0 2008-09-18T17:59:19Z 48 Explain the quantile() function in R <p>I've been mystified by the R quantile funct...
3 103312 NaN 2008-09-19T16:09:26Z 4 How to test for the EOF flag in R? <p>How can I test for the <code>EOF</code> fla...
4 255697 1941213.0 2008-11-01T15:48:30Z 3 Is there an R package for learning a Dirichlet... <p>I'm looking for a an <code>R</code> package...

In [6]:
questions.info() # Id Title Body are used for constructing the dataset


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147075 entries, 0 to 147074
Data columns (total 6 columns):
Id              147075 non-null int64
OwnerUserId     146129 non-null float64
CreationDate    147075 non-null object
Score           147075 non-null int64
Title           147075 non-null object
Body            147075 non-null object
dtypes: float64(1), int64(2), object(3)
memory usage: 6.7+ MB

In [7]:
answers.info() # OwnerUserId ParentId IsAcceptedAnswer are used for constructing the dataset, maybe score can be used in the future


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198592 entries, 0 to 198591
Data columns (total 7 columns):
Id                  198592 non-null int64
OwnerUserId         197964 non-null float64
CreationDate        198592 non-null object
ParentId            198592 non-null int64
Score               198592 non-null int64
IsAcceptedAnswer    198592 non-null bool
Body                198592 non-null object
dtypes: bool(1), float64(1), int64(3), object(2)
memory usage: 9.3+ MB

In [8]:
tags.info() # Id and Tag are useful


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241546 entries, 0 to 241545
Data columns (total 2 columns):
Id     241546 non-null int64
Tag    241485 non-null object
dtypes: int64(1), object(1)
memory usage: 3.7+ MB

先处理questions,数据清洗


In [9]:
# extract all the code part 
temp_code = questions['Body'].str.extractall(r'(<code>[^<]+</code>)')

In [10]:
temp_code.head()


Out[10]:
0
match
0 0 <code>length()</code>
1 <code>$#</code>
2 <code>dat$vec1$vec2[$#]\n</code>
3 <code>dat$vec1$vec2[length(dat$vec1$vec2)]\n</...
1 0 <code>for (dataset in list_of_datasets) {\n f...

In [11]:
# unstack and convert into a single column for cleaning
test = temp_code.unstack('match')

test.columns = test.columns.droplevel()
# put all columns together
code = pd.DataFrame(test.apply(lambda x: x.str.cat(), axis=1,reduce=True))
# rename 
code.columns = ['CodeBody']
# remove the html tags finally
code['CodeBody'] = code['CodeBody'].str.replace(r'<[^>]+>|\n|\r',' ')

In [12]:
# remove the code part from questions
body = questions['Body'].str.replace(r'<code>[^<]+</code>',' ')
# build up the question part from questions
questions['QuestionBody'] = body.str.replace(r"<[^>]+>|\n|\r", " ")

In [13]:
# Join the codebody by index
questions = questions.join(code)
# final cleaned dataset
questions_final = questions.drop('Body',axis=1)

In [14]:
questions_final.head()


Out[14]:
Id OwnerUserId CreationDate Score Title QuestionBody CodeBody
0 77434 14008.0 2008-09-16T21:40:29Z 134 How to access the last value in a vector? Suppose I have a vector that is nested in a d... length() $# dat$vec1$vec2[$#] dat$vec1$ve...
1 79709 NaN 2008-09-17T03:39:16Z 1 Worse sin: side effects or passing massive obj... I have a function inside a loop inside a func... for (dataset in list_of_datasets) { for (da...
2 95007 15842.0 2008-09-18T17:59:19Z 48 Explain the quantile() function in R I've been mystified by the R quantile functio... NaN
3 103312 NaN 2008-09-19T16:09:26Z 4 How to test for the EOF flag in R? How can I test for the flag in R? For e... EOF f &lt;- file(fname, "rb") while (???) { ...
4 255697 1941213.0 2008-11-01T15:48:30Z 3 Is there an R package for learning a Dirichlet... I'm looking for a an package which can be u... R R

In [15]:
questions_final.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147075 entries, 0 to 147074
Data columns (total 7 columns):
Id              147075 non-null int64
OwnerUserId     146129 non-null float64
CreationDate    147075 non-null object
Score           147075 non-null int64
Title           147075 non-null object
QuestionBody    147075 non-null object
CodeBody        134507 non-null object
dtypes: float64(1), int64(2), object(4)
memory usage: 7.9+ MB

再处理tags,将其拼在questions后


In [16]:
tags = tags[tags.Tag.notnull()]

In [17]:
tagsByquestion = tags.groupby('Id',as_index=False).agg(lambda x: ' '.join(x))

In [18]:
tagsByquestion.head()


Out[18]:
Id Tag
0 77434 vector
1 79709 memory function global-variables side-effects
2 95007 math statistics
3 103312 file file-io eof
4 255697 math statistics bayesian dirichlet

In [19]:
tagsByquestion.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 121862 entries, 0 to 121861
Data columns (total 2 columns):
Id     121862 non-null int64
Tag    121862 non-null object
dtypes: int64(1), object(1)
memory usage: 2.8+ MB

In [20]:
questions_tags = questions_final.merge(tagsByquestion,on='Id',how='left')

In [21]:
questions_tags.head()


Out[21]:
Id OwnerUserId CreationDate Score Title QuestionBody CodeBody Tag
0 77434 14008.0 2008-09-16T21:40:29Z 134 How to access the last value in a vector? Suppose I have a vector that is nested in a d... length() $# dat$vec1$vec2[$#] dat$vec1$ve... vector
1 79709 NaN 2008-09-17T03:39:16Z 1 Worse sin: side effects or passing massive obj... I have a function inside a loop inside a func... for (dataset in list_of_datasets) { for (da... memory function global-variables side-effects
2 95007 15842.0 2008-09-18T17:59:19Z 48 Explain the quantile() function in R I've been mystified by the R quantile functio... NaN math statistics
3 103312 NaN 2008-09-19T16:09:26Z 4 How to test for the EOF flag in R? How can I test for the flag in R? For e... EOF f &lt;- file(fname, "rb") while (???) { ... file file-io eof
4 255697 1941213.0 2008-11-01T15:48:30Z 3 Is there an R package for learning a Dirichlet... I'm looking for a an package which can be u... R R math statistics bayesian dirichlet

In [22]:
questions_tags.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 147075 entries, 0 to 147074
Data columns (total 8 columns):
Id              147075 non-null int64
OwnerUserId     146129 non-null float64
CreationDate    147075 non-null object
Score           147075 non-null int64
Title           147075 non-null object
QuestionBody    147075 non-null object
CodeBody        134507 non-null object
Tag             121862 non-null object
dtypes: float64(1), int64(2), object(5)
memory usage: 10.1+ MB

In [23]:
questions_tags = questions_tags.drop(['OwnerUserId','CreationDate','Score'], axis=1)

In [24]:
questions_tags.head()


Out[24]:
Id Title QuestionBody CodeBody Tag
0 77434 How to access the last value in a vector? Suppose I have a vector that is nested in a d... length() $# dat$vec1$vec2[$#] dat$vec1$ve... vector
1 79709 Worse sin: side effects or passing massive obj... I have a function inside a loop inside a func... for (dataset in list_of_datasets) { for (da... memory function global-variables side-effects
2 95007 Explain the quantile() function in R I've been mystified by the R quantile functio... NaN math statistics
3 103312 How to test for the EOF flag in R? How can I test for the flag in R? For e... EOF f &lt;- file(fname, "rb") while (???) { ... file file-io eof
4 255697 Is there an R package for learning a Dirichlet... I'm looking for a an package which can be u... R R math statistics bayesian dirichlet

In [25]:
questions_tags.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 147075 entries, 0 to 147074
Data columns (total 5 columns):
Id              147075 non-null int64
Title           147075 non-null object
QuestionBody    147075 non-null object
CodeBody        134507 non-null object
Tag             121862 non-null object
dtypes: int64(1), object(4)
memory usage: 6.7+ MB

再处理answers,找出有最佳回答的问题


In [26]:
accepted_answers = answers[answers.IsAcceptedAnswer == True]

In [27]:
accepted_answers.head()


Out[27]:
Id OwnerUserId CreationDate ParentId Score IsAcceptedAnswer Body
12 255992 23263.0 2008-11-01T19:29:54Z 255697 2 True <p>I've only come across both R and the Dirich...
13 359458 3201.0 2008-12-11T14:06:56Z 359438 2 True <p>I have used <a href="http://cran.r-project....
15 440066 37751.0 2009-01-13T18:00:54Z 439526 8 True <p>Clearly I should have worked on this for an...
18 455286 54904.0 2009-01-18T15:12:24Z 445059 7 True <p>Here's what seems like another very R-type ...
19 467131 57626.0 2009-01-21T21:38:10Z 467110 11 True <p>In most cases R is an interpreted language ...

In [28]:
accepted_answers.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 87661 entries, 12 to 198588
Data columns (total 7 columns):
Id                  87661 non-null int64
OwnerUserId         87404 non-null float64
CreationDate        87661 non-null object
ParentId            87661 non-null int64
Score               87661 non-null int64
IsAcceptedAnswer    87661 non-null bool
Body                87661 non-null object
dtypes: bool(1), float64(1), int64(3), object(2)
memory usage: 4.8+ MB

In [29]:
%matplotlib inline

In [30]:
# Let's compute the number of best answers the experts have proposed:
accepted_answers["OwnerUserId"].value_counts().head(10).plot(kind="barh")


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x84c1f860>

In [31]:
accepted_answers["OwnerUserId"].value_counts().head(10)


Out[31]:
3732271.0    4380
1855677.0    1954
2372064.0    1495
1270695.0    1360
1412059.0    1102
143305.0     1083
190277.0     1075
516548.0      995
1838509.0     991
271616.0      927
Name: OwnerUserId, dtype: int64

In [32]:
accepted_answers = accepted_answers.drop(['Id','CreationDate','Score','IsAcceptedAnswer' ,'Body'], axis=1)

In [33]:
col_mapping = {'OwnerUserId' : 'ExpertId',
               'ParentId' : 'Id'}
accepted_answers = accepted_answers.rename(columns=col_mapping, copy = False)

In [34]:
accepted_answers.head()


Out[34]:
ExpertId Id
12 23263.0 255697
13 3201.0 359438
15 37751.0 439526
18 54904.0 445059
19 57626.0 467110

In [35]:
accepted_answers.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 87661 entries, 12 to 198588
Data columns (total 2 columns):
ExpertId    87404 non-null float64
Id          87661 non-null int64
dtypes: float64(1), int64(1)
memory usage: 2.0 MB

In [36]:
accepted_answers = accepted_answers.dropna()

In [37]:
accepted_answers.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 87404 entries, 12 to 198588
Data columns (total 2 columns):
ExpertId    87404 non-null float64
Id          87404 non-null int64
dtypes: float64(1), int64(1)
memory usage: 2.0 MB

In [38]:
unique_expert = accepted_answers.ExpertId.unique()

In [39]:
unique_expert.shape


Out[39]:
(9004L,)

In [54]:
count = accepted_answers['ExpertId'].value_counts()

In [55]:
count_df = pd.DataFrame(count)

In [56]:
count_df = count_df.reset_index()

In [57]:
count_df


Out[57]:
index ExpertId
0 3732271.0 4380
1 1855677.0 1954
2 2372064.0 1495
3 1270695.0 1360
4 1412059.0 1102
5 143305.0 1083
6 190277.0 1075
7 516548.0 995
8 1838509.0 991
9 271616.0 927
10 324364.0 899
11 980833.0 832
12 3063910.0 776
13 1627235.0 753
14 3001626.0 729
15 559784.0 726
16 429846.0 711
17 602276.0 656
18 1201032.0 652
19 2985007.0 637
20 1385941.0 579
21 1857266.0 548
22 1457051.0 540
23 1327739.0 539
24 496803.0 497
25 3093387.0 486
26 496488.0 484
27 211116.0 469
28 903061.0 443
29 2725969.0 414
... ... ...
8974 623518.0 1
8975 4615298.0 1
8976 1016716.0 1
8977 1153897.0 1
8978 970171.0 1
8979 5664255.0 1
8980 970195.0 1
8981 1016714.0 1
8982 576668.0 1
8983 1247080.0 1
8984 27678.0 1
8985 3018549.0 1
8986 3353500.0 1
8987 2304944.0 1
8988 2829414.0 1
8989 5658870.0 1
8990 4989385.0 1
8991 3353760.0 1
8992 2829605.0 1
8993 3878253.0 1
8994 3018873.0 1
8995 5659679.0 1
8996 2829961.0 1
8997 4611417.0 1
8998 4611622.0 1
8999 5660437.0 1
9000 419338.0 1
9001 5661391.0 1
9002 4988601.0 1
9003 5559163.0 1

9004 rows × 2 columns


In [58]:
col_mapping2 = {'ExpertId' : 'Count',
                'index' : 'ExpertId'}
count_df = count_df.rename(columns=col_mapping2, copy = False)

In [59]:
count_df


Out[59]:
ExpertId Count
0 3732271.0 4380
1 1855677.0 1954
2 2372064.0 1495
3 1270695.0 1360
4 1412059.0 1102
5 143305.0 1083
6 190277.0 1075
7 516548.0 995
8 1838509.0 991
9 271616.0 927
10 324364.0 899
11 980833.0 832
12 3063910.0 776
13 1627235.0 753
14 3001626.0 729
15 559784.0 726
16 429846.0 711
17 602276.0 656
18 1201032.0 652
19 2985007.0 637
20 1385941.0 579
21 1857266.0 548
22 1457051.0 540
23 1327739.0 539
24 496803.0 497
25 3093387.0 486
26 496488.0 484
27 211116.0 469
28 903061.0 443
29 2725969.0 414
... ... ...
8974 623518.0 1
8975 4615298.0 1
8976 1016716.0 1
8977 1153897.0 1
8978 970171.0 1
8979 5664255.0 1
8980 970195.0 1
8981 1016714.0 1
8982 576668.0 1
8983 1247080.0 1
8984 27678.0 1
8985 3018549.0 1
8986 3353500.0 1
8987 2304944.0 1
8988 2829414.0 1
8989 5658870.0 1
8990 4989385.0 1
8991 3353760.0 1
8992 2829605.0 1
8993 3878253.0 1
8994 3018873.0 1
8995 5659679.0 1
8996 2829961.0 1
8997 4611417.0 1
8998 4611622.0 1
8999 5660437.0 1
9000 419338.0 1
9001 5661391.0 1
9002 4988601.0 1
9003 5559163.0 1

9004 rows × 2 columns

整合数据


In [60]:
questions_answers = questions_tags.merge(accepted_answers,on='Id',how='right')

In [61]:
type(questions_answers)


Out[61]:
pandas.core.frame.DataFrame

In [62]:
questions_answers.head()


Out[62]:
Id Title QuestionBody CodeBody Tag ExpertId
0 95007 Explain the quantile() function in R I've been mystified by the R quantile functio... NaN math statistics 79513.0
1 255697 Is there an R package for learning a Dirichlet... I'm looking for a an package which can be u... R R math statistics bayesian dirichlet 23263.0
2 359438 Optimization packages for R Does anyone know of any optimization packages... NaN mathematical-optimization 3201.0
3 439526 Thinking in Vectors with R I know that R works most efficiently with vec... st p1 p2 st&lt;-NULL p1&lt;-NULL p2&lt;-NU... vector 37751.0
4 445059 Vectorize my thinking: Vector Operations in R So earlier I answered my own question on thin... for (j in my.data$item[my.data$fixed==0]) { #... vector 54904.0

In [63]:
questions_answers.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 87404 entries, 0 to 87403
Data columns (total 6 columns):
Id              87404 non-null int64
Title           87404 non-null object
QuestionBody    87404 non-null object
CodeBody        81701 non-null object
Tag             71861 non-null object
ExpertId        87404 non-null float64
dtypes: float64(1), int64(1), object(4)
memory usage: 4.7+ MB

In [64]:
experts_count = questions_answers.merge(count_df, on='ExpertId', how='left')

In [65]:
experts_count.head()


Out[65]:
Id Title QuestionBody CodeBody Tag ExpertId Count
0 95007 Explain the quantile() function in R I've been mystified by the R quantile functio... NaN math statistics 79513.0 1
1 255697 Is there an R package for learning a Dirichlet... I'm looking for a an package which can be u... R R math statistics bayesian dirichlet 23263.0 1
2 359438 Optimization packages for R Does anyone know of any optimization packages... NaN mathematical-optimization 3201.0 1
3 439526 Thinking in Vectors with R I know that R works most efficiently with vec... st p1 p2 st&lt;-NULL p1&lt;-NULL p2&lt;-NU... vector 37751.0 53
4 445059 Vectorize my thinking: Vector Operations in R So earlier I answered my own question on thin... for (j in my.data$item[my.data$fixed==0]) { #... vector 54904.0 1

In [139]:
experts_count=experts_count.iloc[:70000]

In [70]:
final.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 67176 entries, 3 to 87403
Data columns (total 7 columns):
Id              67176 non-null int64
Title           67176 non-null object
QuestionBody    67176 non-null object
CodeBody        63348 non-null object
Tag             54534 non-null object
ExpertId        67176 non-null float64
Count           67176 non-null int64
dtypes: float64(1), int64(2), object(4)
memory usage: 4.1+ MB

In [71]:
final_unique_expert = final.ExpertId.unique()
final_unique_expert.shape


Out[71]:
(508L,)

In [162]:
import pickle
pickle.dump(df,open('qa.pkl','wb'))

In [74]:
df['ExpertId']=np.array(df['ExpertId']).astype(np.int32)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-74-462fee490335> in <module>()
----> 1 df['ExpertId']=np.array(df['ExpertId']).astype(np.int32)

NameError: name 'df' is not defined

In [47]:
db=df['ExpertId'].value_counts()

搭建模型


In [75]:
import sys
sys.path.append('d:/miniconda/lib/site-packages')
import jieba
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedShuffleSplit,StratifiedKFold,cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB
import pickle


D:\miniconda\envs\python27\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [76]:
df=pd.read_pickle('qa.pkl')

In [175]:
df=df.fillna('none')

In [133]:
tfv = TfidfVectorizer(min_df=3,max_df=0.95,sublinear_tf=True)
x_title = tfv.fit_transform(df['Title'])
x_q=tfv.fit_transform(df['QuestionBody'])
x_tag=tfv.fit_transform(df['Tag'])

In [204]:
df


Out[204]:
Id Title QuestionBody CodeBody Tag ExpertId Count cate
0 95007 Explain the quantile() function in R I've been mystified by the R quantile functio... none math statistics 79513.0 1 218
1 255697 Is there an R package for learning a Dirichlet... I'm looking for a an package which can be u... R R math statistics bayesian dirichlet 23263.0 1 91
2 359438 Optimization packages for R Does anyone know of any optimization packages... none mathematical-optimization 3201.0 1 15
3 439526 Thinking in Vectors with R I know that R works most efficiently with vec... st p1 p2 st&lt;-NULL p1&lt;-NULL p2&lt;-NU... vector 37751.0 53 121
4 445059 Vectorize my thinking: Vector Operations in R So earlier I answered my own question on thin... for (j in my.data$item[my.data$fixed==0]) { #... vector 54904.0 1 163
5 467110 Is R a compiled language? I can't find it anywhere on the web (and I do... none language-features 57626.0 1 171
6 476342 Converting a localized date of the form 12-okt... I have imported a time series with dates of t... test = c("11-Feb-01","12-Feb-01","01-Mai-08"... date 12677.0 2 54
7 476726 Filtering data in R I have a CSV of file of data that I can load ... read.csv() NULL DocID Anno1 Anno7 ... filtering 58681.0 2 173
8 495744 Operating with time intervals like 08:00-08:15 I would like to import a time-series where th... 08:00-08:15 08:15-08:30 08:30-08:45 time-series 148801.0 2 370
9 498932 What's the easiest way to install 100s of file... I have a standard c# application that acts as... none visual-studio setup-project 445.0 1 3
10 509595 csv file with multiple time-series I've imported a csv file with lots of columns... v &lt;- read.csv2("200109.csv", header=TRUE, ... time-series 26575.0 6 102
11 520810 Does R have quote-like operators like Perl's q... Anyone know if R has quote-like operators lik... qw() perl 16632.0 223 68
12 560329 Sort the X axis in a barplot I have binned data that looks like this: ... (8.048,18.05] (-21.95,-11.95] (-31.95,-21.9... statistics 37751.0 53 121
13 582653 what is the best practice of handling time in R? I am working with a survey dataset. It has tw... as.date() as.Date() chron() as.POSIXct() ... datetime 4892.0 4 26
14 596819 What is the best way to avoid passing a data f... I have 12 data frames to work with. They are ... doSomething &lt;- function(df) { // do some... none 37751.0 53 121
15 596976 What is the Y function? A friend of mine asked me if I understood the... ? Y none 63225.0 1 187
16 652136 How can I remove an element from a list? I have a list and I want to remove a single e... none list indexing 41665.0 1 132
17 657440 Cumulative Plot with Given X-Axis I have data that looks like this. In which I ... #x-axis dat1 dat2 -10 0.0140... statistics 37751.0 53 121
18 717747 How do I color edges or draw rects correctly i... I generated this dendrogram using R's , ... hclust() as.dendrogram() plot.dendrogram() ... edge dendrogram hclust 62970.0 1 186
19 736514 R Random Forests Variable Importance I am trying to use the random forests package... MeanDecreaseAccuracy MeanDecreaseGini MeanD... statistics data-mining random-forest 90567.0 2 241
20 736541 Plots without titles/labels in R In R is there any way to produce plots whic... plot() main sub xlab ylab NULL pdf() p... statistics charts 66519.0 1 194
21 743812 Calculating moving average in R I'm trying to use R to calculate the moving a... none statistics 60617.0 9 178
22 750703 Suppressing "null device" output with R in bat... I have a number of bash scripts which invoke ... #!/bin/bash R --vanilla --slave &lt;&lt;RSCRI... bash statistics charts 26575.0 6 102
23 775116 How can a function parameter be used without m... I've been trying to learn more about R (and w... rpart &lt;- function(formula, data, weights, ... none 26575.0 6 102
24 780796 Emacs ESS Mode - Tabbing for Comment Region I am using the Emacs-Speaks-Statistics (ESS) ... # Comment #... emacs ess 16240.0 2 66
25 789602 What does %% mean? From the question you can probably tell that ... mnlong &lt;- 280.460 + .9856474 * time mnlong... syntax 57428.0 1 168
26 805027 Suppressing or setting CreationDate/ModDate in... When R creates PDFs using pdf() it includes a... none svn 14257.0 4 59
27 855798 In R, what is a good way to aggregate String data In R (or S-PLUS), what is a good way to aggre... myList &lt;- as.data.frame(c("Bob", "Mary", "... statistics s-plus 90567.0 2 241
28 876711 Plotting Simple Data in R I have a comma separated file named contain... foo.csv scale, serial, spawn, for, worker 5,... plot 39578.0 1 127
29 936748 Declaring a Const Variable in R I'm working in R, and I'd like to define some... const std::string path( "/projects/current" )... statistics const constants 16632.0 223 68
... ... ... ... ... ... ... ... ...
87374 40133202 Calculating the transition probabilities in R Let's assume that we have the following 4 sta... old new A B A A B C D ... none 4895725.0 223 7647
87375 40133444 Change not mapped color of a geom in a given plot Suppose I have a function which is not under ... ggplot geom library(ggplot2) notMyOwnFuncti... ggplot2 3120598.0 69 5516
87376 40133779 Error in reading multple text files from direc... I would like to read multiple text files from... regional_vol_GM_atlas1.txt regional_vol_GM_... text-processing data-extraction 5132823.0 12 7928
87377 40133907 Match multiple columns in two data frames and ... So I have two large (in some part different) ... IDP PREDMET ID NAME 1 120_1 ustano... none 3703936.0 3 6278
87378 40133971 Delete unconnected short paths from a graph in... I am working with networks in igraph, I have ... net &lt;- simplify(InnatedGraph, remove.mult... graph igraph 4488105.0 1 7216
87379 40134511 lapply - assign specific value to other column... I have got a list with more than 100 data.fra... [[6]] V1 V2 V3 2 ... dataframe lapply 5038157.0 3 7824
87380 40134648 Duplicating Discrete Axis in ggplot2 The development version of ggplot2 (2.1.0.900... devtools::install_github("hadley/ggplot2") li... ggplot2 1855677.0 1954 3632
87381 40134792 Fill missing data using a linear model I have some data that looks a little like thi... ID year var1 var2 1 1 1 ... none 3115675.0 2 5510
87382 40134841 ggplot line plot labels at begining of the lin... I have a data.table of values as follows: ... &gt; head(dt) id x y 1: ... ggplot2 6805241.0 16 8966
87383 40135240 R ggplot2: Add means as horizontal line in a b... I have created a boxplot using ggplot2: ... library(ggplot2) dat &lt;- data.frame(study ... ggplot2 boxplot 2461552.0 200 4554
87384 40135281 Create a table with mean duration times per ye... I have the following dataframe: Now I... library("lubridate") df = data.frame(c("AAA"... none 5202253.0 44 7987
87385 40135594 sparkR: how to create a dummy column from char... Consider the following simple example: ... df &lt;- data.frame(id=c(1:4), climate=c("col... apache-spark sparkr grepl 4964651.0 125 7728
87386 40135653 Consolidate replicated columns in R I have a data frame that is like: In... c1 c2 c3 c4 r1 1 0 1 1 r2 0 0 1 1 ... none 809198.0 2 1714
87387 40136496 Method like argument in function I want use method in python / pandas like arg... def rolling (df, prefix = 'r', window = 3, me... python function pandas methods 2336654.0 2 4375
87388 40136801 tidyr::pop_quiz: is there a faster/ more trans... I'm trying to get good with . Is there a be... tidyr anscombe ggplot2 obs_num library(ti... reshape2 tidyr tidyverse 496488.0 484 1161
87389 40136865 Obtaining 0th column data frame information in R How would you access the date column on the l... &gt; data GMT Rate 2005-02-28... vector dataframe 1968.0 181 11
87390 40137720 rvest cannot find node with xpath This is the website I scapre ppp projects ... a &lt;- html("http://www.cpppc.org:8082/efmis... html xml web-scraping rvest 1457051.0 540 2922
87391 40138536 geom_bar produces x-axis tick text aliased/dis... The following minimal example reproduces the ... library(ggplot2) x &lt;- c(1111, 2222, 3333... ggplot2 903061.0 443 1871
87392 40138635 Concatenate rows in a column by ID in R I have a geochemical data set that is has 50 ... Field_Notes Sample_ID Sampe_ID Year Alt_... none 1627235.0 753 3264
87393 40138822 Import RDS file from github into R Windows I am trying to import a RDS file into RStudio... githubURL &lt;- ("https://github.com/derek-co... import 1235433.0 1 2509
87394 40138880 From which row a data.frame variable have a co... I would like to calculate the mean of a varia... s&lt;-"no Spc PSize 2 0 ... time-series dplyr 2966222.0 48 5313
87395 40139534 Fast way to get all pairs of matrix column ele... Let's say I have a numerical : I wan... matrix set.seed(1) mat &lt;- matrix(rnorm(10... matrix product 4891738.0 407 7640
87396 40140133 Scraping tables on multiple web pages with rve... I am new to web scraping and am trying to scr... rvest library(rvest) url4 &lt;- "http://www.... web-scraping rvest 2572423.0 113 4721
87397 40140630 How to replace symbols by their value in a R f... This code reveals that doesn't yet look up ... f q q &lt;- 2 f &lt;- function(x) q + x f ... function 822162.0 8 1726
87398 40140656 Same Rcpp function returns different output if... A C++ function I wrote using Rcpp gives diffe... Rcout Rprintf 1 H_sigma_1() H_sigma_2() ... c++ rcpp 1345455.0 50 2746
87399 40140835 Elementwise vector multiplication between all ... I have a set of models and need to generate e... modset &lt;- structure(list(A = c(1, 1), B = ... matrix dataframe matrix-multiplication 4891738.0 407 7640
87400 40141353 Convert multiple columns to one column I'm looking to merge multiple columns to one ... Column A Column B Column C a1 b... python list 6105720.0 39 8634
87401 40142682 Remove everything after last space with stringr I have data that looks like this: I w... df &lt;- tribble( ~name, ~value, "Jak... regex stringr tidyverse 3732271.0 4380 6313
87402 40142818 In R, how do I order within a single column so... I am generating multiple experimental designs... df1 &lt;- structure(list(Block = c(1L, 1L, 1L... sorting order 496803.0 497 1162
87403 40143046 r dplyr group_by values collapse and paste I have a data set that looks like this ... Id Subject Date Vitals Valu... group-by dplyr collapse 903061.0 443 1871

87404 rows × 8 columns


In [205]:
#final1=experts_count[experts_count.Count > 10]
#final2 = experts_count[experts_count.Count > 20]
#final3=experts_count[experts_count.Count > 30]
#final4 = experts_count[experts_count.Count > 40]
k1=df.Count > 10
k2=df.Count > 20
k3=df.Count > 30
k4=df.Count > 40

In [206]:
x_tag1=tfv.fit_transform(list(df[:80000][k1]['Tag'])+list(df[80000:]['Tag']))
x_tag2=tfv.fit_transform(list(df[:80000][k2]['Tag'])+list(df[80000:]['Tag']))
x_tag3=tfv.fit_transform(list(df[:80000][k3]['Tag'])+list(df[80000:]['Tag']))
x_tag4=tfv.fit_transform(list(df[:80000][k4]['Tag'])+list(df[80000:]['Tag']))
#x_tag2=tfv.fit_transform(df[:70000][k2]['Tag'])
#x_tag3=tfv.fit_transform(df[:70000][k3]['Tag'])
#x_title1=tfv.fit_transform(df[:70000][k1]['Title'])
#x_title2=tfv.fit_transform(df[:70000][k2]['Title'])
#x_title3=tfv.fit_transform(df[:70000][k3]['Title'])


D:\miniconda\envs\python27\lib\site-packages\ipykernel\__main__.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  if __name__ == '__main__':
D:\miniconda\envs\python27\lib\site-packages\ipykernel\__main__.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  from ipykernel import kernelapp as app
D:\miniconda\envs\python27\lib\site-packages\ipykernel\__main__.py:3: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  app.launch_new_instance()
D:\miniconda\envs\python27\lib\site-packages\ipykernel\__main__.py:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.

In [211]:
final1=df[:80000][k1]
final2=df[:80000][k2]
final3=df[:80000][k3]
final4=df[:80000][k4]


D:\miniconda\envs\python27\lib\site-packages\ipykernel\__main__.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  if __name__ == '__main__':
D:\miniconda\envs\python27\lib\site-packages\ipykernel\__main__.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  from ipykernel import kernelapp as app
D:\miniconda\envs\python27\lib\site-packages\ipykernel\__main__.py:3: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  app.launch_new_instance()
D:\miniconda\envs\python27\lib\site-packages\ipykernel\__main__.py:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.

In [144]:
from sklearn import preprocessing
le=preprocessing.LabelEncoder()
le.fit(df['ExpertId'])
le.classes_
a=le.transform(df['ExpertId'])

In [191]:
lr=LogisticRegression(C=2)
print ('1'+ 'tag')
lr.fit(x_tag1[:final1.shape[0]],df['cate'][:80000][k1])
y=lr.predict(x_tag1[final1.shape[0]:])
sklearn.metrics.accuracy_score(y,df['cate'][80000:])


1tag
Out[191]:
0.10491840956102046

In [212]:
lr=LogisticRegression(C=2)
print ('1'+ 'tag')
lr.fit(x_tag1[:final1.shape[0]],df['cate'][:80000][k1])
y=lr.predict(x_tag1[final1.shape[0]:])
sklearn.metrics.accuracy_score(y,df['cate'][80000:])


1tag
Out[212]:
0.11655861696380335

In [195]:
final2.shape[0]


Out[195]:
54853

In [199]:
print ('2'+ 'tag')
lr.fit(x_tag2[:final2.shape[0]],df['cate'][:70000][k2])
y=lr.predict(x_tag2[final2.shape[0]:])
sklearn.metrics.accuracy_score(y,df['cate'][70000:])


2tag
Out[199]:
0.10422891289358768

In [200]:
print ('3'+ 'tag')
lr.fit(x_tag3[:final3.shape[0]],df['cate'][:70000][k3])
y=lr.predict(x_tag3[final3.shape[0]:])
sklearn.metrics.accuracy_score(y,df['cate'][70000:])


3tag
Out[200]:
0.1036543323373937

In [202]:
print ('4'+ 'tag')
lr.fit(x_tag4[:final4.shape[0]],df['cate'][:70000][k4])
y=lr.predict(x_tag4[final4.shape[0]:])
sklearn.metrics.accuracy_score(y,df['cate'][70000:])


4tag
Out[202]:
0.10279246150310274

In [107]:
y=lr.predict(x_q[60000:])

In [108]:
import sklearn#f_svd+n,adj
sklearn.metrics.accuracy_score(y,a[60000:])


Out[108]:
0.12722965440356745

In [111]:
from sklearn.naive_bayes import BernoulliNB  as BNL
bnl=BNL(alpha=0.2, binarize=0, class_prior=None, fit_prior=True)
bnl.fit(x_tag[:60000],a[:60000])


Out[111]:
BernoulliNB(alpha=0.2, binarize=0, class_prior=None, fit_prior=True)

In [112]:
y=lr.predict(x_q[60000:])
sklearn.metrics.accuracy_score(y,a[60000:])


Out[112]:
0.12722965440356745

数据整合完毕,开始构造实验数据集


In [36]:
expert_profile = questions_answers.groupby('ExpertId',as_index=True).agg(lambda x: ' '.join(x))

In [37]:
expert_profile.head()


Out[37]:
Title QuestionBody
ExpertId
13.0 R "str" equivalent in perl How do you get the data structure of an objec...
337.0 Is there any command to exit R programming? Is there any command that can be used to exit...
419.0 How to run PowerShell command in R? For example, this PowerShell command returns ...
445.0 What's the easiest way to install 100s of file... I have a standard c# application that acts as...
459.0 Regular expression"\\|" in strsplit I wonder is equal to ? maybe is eq...

In [38]:
expert_profile.info()


<class 'pandas.core.frame.DataFrame'>
Float64Index: 9004 entries, 13.0 to 7034800.0
Data columns (total 2 columns):
Title           9004 non-null object
QuestionBody    9004 non-null object
dtypes: object(2)
memory usage: 211.0+ KB

In [39]:
expert_profile2 = questions_answers.set_index(['ExpertId'], drop=True)

In [56]:
expert_profile2.info()


<class 'pandas.core.frame.DataFrame'>
Float64Index: 87404 entries, 79513.0 to 903061.0
Data columns (total 5 columns):
Id              87404 non-null int64
Title           87404 non-null object
QuestionBody    87404 non-null object
CodeBody        81701 non-null object
Tag             71861 non-null object
dtypes: int64(1), object(4)
memory usage: 4.0+ MB

In [ ]: