In [1]:
# all of the imports
import pandas as pd
import numpy as np
import pickle
import patsy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
%matplotlib inline
from sklearn import preprocessing as pp
import warnings
warnings.filterwarnings('ignore')
In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import make_scorer
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_curve, auc
In [3]:
import nltk
import itertools
from nltk.probability import FreqDist
stopset = set(nltk.corpus.stopwords.words('english'))
In [4]:
def rotate(ax, degree):
    """Rotate the x-tick labels of an axis by the given degree."""
    for tick in ax.get_xticklabels():
        tick.set_rotation(degree)
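The helper is handy whenever categorical x-tick labels overlap; a minimal usage sketch on toy data (the plot itself is hypothetical):
In [ ]:
# toy usage: rotate crowded x-tick labels by 45 degrees
fig, ax = plt.subplots()
sns.countplot(x=pd.Series(['python', 'pandas', 'numpy', 'python']), ax=ax)
rotate(ax, 45)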
In [5]:
questions = pd.read_csv('./Questions.csv',encoding='latin1')
In [6]:
questions.info()
Conclusion: OwnerUserId has missing values.
In [8]:
questions.head()
Out[8]:
In [9]:
# extract every <code>...</code> block from the question bodies
a = questions['Body'].str.extractall(r'(<code>[^<]+</code>)')
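extractall returns one row per regex match, indexed by (question row, match number); a toy illustration on a made-up body:
In [ ]:
demo = pd.Series(['<p>hi</p><code>x = 1</code><code>y = 2</code>'])
# one row per <code> block, with a 'match' index level (0, 1, ...)
demo.str.extractall(r'(<code>[^<]+</code>)')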
In [13]:
print(type(a))
a.head(10)
Out[13]:
In [10]:
# unstack and convert into a single column for cleaning
test = a.unstack('match')
test.columns = test.columns.droplevel()
# concatenate the per-match columns into a single string per question
code = pd.DataFrame(test.apply(lambda x: x.str.cat(), axis=1))
# rename
code.columns = ['CodeBody']
# remove the html tags finally
code['CodeBody'] = code['CodeBody'].str.replace(r'<[^>]+>|\n|\r',' ')
In [14]:
# remove the code part from questions
body = questions['Body'].str.replace(r'<code>[^<]+</code>',' ')
# strip the remaining html tags and newlines to get the prose part
questions['QuestionBody'] = body.str.replace(r"<[^>]+>|\n|\r", " ")
In [15]:
# Join the codebody by index
questions = questions.join(code)
# final cleaned dataset
questions_final = questions.drop('Body',axis=1)
In [16]:
# assume all questions without an OwnerUserId come from the same placeholder user, ID 0
questions_final['OwnerUserId'].fillna(0,inplace=True)
questions_final.OwnerUserId = questions_final.OwnerUserId.astype(int)
In [27]:
questions_final.head()
Out[27]:
In [28]:
questions_final.shape
Out[28]:
In [29]:
dfFinal = questions_final.loc[(questions_final.Score>=5) | \
(questions_final.Score<0)]
In [30]:
dfFinal.head()
Out[30]:
In [31]:
dfFinal.shape
Out[31]:
In [24]:
texts = list(dfFinal.Title)
# tokenize the titles
texts = [nltk.word_tokenize(text) for text in texts]
# POS-tag the tokens
txtpos = [nltk.pos_tag(tokens) for tokens in texts]
# for titles we only care about verbs and nouns
txtpos = [[w for w in s if (w[1][0] == 'N' or w[1][0] == 'V')
           and w[0].lower() not in stopset]
          for s in txtpos]
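The filter relies on Penn Treebank tags, where noun tags start with 'N' and verb tags with 'V'; a quick sanity check on a made-up title:
In [ ]:
sample = nltk.pos_tag(nltk.word_tokenize('How do I merge two dataframes in pandas'))
# should keep content words like 'merge', 'dataframes', 'pandas'
[w for w in sample if w[1][0] in ('N', 'V') and w[0].lower() not in stopset]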
In [32]:
qbodys = list(dfFinal.QuestionBody)
# break each body into sentences
qsents = [nltk.sent_tokenize(text) for text in qbodys]
# tokenize the question bodies
qbodys = [nltk.word_tokenize(text) for text in qbodys]
# POS-tag the question bodies
qpos = [nltk.pos_tag(tokens) for tokens in qbodys]
In [17]:
tags = pd.read_csv('./Tags.csv',encoding='latin1')
tags.head()
Out[17]:
In [18]:
tags = tags[tags.Tag.notnull()]
#tags.to_csv('tags_final.csv',index=False)
In [19]:
#tags.groupby('Id').count()
fig, ax = plt.subplots()
sns.distplot(tags.groupby('Id')['Tag'].count(), ax=ax)
ax.set_xlabel('number of tags')
Out[19]:
In [22]:
fig, ax = plt.subplots()
sns.distplot(questions[questions.Score < 20].Score, kde=False, ax=ax)
ax.set_xlabel('score')
Out[22]:
In [33]:
tagsByquestion = tags.groupby('Id',as_index=False).agg(lambda x: ' '.join(x))
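Since Tags.csv stores one row per (question, tag) pair, the ' '.join aggregation collapses each question's tags into a single space-separated string; a toy illustration with made-up ids:
In [ ]:
demo_tags = pd.DataFrame({'Id': [1, 1, 2], 'Tag': ['python', 'pandas', 'numpy']})
# Id 1 -> 'python pandas', Id 2 -> 'numpy'
demo_tags.groupby('Id', as_index=False).agg(lambda x: ' '.join(x))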
In [34]:
tagsByquestion.head()
Out[34]:
In [35]:
dfFinal = dfFinal.merge(tagsByquestion,on='Id',how='left')
In [36]:
dfFinal.head()
Out[36]:
In [37]:
tagNum = tags.groupby('Id')['Tag'].count()
tagNum = pd.DataFrame(tagNum).reset_index(level=0)
# rename only after converting to a DataFrame; setting .columns on a Series is a silent no-op
tagNum.columns = ['Id', 'tagNum']
In [38]:
dfFinal = dfFinal.merge(tagNum, on='Id',how='left')
In [39]:
dfFinal.head()
Out[39]:
In [40]:
dfFinal.columns
Out[40]:
In [79]:
dfFinal.loc[dfFinal.Score<0,'label'] = 'Bad'
dfFinal.loc[dfFinal.Score>=5,'label'] = 'Good'
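Accuracy alone is misleading if the two labels are imbalanced, so it is worth checking the split before modeling:
In [ ]:
# share of 'Good' vs 'Bad' questions
dfFinal.label.value_counts(normalize=True)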
In [80]:
dfFinal.head()
Out[80]:
In [81]:
dfFinal.columns
Out[81]:
In [82]:
dftest = dfFinal.drop(['OwnerUserId','CreationDate','Score','CodeBody'],\
axis=1)
In [83]:
Y = dftest.label
X = dftest.drop(['label','Id'], axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.30)
In [84]:
'''Baseline model accuracy: always predict the majority class 'Good'.
We need to beat this accuracy while maximizing recall on the 'Bad' label.'''
def dummyGuess(x):
return pd.Series(['Good'] * len(x))
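sklearn's DummyClassifier expresses the same majority-class baseline without a hand-rolled function; an equivalent sketch:
In [ ]:
from sklearn.dummy import DummyClassifier
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, Y_train)
# should match the accuracy computed in the next cell if 'Good' is the majority class
dummy.score(X_test, Y_test)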
In [85]:
accuracy_score(Y_test, dummyGuess(Y_test))
Out[85]:
In [101]:
'''Using the nltk tokenizer significantly slows down fitting but slightly
improves accuracy. I trained the model with the default tokenizer first and
switched to the nltk tokenizer later on.'''
pipeline = Pipeline([
    ('features', FeatureUnion(
        transformer_list=[
            ('title', Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stopset, min_df=0.03, max_df=0.7,
                                          tokenizer=nltk.word_tokenize)),
            ])),
            ('question', Pipeline([
                ('count', CountVectorizer(stop_words=stopset, min_df=0.01, max_df=0.8,
                                          tokenizer=nltk.word_tokenize)),
            ])),
        ],
        # the weights were tuned separately;
        # I also kept them fairly close to equal.
        transformer_weights={
            'title': 0.6,
            'question': 0.4,
        }
    )),
    ('scaler', Normalizer()),
    ('estimators', SGDClassifier(alpha=0.001, loss='modified_huber', penalty='l2')),
])
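One caveat: as written, each FeatureUnion branch receives the full DataFrame, while the vectorizers expect a 1-D sequence of strings. The BaseEstimator/TransformerMixin imports suggest a column-selector transformer was defined in a cell not shown here; a minimal sketch of that missing piece (the class name and wiring are assumptions):
In [ ]:
class ItemSelector(BaseEstimator, TransformerMixin):
    '''Pick a single text column out of a DataFrame for a downstream vectorizer.'''
    def __init__(self, key):
        self.key = key
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.key]

# hypothetical wiring for each branch, e.g.
# ('title', Pipeline([('select', ItemSelector('Title')),
#                     ('tfidf', TfidfVectorizer(...))]))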
In [98]:
pipeline.fit(X_train, Y_train)
y = pipeline.predict(X_test)
print(accuracy_score(Y_test, y))
print(classification_report(Y_test, y))
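The comment inside the pipeline says the transformer weights were tuned separately; one way to do that is a small grid search over weight pairs (a sketch, reusing the step names above):
In [ ]:
param_grid = {'features__transformer_weights': [{'title': w, 'question': 1 - w}
                                                for w in (0.4, 0.5, 0.6)]}
grid = GridSearchCV(pipeline, param_grid, scoring='accuracy', cv=3)
# grid.fit(X_train, Y_train); grid.best_params_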
In [100]:
# classes_ are sorted alphabetically, so column 0 holds the probability of 'Bad';
# lowering the 'Bad' threshold to 0.35 trades some accuracy for recall on 'Bad'
probs = pipeline.predict_proba(X_test)
predict = ['Bad' if pair[0] >= 0.35 else 'Good' for pair in probs]
print(classification_report(Y_test, predict))
print(accuracy_score(Y_test, predict))
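The 0.35 cutoff was picked by hand; the imported roc_curve/auc make the recall/false-positive trade-off explicit across every cutoff (a sketch, treating 'Bad' as the positive class):
In [ ]:
fpr, tpr, thresholds = roc_curve((Y_test == 'Bad').astype(int),
                                 [pair[0] for pair in probs])
plt.plot(fpr, tpr)
plt.xlabel('false positive rate')
plt.ylabel('true positive rate (recall on Bad)')
auc(fpr, tpr)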