In [1]:
from __future__ import division
import re
from pandas import read_csv
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import BinaryRelevance
In [2]:
# the original file has been divided into 100 pieces
# aa is but one piece
df = read_csv("../data/pieces/aa",
              names=['id', 'title', 'body', 'tags'],
              header=None)
In [3]:
df.head()
Out[3]:
In [4]:
df["id"]=df["id"].apply(lambda str: str.strip().lstrip('"').rstrip('"'))
df.head()
Out[4]:
In [5]:
# strip HTML tags (replace them with spaces) and lowercase the text
pat = re.compile('<[^>]+>')
def preprocessor(s):
    return pat.sub(' ', s).lower()

# the tags column is whitespace-separated, so a plain split is enough
def tokenizer(s):
    return s.split()
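# quick sanity check on a toy snippet (not from the dataset): HTML tags should
# become spaces and everything should be lowercased; tags should split cleanly
print(preprocessor('<p>Hello <b>World</b>!</p>'))
print(tokenizer('c# .net visual-studio'))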
In [7]:
# I've doubled the weight of the title as it should be more important than the body
text_data = df["title"] + ' ' + df["title"] + ' ' + df["body"]
vectorizerX = TfidfVectorizer(preprocessor=preprocessor, max_features=1000)
X = vectorizerX.fit_transform(text_data.values)
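# X is a sparse document-term matrix: one row per post, one column per term
# (capped at 1000 by max_features); peek at the shape and a few vocabulary terms
print(X.shape)
print(sorted(vectorizerX.vocabulary_)[:10])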
In [8]:
# now extract features for labels
# regular features, no tfidf this time
# maybe what I want is a CountVectorizer?
tag_data = df["tags"]
# the split-based tokenizer already keeps 1-letter tags, so there's no need to
# override token_pattern (it is ignored when a custom tokenizer is supplied)
vectorizerY = CountVectorizer(tokenizer=tokenizer, max_features=1000)
Y = vectorizerY.fit_transform(tag_data.values)
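# sanity check, assuming the tags column is whitespace-separated (which is what
# the split-based tokenizer expects): decode the first row of Y back into tag
# names and compare with the raw tags string
inv_vocab = {idx: tag for tag, idx in vectorizerY.vocabulary_.items()}
print([inv_vocab[i] for i in Y[0].nonzero()[1]])
print(df["tags"][0])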
In [9]:
N = text_data.size
N
Out[9]:
In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.80, random_state=42)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape
Out[13]:
In [14]:
clf = BinaryRelevance(GaussianNB())
clf.fit(X_train, Y_train)
# cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
predictions = clf.predict(X_test)
# multi-label targets need an explicit averaging scheme for f1_score
score = f1_score(Y_test, predictions, average='micro')
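# the other imported metrics, for reference: on multi-label data accuracy_score is
# strict subset accuracy (every tag of a post must match), so expect it to be much
# lower than the micro-averaged scores
print(accuracy_score(Y_test, predictions))
print(precision_score(Y_test, predictions, average='micro'))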
In [15]:
score
Out[15]:
In [ ]: