In [1]:
from __future__ import division

import re

from pandas import read_csv
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn import cross_validation
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.naive_bayes import GaussianNB

from skmultilearn.problem_transform.br import BinaryRelevance

In [2]:
# The full dump was split into 100 pieces; "aa" is just one of them.
# header=None makes pandas treat the first line as data, so the column
# names are supplied explicitly.
df = read_csv(
    "../data/pieces/aa",
    header=None,
    names=['id', 'title', 'body', 'tags'],
)

In [3]:
# Preview the first rows of the loaded piece (the output shows that some ids
# arrive wrapped in literal double quotes).
df.head()


Out[3]:
id title body tags
0 1 How to check if an uploaded file is an image w... <p>I'd like to check if an uploaded file is an... php image-processing file-upload upload mime-t...
1 "2" How can I prevent firefox from closing when I ... <p>In my favorite editor (vim), I regularly us... firefox
2 "3" R Error Invalid type (list) for variable <p>I am import matlab file and construct a dat... r matlab machine-learning
3 "4" How do I replace special characters in a URL? <p>This is probably very simple, but I simply ... c# url encoding
4 "5" How to modify whois contact details? <pre><code>function modify(.......) { $mcont... php api file-get-contents

In [4]:
# Normalise the id column: trim surrounding whitespace, then remove the
# literal double quotes wrapped around the value. strip('"') removes them
# from both ends in one call, and the lambda parameter no longer shadows the
# builtin `str`. Re-running this cell is harmless: already-clean ids are
# left unchanged.
df["id"] = df["id"].apply(lambda raw_id: raw_id.strip().strip('"'))
df.head()


Out[4]:
id title body tags
0 1 How to check if an uploaded file is an image w... <p>I'd like to check if an uploaded file is an... php image-processing file-upload upload mime-t...
1 2 How can I prevent firefox from closing when I ... <p>In my favorite editor (vim), I regularly us... firefox
2 3 R Error Invalid type (list) for variable <p>I am import matlab file and construct a dat... r matlab machine-learning
3 4 How do I replace special characters in a URL? <p>This is probably very simple, but I simply ... c# url encoding
4 5 How to modify whois contact details? <pre><code>function modify(.......) { $mcont... php api file-get-contents

In [5]:
# Matches a single HTML/XML tag: '<', one or more non-'>' characters, '>'.
pat = re.compile('<[^>]+>')


def preprocessor(s):
    """Return ``s`` with every HTML tag replaced by one space, lower-cased.

    Each tag matched by ``pat`` becomes a single space (so words on either
    side of a tag stay separated); the remaining text is then lower-cased.
    """
    without_tags = pat.sub(' ', s)
    return without_tags.lower()

def tokenizer(s):
    """Split ``s`` on runs of whitespace and return the list of pieces.

    No other characters are treated as separators, so tokens such as
    ``c#`` or ``file-upload`` are kept intact.
    """
    tokens = s.split()
    return tokens

In [7]:
# The title is concatenated twice so that its terms weigh double compared
# to the body's terms in the combined document.
text_data = (
    df["title"] + ' '
    + df["title"] + ' '
    + df["body"]
)

# TF-IDF features over the combined text, vocabulary capped at 1000 terms.
# `preprocessor` replaces HTML tags with spaces and lower-cases the text
# before tokenisation.
vectorizerX = TfidfVectorizer(
    max_features=1000,
    preprocessor=preprocessor,
)
X = vectorizerX.fit_transform(text_data.values)

In [8]:
# Build the label matrix from the tags column: plain term counts (no tf-idf
# weighting), vocabulary capped at 1000 tags.
tag_data = df["tags"]

# Tags are whitespace-separated and may contain characters such as '#' or
# '-' (e.g. "c#", "file-upload"), so they are split with the custom
# whitespace `tokenizer` rather than a word regex.
# No `token_pattern` is passed: scikit-learn ignores it whenever a tokenizer
# is supplied, and the earlier literal '(?u)\b\w+\b' was not a raw string,
# so each `\b` was a backspace character instead of a word boundary.
vectorizerY = CountVectorizer(tokenizer=tokenizer, max_features=1000)
Y = vectorizerY.fit_transform(tag_data.values)

In [9]:
# Number of documents (elements in the combined title+body series).
N = text_data.size
N


Out[9]:
60308

In [13]:
# Hold out 80% of the rows for testing (i.e. train on the remaining 20%);
# the fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    X,
    Y,
    test_size=0.80,
    random_state=42,
)

# Show the shapes of the four resulting matrices.
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))


Out[13]:
((12061, 1000), (48247, 1000), (12061, 1000), (48247, 1000))

In [14]:
# Binary-relevance wrapper around a GaussianNB base classifier, fitted on
# the training split.
clf = BinaryRelevance(GaussianNB())
clf.fit(X_train, Y_train)

predictions = clf.predict(X_test)

# Score against Y_test from the train/test split; the lowercase name
# `y_test` is never defined in this notebook and only resolved through
# leftover kernel state. `average` is passed explicitly because relying on
# the implicit default is deprecated for multilabel targets; 'weighted' is
# the averaging that the implicit default applied.
score = f1_score(Y_test, predictions, average='weighted')


/home/felipe/auto-tagger/venv2/local/lib/python2.7/site-packages/sklearn/metrics/classification.py:756: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
/home/felipe/auto-tagger/venv2/local/lib/python2.7/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [15]:
# Display the F1 score computed in the previous cell.
score


Out[15]:
0.076051253563532928

In [ ]: