In [1]:
from __future__ import division
import re
from pandas import read_csv
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import BinaryRelevance
In [2]:
# the original file has been divided into 100 pieces
# aa is but one piece
df = read_csv("../data/pieces/aa",
              names=['id', 'title', 'body', 'tags'],
              header=None)
In [3]:
df.head()
Out[3]:
In [4]:
df["id"]=df["id"].apply(lambda str: str.strip().lstrip('"').rstrip('"'))
df.head()
Out[4]:
In [5]:
# strip HTML tags (replace them with spaces) and lowercase the text
pat = re.compile('<[^>]+>')
def preprocessor(s):
    return pat.sub(' ', s).lower()

# the tags column is whitespace-separated, so a plain split is enough
def tokenizer(s):
    return s.split()
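# quick sanity check on a toy snippet (not from the dataset): HTML tags should
# become spaces and everything should be lowercased; tags should split cleanly
print(preprocessor('<p>Hello <b>World</b>!</p>'))
print(tokenizer('c# .net visual-studio'))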
In [7]:
# I've doubled the weight of the title as it should be more important than the body
text_data = df["title"] + ' ' + df["title"] + ' ' + df["body"]
vectorizerX = TfidfVectorizer(preprocessor=preprocessor, max_features=1000)
X = vectorizerX.fit_transform(text_data.values)
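# X is a sparse document-term matrix: one row per post, one column per term
# (capped at 1000 by max_features); peek at the shape and a few vocabulary terms
print(X.shape)
print(sorted(vectorizerX.vocabulary_)[:10])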
In [8]:
# now extract features for labels
# regular features, no tfidf this time
# maybe what I want is a CountVectorizer?
tag_data = df["tags"]
# the split-based tokenizer already keeps 1-letter tags, so there's no need to
# override token_pattern (it is ignored when a custom tokenizer is supplied)
vectorizerY = CountVectorizer(tokenizer=tokenizer, max_features=1000)
Y = vectorizerY.fit_transform(tag_data.values)
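# sanity check, assuming the tags column is whitespace-separated (which is what
# the split-based tokenizer expects): decode the first row of Y back into tag
# names and compare with the raw tags string
inv_vocab = {idx: tag for tag, idx in vectorizerY.vocabulary_.items()}
print([inv_vocab[i] for i in Y[0].nonzero()[1]])
print(df["tags"][0])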
In [9]:
N = text_data.size
N
Out[9]:
In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.80, random_state=42)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape
Out[13]:
In [14]:
clf = BinaryRelevance(GaussianNB())
clf.fit(X_train, Y_train)
# cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
predictions = clf.predict(X_test)
# multi-label targets need an explicit averaging scheme for f1_score
score = f1_score(Y_test, predictions, average='micro')
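# the other imported metrics, for reference: on multi-label data accuracy_score is
# strict subset accuracy (every tag of a post must match), so expect it to be much
# lower than the micro-averaged scores
print(accuracy_score(Y_test, predictions))
print(precision_score(Y_test, predictions, average='micro'))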
In [15]:
score
Out[15]:
In [ ]: