In [1]:
import logging
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.metrics import f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron

logging.basicConfig()

print 'start download newsgroup'

#categories = ['rec.sport.hockey', 'rec.sport.baseball', 'rec.autos']
categories = ['rec.sport.hockey']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'))

print 'download newsgroup done'

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test = vectorizer.transform(newsgroups_test.data)

print 'transform data done, start training'

classifier = Perceptron(n_iter=100, eta0=0.1)
classifier.fit_transform(X_train, newsgroups_train.target)
predictions = classifier.predict(X_test)

print classification_report(newsgroups_test.target, predictions)


/home/moonbury/.local/lib/python2.7/site-packages/sklearn/metrics/metrics.py:4: DeprecationWarning: sklearn.metrics.metrics is deprecated and will be removed in 0.18. Please import from sklearn.metrics
  DeprecationWarning)
start download newsgroup
---------------------------------------------------------------------------
URLError                                  Traceback (most recent call last)
<ipython-input-1-b255f920c33b> in <module>()
     11 #categories = ['rec.sport.hockey', 'rec.sport.baseball', 'rec.autos']
     12 categories = ['rec.sport.hockey']
---> 13 newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
     14 newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'))
     15 

/home/moonbury/.local/lib/python2.7/site-packages/sklearn/datasets/twenty_newsgroups.pyc in fetch_20newsgroups(data_home, subset, categories, shuffle, random_state, remove, download_if_missing)
    221         if download_if_missing:
    222             cache = download_20newsgroups(target_dir=twenty_home,
--> 223                                           cache_path=cache_path)
    224         else:
    225             raise IOError('20Newsgroups dataset not found')

/home/moonbury/.local/lib/python2.7/site-packages/sklearn/datasets/twenty_newsgroups.pyc in download_20newsgroups(target_dir, cache_path)
     89 
     90     logger.warning("Downloading dataset from %s (14 MB)", URL)
---> 91     opener = urlopen(URL)
     92     with open(archive_path, 'wb') as f:
     93         f.write(opener.read())

/usr/lib/python2.7/urllib2.pyc in urlopen(url, data, timeout)
    125     if _opener is None:
    126         _opener = build_opener()
--> 127     return _opener.open(url, data, timeout)
    128 
    129 def install_opener(opener):

/usr/lib/python2.7/urllib2.pyc in open(self, fullurl, data, timeout)
    402             req = meth(req)
    403 
--> 404         response = self._open(req, data)
    405 
    406         # post-process response

/usr/lib/python2.7/urllib2.pyc in _open(self, req, data)
    420         protocol = req.get_type()
    421         result = self._call_chain(self.handle_open, protocol, protocol +
--> 422                                   '_open', req)
    423         if result:
    424             return result

/usr/lib/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
    380             func = getattr(handler, meth_name)
    381 
--> 382             result = func(*args)
    383             if result is not None:
    384                 return result

/usr/lib/python2.7/urllib2.pyc in http_open(self, req)
   1212 
   1213     def http_open(self, req):
-> 1214         return self.do_open(httplib.HTTPConnection, req)
   1215 
   1216     http_request = AbstractHTTPHandler.do_request_

/usr/lib/python2.7/urllib2.pyc in do_open(self, http_class, req)
   1182         except socket.error, err: # XXX what error?
   1183             h.close()
-> 1184             raise URLError(err)
   1185         else:
   1186             try:

URLError: <urlopen error [Errno 111] Connection refused>

In [ ]: