In [2]:
import sys
# Reverse the module search path in place *before* importing gensim, so that
# entries which were last in sys.path are now searched first.
# NOTE(review): presumably a workaround to make a particular gensim install
# win over another copy earlier on the path -- confirm. This is order-sensitive
# and affects every import that follows in this kernel session.
sys.path.reverse()
from gensim import corpora, models, similarities

In [3]:
# Keep only the first four lines of the sample access log.
# The ``with`` block closes the file handle as soon as the lines are read
# (the bare open(...).readlines() call left it open), and iterating the file
# lazily avoids loading the whole log into memory just to keep four lines.
# zip() pulls from range(4) first, so iteration stops without reading a fifth
# line; a log shorter than four lines simply yields fewer entries.
with open('www.mozilla.org.access.sample.log') as log_file:
    documents = [line for _idx, line in zip(range(4), log_file)]

In [4]:
# Show the raw text of the fourth log line that was read (index 3).
documents[3]


Out[4]:
'177.12.141.252 www.mozilla.org - [31/Dec/2013:13:00:13 -0800] "GET /pt-BR/firefox/18.0.2/firstrun/ HTTP/1.1" 200 4005 "-" "Mozilla/5.0 (Windows NT 5.1; rv:18.0) Gecko/20100101 Firefox/18.0" "-"\n'

In [7]:
# Placeholder fields to discard: a lone dash, a lone double quote, a lone
# slash, and a quoted dash.
stoplist = set(['-', '"', '/', '"-"'])


def _keep_tokens(line):
    """Lower-case ``line``, split it on whitespace, and drop stoplist entries."""
    return [piece for piece in line.lower().split() if piece not in stoplist]


# One token list per log line, in the same order as ``documents``.
tokens = [_keep_tokens(entry) for entry in documents]

In [8]:
# Display the per-line token lists for inspection.
tokens


Out[8]:
[['91.205.131.2', '[31/dec/2013:13:00:13', '-0800]', '0'],
 ['41.102.162.229',
  'www.mozilla.org',
  '[31/dec/2013:13:00:13',
  '-0800]',
  '"get',
  'http/1.1"',
  '301',
  '584',
  '"mozilla/5.0',
  '(compatible;',
  'msie',
  '10.0;',
  'windows',
  'nt',
  '6.1;',
  'wow64;',
  'trident/6.0)"'],
 ['41.102.162.229',
  'www.mozilla.org',
  '[31/dec/2013:13:00:13',
  '-0800]',
  '"get',
  '/en-us/',
  'http/1.1"',
  '200',
  '11887',
  '"mozilla/5.0',
  '(compatible;',
  'msie',
  '10.0;',
  'windows',
  'nt',
  '6.1;',
  'wow64;',
  'trident/6.0)"'],
 ['177.12.141.252',
  'www.mozilla.org',
  '[31/dec/2013:13:00:13',
  '-0800]',
  '"get',
  '/pt-br/firefox/18.0.2/firstrun/',
  'http/1.1"',
  '200',
  '4005',
  '"mozilla/5.0',
  '(windows',
  'nt',
  '5.1;',
  'rv:18.0)',
  'gecko/20100101',
  'firefox/18.0"']]

In [ ]: