实时分词化

对文本进行分词化,可按以下级别进行:

  • 段落级别
  • 语句级别
  • 词级别

In [1]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from collections import defaultdict

In [5]:
# Sentence-level tokenization demo: split one paragraph into sentences.
sentence = "now please allow me to introduce myself to you. my name is wangjia and imajored in traffic engineering. baoji is my hometown it is verybeautiful. and the people are very friendly."

# sent_tokenize detects sentence boundaries (here, the periods).
sent_list = sent_tokenize(sentence)
# print() call syntax works identically on Python 2 and Python 3
# (the original Python 2 `print x` statement is a SyntaxError on Py3).
print('no sentence = %d' % (len(sent_list)))
print('sentences')
for sent in sent_list:
    print(sent)


no sentence = 4
sentences
now please allow me to introduce myself to you.
my name is wangjia and imajored in traffic engineering.
baoji is my hometown it is verybeautiful.
and the people are very friendly.

In [7]:
# After splitting into sentences, extract the words of each one.
# word_dict maps sentence index -> list of word tokens for that sentence.
word_dict = defaultdict(list)
for i, sent in enumerate(sent_list):
    word_dict[i].extend(word_tokenize(sent))

# print() works on both Python 2 and Python 3 (original used a Py2-only
# print statement).
print(word_dict)


defaultdict(<type 'list'>, {0: ['now', 'please', 'allow', 'me', 'to', 'introduce', 'myself', 'to', 'you', '.'], 1: ['my', 'name', 'is', 'wangjia', 'and', 'imajored', 'in', 'traffic', 'engineering', '.'], 2: ['baoji', 'is', 'my', 'hometown', 'it', 'is', 'verybeautiful', '.'], 3: ['and', 'the', 'people', 'are', 'very', 'friendly', '.']})

删除停用词


In [9]:
# Show NLTK's built-in English stop-word list; as the last expression in
# the cell it is displayed as Out[9] below.
from nltk.corpus import stopwords
stopwords.words('english')


Out[9]:
[u'i',
 u'me',
 u'my',
 u'myself',
 u'we',
 u'our',
 u'ours',
 u'ourselves',
 u'you',
 u'your',
 u'yours',
 u'yourself',
 u'yourselves',
 u'he',
 u'him',
 u'his',
 u'himself',
 u'she',
 u'her',
 u'hers',
 u'herself',
 u'it',
 u'its',
 u'itself',
 u'they',
 u'them',
 u'their',
 u'theirs',
 u'themselves',
 u'what',
 u'which',
 u'who',
 u'whom',
 u'this',
 u'that',
 u'these',
 u'those',
 u'am',
 u'is',
 u'are',
 u'was',
 u'were',
 u'be',
 u'been',
 u'being',
 u'have',
 u'has',
 u'had',
 u'having',
 u'do',
 u'does',
 u'did',
 u'doing',
 u'a',
 u'an',
 u'the',
 u'and',
 u'but',
 u'if',
 u'or',
 u'because',
 u'as',
 u'until',
 u'while',
 u'of',
 u'at',
 u'by',
 u'for',
 u'with',
 u'about',
 u'against',
 u'between',
 u'into',
 u'through',
 u'during',
 u'before',
 u'after',
 u'above',
 u'below',
 u'to',
 u'from',
 u'up',
 u'down',
 u'in',
 u'out',
 u'on',
 u'off',
 u'over',
 u'under',
 u'again',
 u'further',
 u'then',
 u'once',
 u'here',
 u'there',
 u'when',
 u'where',
 u'why',
 u'how',
 u'all',
 u'any',
 u'both',
 u'each',
 u'few',
 u'more',
 u'most',
 u'other',
 u'some',
 u'such',
 u'no',
 u'nor',
 u'not',
 u'only',
 u'own',
 u'same',
 u'so',
 u'than',
 u'too',
 u'very',
 u's',
 u't',
 u'can',
 u'will',
 u'just',
 u'don',
 u'should',
 u'now',
 u'd',
 u'll',
 u'm',
 u'o',
 u're',
 u've',
 u'y',
 u'ain',
 u'aren',
 u'couldn',
 u'didn',
 u'doesn',
 u'hadn',
 u'hasn',
 u'haven',
 u'isn',
 u'ma',
 u'mightn',
 u'mustn',
 u'needn',
 u'shan',
 u'shouldn',
 u'wasn',
 u'weren',
 u'won',
 u'wouldn']

In [12]:
import string
# Word-level tokenization of the full paragraph (punctuation tokens included).
sentence = "now please allow me to introduce myself to you. my name is wangjia and imajored in traffic engineering. baoji is my hometown it is verybeautiful. and the people are very friendly."
words = word_tokenize(sentence)
# print() call syntax is valid on both Python 2 and Python 3.
print(words)
print(len(words))


['now', 'please', 'allow', 'me', 'to', 'introduce', 'myself', 'to', 'you', '.', 'my', 'name', 'is', 'wangjia', 'and', 'imajored', 'in', 'traffic', 'engineering', '.', 'baoji', 'is', 'my', 'hometown', 'it', 'is', 'verybeautiful', '.', 'and', 'the', 'people', 'are', 'very', 'friendly', '.']
35

In [13]:
# Fetch the English stop-word list once; a set gives O(1) membership
# tests instead of O(n) scans of the list for every word.
stop_words = set(stopwords.words('english'))
# Filter the stop words out of the token list.
words = [w for w in words if w not in stop_words]
print(len(words))


17

In [14]:
# Drop punctuation tokens ('.', ',', ...); string.punctuation holds all
# ASCII punctuation characters, and `in` on a str tests substring/char
# membership.
words = [w for w in words if w not in string.punctuation]
print(len(words))


13