In [1]:
import nltk

In [4]:
# List the file identifiers of the Gutenberg corpus exposed through NLTK.
nltk.corpus.gutenberg.fileids()


Out[4]:
[u'austen-emma.txt',
 u'austen-persuasion.txt',
 u'austen-sense.txt',
 u'bible-kjv.txt',
 u'blake-poems.txt',
 u'bryant-stories.txt',
 u'burgess-busterbrown.txt',
 u'carroll-alice.txt',
 u'chesterton-ball.txt',
 u'chesterton-brown.txt',
 u'chesterton-thursday.txt',
 u'edgeworth-parents.txt',
 u'melville-moby_dick.txt',
 u'milton-paradise.txt',
 u'shakespeare-caesar.txt',
 u'shakespeare-hamlet.txt',
 u'shakespeare-macbeth.txt',
 u'whitman-leaves.txt']

In [8]:
# Pull the whole of "Emma" into memory as raw text, then report its length.
emma_raw = nltk.corpus.gutenberg.raw('austen-emma.txt')
emma_char_total = len(emma_raw)
print(emma_char_total)


887071

In [10]:
# Sentence-level view of the same novel: each sentence is a list of tokens.
emma_sents = nltk.corpus.gutenberg.sents('austen-emma.txt')
emma_sents


Out[10]:
[[u'[', u'Emma', u'by', u'Jane', u'Austen', u'1816', u']'], [u'VOLUME', u'I'], ...]

In [12]:
# Paragraph-level view of the novel; the cell displays how many paragraphs there are.
emma_paras = nltk.corpus.gutenberg.paras('austen-emma.txt')
len(emma_paras)


Out[12]:
2371

In [14]:
# Word-level view of the novel; the cell displays the total token count.
emma_words = nltk.corpus.gutenberg.words('austen-emma.txt')
len(emma_words)


Out[14]:
192427

In [15]:
from nltk.tokenize import word_tokenize

# Tokenize a 50-character excerpt of the raw text; punctuation marks come
# back as tokens of their own.
opening_excerpt = emma_raw[50:100]
word_tokenize(opening_excerpt)


Out[15]:
[u'Emma',
 u'Woodhouse',
 u',',
 u'handsome',
 u',',
 u'clever',
 u',',
 u'and',
 u'rich',
 u',',
 u'with',
 u'a']

In [16]:
from nltk.tokenize import RegexpTokenizer

# Tokenizer that keeps only runs of word characters, so punctuation is dropped.
# The pattern is a raw string: "\w" is not a valid Python string escape, and a
# non-raw literal triggers an invalid-escape warning on modern interpreters.
# The pattern value itself is unchanged.
t = RegexpTokenizer(r"[\w]+")
t.tokenize(emma_raw[50:100])


Out[16]:
[u'Emma', u'Woodhouse', u'handsome', u'clever', u'and', u'rich', u'with', u'a']

In [17]:
from nltk.tokenize import sent_tokenize

# Split the first 1000 characters into sentences and print the fourth one.
# The index sits inside the print call: `print(...)[3]` only works as a
# Python 2 print statement; under Python 3 it would print the whole list and
# then fail trying to index the None returned by print().
opening_sentences = sent_tokenize(emma_raw[:1000])
print(opening_sentences[3])


Sixteen years had Miss Taylor been in Mr. Woodhouse's family,
less as a governess than a friend, very fond of both daughters,
but particularly of Emma.

In [28]:
from nltk.stem import PorterStemmer

# Porter stemming strips suffixes: the cell displays the stem of "drinking".
st = PorterStemmer()
st.stem('drinking')


Out[28]:
u'drink'

In [38]:
from nltk.stem import WordNetLemmatizer

lm = WordNetLemmatizer()

# Each entry is (word, extra keyword arguments for lemmatize). Only "eaten"
# is given an explicit part of speech (verb); the others use the default.
for word, options in (("eat", {}), ("eaten", {"pos": 'v'}), ("runner", {})):
    print(lm.lemmatize(word, **options))


eat
eat
runner

In [39]:
from nltk.tag import pos_tag

# Tokenize the first 100 characters of the novel, then attach a
# part-of-speech tag to every token. Relies on word_tokenize and emma_raw
# from earlier cells.
opening_tokens = word_tokenize(emma_raw[:100])
tagged_list = pos_tag(opening_tokens)
tagged_list


Out[39]:
[(u'[', 'NNS'),
 (u'Emma', 'NNP'),
 (u'by', 'IN'),
 (u'Jane', 'NNP'),
 (u'Austen', 'NNP'),
 (u'1816', 'CD'),
 (u']', 'NNP'),
 (u'VOLUME', 'NNP'),
 (u'I', 'PRP'),
 (u'CHAPTER', 'VBP'),
 (u'I', 'PRP'),
 (u'Emma', 'NNP'),
 (u'Woodhouse', 'NNP'),
 (u',', ','),
 (u'handsome', 'NN'),
 (u',', ','),
 (u'clever', 'NN'),
 (u',', ','),
 (u'and', 'CC'),
 (u'rich', 'JJ'),
 (u',', ','),
 (u'with', 'IN'),
 (u'a', 'DT')]

In [40]:
from konlpy.corpus import kolaw

In [41]:
# List the files available in the kolaw corpus.
kolaw.fileids()


Out[41]:
['constitution.txt']

In [45]:
# Read the full constitution text. The context manager closes the file handle
# as soon as the contents are in memory instead of leaving it open until the
# object happens to be garbage collected.
with kolaw.open('constitution.txt') as constitution_file:
    c = constitution_file.read()

# Preview the first 100 characters.
print(c[:100])


대한민국헌법

유구한 역사와 전통에 빛나는 우리 대한국민은 3·1운동으로 건립된 대한민국임시정부의 법통과 불의에 항거한 4·19민주이념을 계승하고, 조국의 민주개혁과 평화적 통일의

In [46]:
from konlpy.corpus import kobill
# List the files available in the kobill corpus.
kobill.fileids()


Out[46]:
['1809894.txt',
 '1809899.txt',
 '1809895.txt',
 '1809892.txt',
 '1809897.txt',
 '1809890.txt',
 '1809898.txt',
 '1809891.txt',
 '1809896.txt',
 '1809893.txt']

In [47]:
# Read one bill from the kobill corpus. The context manager closes the file
# handle once the text has been read rather than leaving it open.
with kobill.open('1809894.txt') as bill_file:
    d = bill_file.read()

# Preview the first 100 characters.
print(d[:100])


고등교육법 일부개정법률안

(안상수의원 대표발의 )

 의 안
 번 호

9894

발의연월일 : 2010.  11.  15.

발  의  자 : 안상수․김정훈․원희목 

강석호․

In [48]:
# A small nested structure (list holding a string and a dict) whose contents
# are all Korean unicode text.
x = [
    u"한글",
    {u"한글 키": [u"한글 밸류1", u"한글 밸류2"]},
]

# Printing the container shows the repr of each element, not the text itself.
print(x)


[u'\ud55c\uae00', {u'\ud55c\uae00 \ud0a4': [u'\ud55c\uae00 \ubc38\ub9581', u'\ud55c\uae00 \ubc38\ub9582']}]

In [49]:
from konlpy.utils import pprint
# konlpy's pprint renders the nested structure with readable Hangul rather
# than escaped code points.
pprint(x)


[한글,
 {한글 키: [한글 밸류1, 한글 밸류2]}]

In [50]:
# A bare expression is displayed via its repr, which escapes the non-ASCII
# characters in this session.
x[0]


Out[50]:
u'\ud55c\uae00'

In [53]:
# Printing the string itself (not its container) shows the Hangul as text.
print(x[0])


한글

In [54]:
# Import only the tagger classes this notebook instantiates; a star import
# hides where names come from and can silently shadow existing ones.
from konlpy.tag import Hannanum, Kkma, Twitter

# One instance of each Korean morphological analyzer.
hannanum = Hannanum()
kkma = Kkma()
# NOTE(review): newer KoNLPy releases appear to rename Twitter to Okt --
# confirm against the installed version before switching.
twitter = Twitter()

In [55]:
# Extract nouns with the Kkma analyzer from the first 65 characters of the
# constitution text and pretty-print them.
pprint(kkma.nouns(c[:65]))


[대한,
 대한민국,
 대한민국헌법,
 민국,
 헌법,
 유구,
 역사,
 전통,
 우리,
 국민,
 3,
 1,
 1운동,
 운동,
 건립,
 대한민국임시정부,
 임시,
 정부,
 법통,
 불의,
 항거]

In [56]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: five short documents used to build a bag-of-words vocabulary.
corpus = [
    "This is the first document.",
    "This is the second second document.",
    "And the third one.",
    "Is this the first document?",
    "The last document?",
]

# Fit the vectorizer on the corpus, then display the learned vocabulary.
vect = CountVectorizer()
vect.fit(corpus)
vect.vocabulary_


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-56-2521dbd5b6af> in <module>()
----> 1 from sklearn.feature_extraction.text import CountVectorizer
      2 corpus=[
      3     'This is the first document.',
      4     'This is the second second document.',
      5     'And the third one.',

ImportError: No module named sklearn.feature_extraction.text

In [57]:
# Encode one sentence with the fitted vectorizer and show it as a dense array.
# Depends on `vect` from the CountVectorizer cell; raises NameError if that
# cell did not complete.
vect.transform(['This is the second document.']).toarray()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-57-4f4ac1a7f482> in <module>()
----> 1 vect.transform(['This is the second document.']).toarray()

NameError: name 'vect' is not defined

In [58]:
# Encode a sentence that was not part of the fitted corpus.
# Depends on `vect` from the CountVectorizer cell; raises NameError if that
# cell did not complete.
vect.transform(['Something completely new.']).toarray()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-58-3fd5287ad9df> in <module>()
----> 1 vect.transform(['Something completely new.']).toarray()

NameError: name 'vect' is not defined

In [59]:
# Encode every document of the corpus and show the result as a dense array.
# Depends on `vect` and `corpus` from the CountVectorizer cell; raises
# NameError if that cell did not complete.
vect.transform(corpus).toarray()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-59-3d48e116bc9c> in <module>()
----> 1 vect.transform(corpus).toarray()

NameError: name 'vect' is not defined

In [ ]: