In [1]:
import nltk
In [4]:
nltk.corpus.gutenberg.fileids()
Out[4]:
In [8]:
emma_raw = nltk.corpus.gutenberg.raw("austen-emma.txt")
print(len(emma_raw))
In [10]:
emma_sents = nltk.corpus.gutenberg.sents("austen-emma.txt")
emma_sents
Out[10]:
In [12]:
emma_paras = nltk.corpus.gutenberg.paras("austen-emma.txt")
len(emma_paras)
Out[12]:
In [14]:
emma_words = nltk.corpus.gutenberg.words("austen-emma.txt")
len(emma_words)
Out[14]:
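The three accessors are views of the same corpus at different granularities: raw() returns one string, sents() a list of token lists, and words() a flat token list. A quick check (a sketch, assuming the variables loaded above):
In [ ]:
# emma_sents is a list of sentences, each a list of word tokens;
# emma_words is the same tokens flattened into a single list.
print(emma_sents[0])
print(len(emma_sents))
print(emma_words[:10])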
In [15]:
from nltk.tokenize import word_tokenize
word_tokenize(emma_raw[50:100])
Out[15]:
In [16]:
from nltk.tokenize import RegexpTokenizer
t = RegexpTokenizer(r"[\w]+")  # raw string avoids the invalid-escape warning
t.tokenize(emma_raw[50:100])
Out[16]:
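word_tokenize keeps punctuation as separate tokens, while the [\w]+ pattern drops it entirely. A small comparison (a sketch; the sample sentence is made up for illustration):
In [ ]:
sample = "Mr. Knightley, don't you think so?"
print(word_tokenize(sample))   # punctuation kept as separate tokens
print(t.tokenize(sample))      # punctuation dropped by the [\w]+ pattern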
In [17]:
from nltk.tokenize import sent_tokenize
print(sent_tokenize(emma_raw[:1000])[3])
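sent_tokenize returns a plain list of sentence strings, so it can be counted or indexed like any list (a quick check on the same slice):
In [ ]:
sentences = sent_tokenize(emma_raw[:1000])
print(len(sentences))
print(sentences[0])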
In [28]:
from nltk.stem import PorterStemmer
st = PorterStemmer()
st.stem("drinking")
Out[28]:
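Porter stemming chops suffixes heuristically, so the result is not always a dictionary word. A comparison with NLTK's LancasterStemmer on a few forms (a sketch; the word list is chosen here for illustration):
In [ ]:
from nltk.stem import LancasterStemmer
st2 = LancasterStemmer()
words = ["flying", "flies", "happiness", "organization"]
print([st.stem(w) for w in words])    # Porter stems
print([st2.stem(w) for w in words])   # Lancaster is generally more aggressive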
In [38]:
from nltk.stem import WordNetLemmatizer
lm = WordNetLemmatizer()
print(lm.lemmatize("eat"))
print(lm.lemmatize("eaten", pos='v'))
print(lm.lemmatize("runner"))
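Unlike the stemmer, WordNetLemmatizer returns a real dictionary form, but only if the right part of speech is supplied (it defaults to noun). A side-by-side with the stemmer defined above (a sketch):
In [ ]:
for w in ["lives", "crying", "ate"]:
    # word, Porter stem, verb lemma
    print(w, st.stem(w), lm.lemmatize(w, pos='v'))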
In [39]:
from nltk.tag import pos_tag
tagged_list = pos_tag(word_tokenize(emma_raw[:100]))
tagged_list
Out[39]:
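The (word, tag) pairs use the Penn Treebank tag set, so nouns can be pulled out by checking for tags that start with "NN". A small filter over the tagged list from the cell above (a sketch):
In [ ]:
# nltk.help.upenn_tagset("NN") prints a description of the tag
nouns = [word for word, tag in tagged_list if tag.startswith("NN")]
print(nouns)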
In [40]:
from konlpy.corpus import kolaw
In [41]:
kolaw.fileids()
Out[41]:
In [45]:
c = kolaw.open('constitution.txt').read()
print(c[:100])
In [46]:
from konlpy.corpus import kobill
kobill.fileids()
Out[46]:
In [47]:
d = kobill.open('1809894.txt').read()
print(d[:100])
In [48]:
x = [u"한글", {u"한글 키" : [u"한글 밸류1", u"한글 밸류2"]}]
print(x)
In [49]:
from konlpy.utils import pprint
pprint(x)
In [50]:
x[0]
Out[50]:
In [53]:
print(x[0])
In [54]:
from konlpy.tag import Hannanum, Kkma, Twitter
hannanum = Hannanum()
kkma = Kkma()
twitter = Twitter()  # renamed to Okt() in recent KoNLPy releases
In [55]:
pprint(kkma.nouns(c[:65]))
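Besides nouns(), the KoNLPy taggers expose morphs() for a flat morpheme list and pos() for (morpheme, tag) pairs; note that the tag sets differ between engines. A sketch on the same slice of the constitution text:
In [ ]:
pprint(kkma.morphs(c[:65]))      # morpheme tokens
pprint(kkma.pos(c[:65]))         # (morpheme, POS tag) pairs
pprint(hannanum.nouns(c[:65]))   # same text, different engine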
In [56]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    'The last document?',
]
vect = CountVectorizer()
vect.fit(corpus)
vect.vocabulary_
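vocabulary_ maps each term to its column index in the document-term matrix. The terms in column order can also be read off the vectorizer itself (get_feature_names_out in scikit-learn 1.0+, get_feature_names in older releases). A sketch:
In [ ]:
# Terms in column order; use vect.get_feature_names() on older scikit-learn.
print(vect.get_feature_names_out())
print(sorted(vect.vocabulary_, key=vect.vocabulary_.get))  # same ordering, built by hand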
In [57]:
vect.transform(['This is the second document.']).toarray()
In [58]:
vect.transform(['Something completely new.']).toarray()
In [59]:
vect.transform(corpus).toarray()
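The document-term matrix is easier to read with the terms as column labels; a sketch using pandas (assumed to be available):
In [ ]:
import pandas as pd
dtm = vect.transform(corpus).toarray()
pd.DataFrame(dtm, columns=vect.get_feature_names_out())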
In [ ]: