notebook.community

Edit and run



In [1]:

    
import nltk



In [4]:

    
# List the file ids of the texts available in NLTK's Gutenberg corpus.
nltk.corpus.gutenberg.fileids()









    Out[4]:





[u'austen-emma.txt',
 u'austen-persuasion.txt',
 u'austen-sense.txt',
 u'bible-kjv.txt',
 u'blake-poems.txt',
 u'bryant-stories.txt',
 u'burgess-busterbrown.txt',
 u'carroll-alice.txt',
 u'chesterton-ball.txt',
 u'chesterton-brown.txt',
 u'chesterton-thursday.txt',
 u'edgeworth-parents.txt',
 u'melville-moby_dick.txt',
 u'milton-paradise.txt',
 u'shakespeare-caesar.txt',
 u'shakespeare-hamlet.txt',
 u'shakespeare-macbeth.txt',
 u'whitman-leaves.txt']



In [8]:

    
# Load the raw text of "austen-emma.txt" and print its length.
# emma_raw is sliced by later cells, so it must be loaded before they run.
emma_raw = nltk.corpus.gutenberg.raw("austen-emma.txt")
print(len(emma_raw))



In [10]:

    
# Load the same text split into sentences; as the displayed value shows,
# each sentence is itself a list of word tokens.
emma_sents = nltk.corpus.gutenberg.sents("austen-emma.txt")
emma_sents









    Out[10]:





[[u'[', u'Emma', u'by', u'Jane', u'Austen', u'1816', u']'], [u'VOLUME', u'I'], ...]



In [12]:

    
# Load the text split into paragraphs and show how many there are.
emma_paras = nltk.corpus.gutenberg.paras("austen-emma.txt")
len(emma_paras)









    Out[12]:





2371



In [14]:

    
# Load the text as a flat sequence of word tokens and show the token count.
emma_words = nltk.corpus.gutenberg.words("austen-emma.txt")
len(emma_words)









    Out[14]:





192427



In [15]:

    
from nltk.tokenize import word_tokenize
# Tokenize characters 50..99 of the raw text; punctuation marks come back
# as separate tokens (see the commas in the output).
word_tokenize(emma_raw[50:100])









    Out[15]:





[u'Emma',
 u'Woodhouse',
 u',',
 u'handsome',
 u',',
 u'clever',
 u',',
 u'and',
 u'rich',
 u',',
 u'with',
 u'a']



In [16]:

    
from nltk.tokenize import RegexpTokenizer
# Keep only runs of word characters, which drops the punctuation tokens.
# The pattern is a raw string so that the backslash in \w is passed to the
# regex engine verbatim; in a plain string "\w" is an invalid escape sequence
# that newer Python versions warn about.
t = RegexpTokenizer(r"[\w]+")
t.tokenize(emma_raw[50:100])









    Out[16]:





[u'Emma', u'Woodhouse', u'handsome', u'clever', u'and', u'rich', u'with', u'a']



In [17]:

    
from nltk.tokenize import sent_tokenize
# Split the first 1000 characters into sentences and print the fourth one.
# The [3] index is applied to the sentence list *inside* the print call:
# with the index outside the parentheses the cell only works as a Python 2
# print statement, while on Python 3 it prints the whole list and then
# fails trying to index print()'s return value (None).
print(sent_tokenize(emma_raw[:1000])[3])









    



Sixteen years had Miss Taylor been in Mr. Woodhouse's family,
less as a governess than a friend, very fond of both daughters,
but particularly of Emma.



In [28]:

    
from nltk.stem import PorterStemmer
# Stem a single inflected word with the Porter stemmer.
st = PorterStemmer()
st.stem("drinking")









    Out[28]:





u'drink'



In [38]:

    
from nltk.stem import WordNetLemmatizer
# Lemmatize a few words with the WordNet lemmatizer.
lm = WordNetLemmatizer()
print(lm.lemmatize("eat"))
# "eaten" is lemmatized with an explicit verb part-of-speech hint (pos='v').
print(lm.lemmatize("eaten", pos='v'))
print(lm.lemmatize("runner"))









    



eat
eat
runner



In [39]:

    
from nltk.tag import pos_tag

# Tokenize the first 100 characters of the raw text and attach a
# part-of-speech tag to every token; the result is a list of (token, tag) pairs.
tagged_list = pos_tag(word_tokenize(emma_raw[:100]))
tagged_list









    Out[39]:





[(u'[', 'NNS'),
 (u'Emma', 'NNP'),
 (u'by', 'IN'),
 (u'Jane', 'NNP'),
 (u'Austen', 'NNP'),
 (u'1816', 'CD'),
 (u']', 'NNP'),
 (u'VOLUME', 'NNP'),
 (u'I', 'PRP'),
 (u'CHAPTER', 'VBP'),
 (u'I', 'PRP'),
 (u'Emma', 'NNP'),
 (u'Woodhouse', 'NNP'),
 (u',', ','),
 (u'handsome', 'NN'),
 (u',', ','),
 (u'clever', 'NN'),
 (u',', ','),
 (u'and', 'CC'),
 (u'rich', 'JJ'),
 (u',', ','),
 (u'with', 'IN'),
 (u'a', 'DT')]



In [40]:

    
from konlpy.corpus import kolaw



In [41]:

    
# List the files available in KoNLPy's kolaw corpus.
kolaw.fileids()









    Out[41]:





['constitution.txt']



In [45]:

    
# Read the whole constitution text into `c` (reused by a later cell).
# The file is opened in a `with` block so the handle is closed as soon as the
# text has been read instead of being left open for the life of the kernel.
with kolaw.open('constitution.txt') as constitution_file:
    c = constitution_file.read()
# Preview the first 100 characters.
print(c[:100])









    



대한민국헌법

유구한 역사와 전통에 빛나는 우리 대한국민은 3·1운동으로 건립된 대한민국임시정부의 법통과 불의에 항거한 4·19민주이념을 계승하고, 조국의 민주개혁과 평화적 통일의



In [46]:

    
from konlpy.corpus import kobill
# List the files available in KoNLPy's kobill corpus.
kobill.fileids()









    Out[46]:





['1809894.txt',
 '1809899.txt',
 '1809895.txt',
 '1809892.txt',
 '1809897.txt',
 '1809890.txt',
 '1809898.txt',
 '1809891.txt',
 '1809896.txt',
 '1809893.txt']



In [47]:

    
# Read one bill document into `d`.
# The file is opened in a `with` block so the handle is closed as soon as the
# text has been read instead of being left open for the life of the kernel.
with kobill.open('1809894.txt') as bill_file:
    d = bill_file.read()
# Preview the first 100 characters.
print(d[:100])









    



고등교육법 일부개정법률안

(안상수의원 대표발의 )

 의 안
 번 호

9894

발의연월일 : 2010.  11.  15.

발  의  자 : 안상수․김정훈․원희목 

강석호․



In [48]:

    
# A small nested structure (list -> dict -> list) holding Korean text,
# used to compare how different printing helpers render non-ASCII strings.
x = [
    u"한글",
    {
        u"한글 키": [u"한글 밸류1", u"한글 밸류2"],
    },
]
# Plain print of the container; the recorded output shows the elements as
# escaped code points rather than readable characters.
print(x)









    



[u'\ud55c\uae00', {u'\ud55c\uae00 \ud0a4': [u'\ud55c\uae00 \ubc38\ub9581', u'\ud55c\uae00 \ubc38\ub9582']}]



In [49]:

    
# KoNLPy's pprint renders the same structure with readable Korean characters.
# Note: this import makes the name `pprint` refer to konlpy.utils.pprint
# for the rest of the notebook.
from konlpy.utils import pprint
pprint(x)









    



[한글,
 {한글 키: [한글 밸류1, 한글 밸류2]}]



In [50]:

    
# Bare expression: the notebook displays the repr of the first element.
x[0]









    Out[50]:





u'\ud55c\uae00'



In [53]:

    
# Print the first element itself (the string, not the enclosing container).
print(x[0])



In [54]:

    
# Import only the tagger classes that are actually instantiated here rather
# than a wildcard import, so it is clear where each name comes from and the
# notebook namespace is not filled with unrelated names.
from konlpy.tag import Hannanum, Kkma, Twitter

# One instance of each morphological analyzer for use in later cells.
hannanum = Hannanum()
kkma = Kkma()
# NOTE(review): newer KoNLPy releases appear to rename Twitter to Okt -
# confirm against the installed version before upgrading.
twitter = Twitter()



In [55]:

    
# Extract nouns from the first 65 characters of the constitution text with
# the Kkma analyzer and pretty-print them.
pprint(kkma.nouns(c[:65]))









    



[대한,
 대한민국,
 대한민국헌법,
 민국,
 헌법,
 유구,
 역사,
 전통,
 우리,
 국민,
 3,
 1,
 1운동,
 운동,
 건립,
 대한민국임시정부,
 임시,
 정부,
 법통,
 불의,
 항거]



In [56]:

    
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus of five short documents used for the bag-of-words example.
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    'The last document?',
]

# fit() returns the vectorizer itself, so construction and fitting are chained.
vect = CountVectorizer().fit(corpus)
# Show the vocabulary learned from the corpus.
vect.vocabulary_









    



---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-56-2521dbd5b6af> in <module>()
----> 1 from sklearn.feature_extraction.text import CountVectorizer
      2 corpus=[
      3     'This is the first document.',
      4     'This is the second second document.',
      5     'And the third one.',

ImportError: No module named sklearn.feature_extraction.text



In [57]:

    
# Vectorize one new sentence with the fitted vectorizer and convert the
# result with toarray(). Requires `vect` from the CountVectorizer cell.
vect.transform(['This is the second document.']).toarray()









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-57-4f4ac1a7f482> in <module>()
----> 1 vect.transform(['This is the second document.']).toarray()

NameError: name 'vect' is not defined



In [58]:

    
# Vectorize a sentence whose words do not appear in the corpus literal.
# Requires `vect` from the CountVectorizer cell.
vect.transform(['Something completely new.']).toarray()









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-58-3fd5287ad9df> in <module>()
----> 1 vect.transform(['Something completely new.']).toarray()

NameError: name 'vect' is not defined



In [59]:

    
# Vectorize the whole training corpus, one row per document.
# Requires `vect` and `corpus` from the CountVectorizer cell.
vect.transform(corpus).toarray()









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-59-3d48e116bc9c> in <module>()
----> 1 vect.transform(corpus).toarray()

NameError: name 'vect' is not defined



In [ ]: