Exercise 1)


In [1]:
s = 'colorless'
s = s[:4] + 'u' + s[4:]
s


Out[1]:
'colourless'

Exercise 2)


In [2]:
'dishes'[:-2]


Out[2]:
'dish'

In [3]:
'running'[:-4]


Out[3]:
'run'

In [4]:
'nationality'[:-5]


Out[4]:
'nation'

In [5]:
'undo'[2:]


Out[5]:
'do'

In [6]:
'preheat'[3:]


Out[6]:
'heat'

Exercise 3)


In [1]:
'in'[-5]



IndexError                                Traceback (most recent call last)
<ipython-input-1-16dacbda7e0f> in <module>()
----> 1 'in'[-5]

IndexError: string index out of range
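
For reference: valid indices for a string s run from -len(s) to len(s) - 1, so for the two-character string 'in' only -2 and -1 are legal on the negative side. A minimal sketch:

s = 'in'
s[-2]   # 'i' -- the smallest legal negative index
s[-1]   # 'n'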

Exercise 4)


In [8]:
monty = 'Monty Python'
monty[6:11:2]


Out[8]:
'Pto'

In [9]:
monty[10:5:-2]


Out[9]:
'otP'

In [10]:
monty[1:10:-2]


Out[10]:
''

In [2]:
monty[1:6:1.5]



NameError                                 Traceback (most recent call last)
<ipython-input-2-563dffb3b494> in <module>()
----> 1 monty[1:6:1.5]

NameError: name 'monty' is not defined
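
The NameError above is an artifact of a kernel restart between cells; with monty defined, the same slice fails with a TypeError instead, since slice indices must be integers. A quick sketch:

monty = 'Monty Python'
try:
    monty[1:6:1.5]      # raises TypeError: slice indices must be integers
except TypeError as e:
    print e
monty[1:6:2]            # 'ot ' -- an integer step is fine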

Exercise 5)


In [12]:
monty[::-1]


Out[12]:
'nohtyP ytnoM'

Exercise 6)


In [13]:
from __future__ import division
import nltk, re, pprint

In [14]:
# a - one or more letters
nltk.re_show(r'[a-zA-Z]+', monty)


{Monty} {Python}

In [15]:
# b - one capital letter and zero or more lowercase letters
nltk.re_show(r'[A-Z][a-z]*', monty)
nltk.re_show(r'[A-Z][a-z]*', 'A very Interesting3 example')


{Monty} {Python}
{A} very {Interesting}3 example

In [16]:
# c - the letter p, followed by up to two vowels, ending with t
nltk.re_show(r'p[aeiou]{,2}t', 'two pouting party pets - pt')


two {pout}ing party {pet}s - {pt}

In [17]:
# d - integer or decimal number
nltk.re_show(r'\d+(\.\d+)?', 'This should match 23 as well as 1.093 and 999.9')


This should match {23} as well as {1.093} and {999.9}

In [18]:
# e - zero or more sequences of not-a-vowel - vowel - not-a-vowel
nltk.re_show(r'([^aeiou][aeiou][^aeiou])*', 'This should match pet as well as cut and lol')


{}T{his} {}s{}h{}o{}u{}l{}d{} {mat}c{}h{} {pet as} {wel}l{ as} {cut an}d{} {lol}

In [19]:
# f - one or more alphanumeric characters, or one or more characters that are neither alphanumeric nor whitespace
nltk.re_show(r'\w+|[^\w\s]+', 'should match me but not \n')


{should} {match} {me} {but} {not}

Exercise 7)


In [20]:
a = r'^(the|a|an)$'
nltk.re_show(a, 'the something')
nltk.re_show(a, 'the')
nltk.re_show(a, 'an')
nltk.re_show(a, 'anything')


the something
{the}
{an}
anything

In [21]:
b = r'\d+([\+\*]\d+)+'
nltk.re_show(b, 'something+2')
nltk.re_show(b, '2*3+8')
nltk.re_show(b, '200+5000')
nltk.re_show(b, '2*3+8-5/6')


something+2
{2*3+8}
{200+5000}
{2*3+8}-5/6
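
A hypothetical extension of pattern b (not part of the exercise) that also covers subtraction and division:

c = r'\d+([-+*/]\d+)+'
nltk.re_show(c, '2*3+8-5/6')
# {2*3+8-5/6}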

Exercise 8)


In [22]:
from bs4 import BeautifulSoup
import urllib

def getContentFromURL(url):
    raw = urllib.urlopen(url).read()
    soup = BeautifulSoup(raw, 'html.parser')  # explicit parser avoids a bs4 warning
    return soup.get_text()

getContentFromURL('http://www.nltk.org/')


Out[22]:
u'\n\n\n\nNatural Language Toolkit \u2014 NLTK 3.0 documentation\n\n\n\n      var DOCUMENTATION_OPTIONS = {\n        URL_ROOT:    \'./\',\n        VERSION:     \'3.0\',\n        COLLAPSE_INDEX: false,\n        FILE_SUFFIX: \'.html\',\n        HAS_SOURCE:  true\n      };\n    \n\n\n\n\n\n\n\n\n\nNLTK 3.0 documentation\n\nnext |\n          modules |\n          index\n\n\n\n\n\n\n\n\n\n\nNatural Language Toolkit\xb6\nNLTK is a leading platform for building Python programs to work with human language data.\nIt provides easy-to-use interfaces to over 50 corpora and lexical\nresources such as WordNet,\nalong with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning,\nwrappers for industrial-strength NLP libraries,\nand an active discussion forum.\nThanks to a hands-on guide introducing programming fundamentals alongside topics in computational linguistics, plus comprehensive API documentation,\nNLTK is suitable for linguists, engineers, students, educators, researchers, and industry users alike.\nNLTK is available for Windows, Mac OS X, and Linux. Best of all, NLTK is a free, open source, community-driven project.\nNLTK has been called \u201ca wonderful tool for teaching, and working in, computational linguistics using Python,\u201d\nand \u201can amazing library to play with natural language.\u201d\nNatural Language Processing with Python provides a practical\nintroduction to programming for language processing.\nWritten by the creators of NLTK, it guides the reader through the fundamentals\nof writing Python programs, working with corpora, categorizing text, analyzing linguistic structure,\nand more.\nThe book is being updated for Python 3 and NLTK 3.\n(The original Python 2 version is still available at http://nltk.org/book_1ed.)\n\nSome simple things you can do with NLTK\xb6\nTokenize and tag some text:\n>>> import nltk\n>>> sentence = """At eight o\'clock on Thursday morning\n... Arthur didn\'t feel very good."""\n>>> tokens = nltk.word_tokenize(sentence)\n>>> tokens\n[\'At\', \'eight\', "o\'clock", \'on\', \'Thursday\', \'morning\',\n\'Arthur\', \'did\', "n\'t", \'feel\', \'very\', \'good\', \'.\']\n>>> tagged = nltk.pos_tag(tokens)\n>>> tagged[0:6]\n[(\'At\', \'IN\'), (\'eight\', \'CD\'), ("o\'clock", \'JJ\'), (\'on\', \'IN\'),\n(\'Thursday\', \'NNP\'), (\'morning\', \'NN\')]\n\n\nIdentify named entities:\n>>> entities = nltk.chunk.ne_chunk(tagged)\n>>> entities\nTree(\'S\', [(\'At\', \'IN\'), (\'eight\', \'CD\'), ("o\'clock", \'JJ\'),\n           (\'on\', \'IN\'), (\'Thursday\', \'NNP\'), (\'morning\', \'NN\'),\n       Tree(\'PERSON\', [(\'Arthur\', \'NNP\')]),\n           (\'did\', \'VBD\'), ("n\'t", \'RB\'), (\'feel\', \'VB\'),\n           (\'very\', \'RB\'), (\'good\', \'JJ\'), (\'.\', \'.\')])\n\n\nDisplay a parse tree:\n>>> from nltk.corpus import treebank\n>>> t = treebank.parsed_sents(\'wsj_0001.mrg\')[0]\n>>> t.draw()\n\n\n\nNB. If you publish work that uses NLTK, please cite the NLTK book as\nfollows:\n\nBird, Steven, Edward Loper and Ewan Klein (2009), Natural Language Processing with Python.  
O\u2019Reilly Media Inc.\n\n\nNext Steps\xb6\n\nsign up for release announcements\njoin in the discussion\n\n\n\n\nContents\xb6\n\n\nNLTK News\nInstalling NLTK\nInstalling NLTK Data\nContribute to NLTK\nFAQ\nWiki\nAPI\nHOWTO\n\n\n\nIndex\nModule Index\nSearch Page\n\n\n\n\n\n\n\nTable Of Contents\n\nNLTK News\nInstalling NLTK\nInstalling NLTK Data\nContribute to NLTK\nFAQ\nWiki\nAPI\nHOWTO\n\n\nSearch\n\n\n\n\n\n\n\n                Enter search terms or a module, class or function name.\n            \n\n\n\n\n\n\n\n\n\nnext |\n            modules |\n            index\n\n\n\nShow Source\n\n\n\n\n        \xa9 Copyright 2015, NLTK Project.\n      Last updated on Apr 09, 2016.\n      Created using Sphinx 1.3.1.\n    \n\n\n\n\n\n'
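
Since urllib.urlopen exists only in Python 2, a minimal Python 3 port of the same helper would look roughly like this (untested sketch):

from urllib.request import urlopen
from bs4 import BeautifulSoup

def getContentFromURL(url):
    raw = urlopen(url).read()
    return BeautifulSoup(raw, 'html.parser').get_text()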

Exercise 9)


In [23]:
def load(fileName):
    f = open(fileName + '.txt')
    return f.read()
corpusText = load('corpus')

In [24]:
# a
pattern = r'''(?x)
    [\.,;"'?\(\):\-_`\[\]\{\}]+ # one or more punctuation symbols, brackets etc.
'''
print nltk.regexp_tokenize(corpusText, pattern)


['-', '-', '-', ':', '-', "'", '-', '(', ')', ',', '.', '.', '.', '-', '.', '-', '-', '.', '.', '.', '.', '.', ':', '-', '-', ':', '-', '.', '-', ',', ',', ',', ',', '.', ',', '-', ',', ',', '.', '-', '-', ':', '-', ',', '-', ',', '-', '-', ',', '.', '-', '-', '.', '-', '-', ',', '-', '.', ',', '.', ',', ',', '-', ',', '.', ',', '-', '.', "'", '.', "'", '.', ',', '.', '.', ',', ',', '.', ',', '.', ',', '.', ',', '.', ',', ',', ',', ',', '.', ',', ',', ',', ',', '.', ',', '.', ',', ',', "'", "'", '(', ')', '.', "'", ',', '.', '-', '.', '.', ',', ',', '.', '.', '.', "'", "'", '.', "'", "'", '-', '.', '.', ',', '.', ',', ',', '(', ').', ',', '-', '.', "'", "'", ',', ',', '(', ',', ')', "'", '.', ',', ',', '-', '-', ',', '.', ',', '.', '-', '-', '.', ',', ',', ',', '.', '-', ',', ',', ',', ',', '(', '_', '_', '.', ')', '.', '[', '].', ',', ',', ',', '.', ',', '(', '.', ')', ',', '(', '_', '.', ')', '.', ',', '(', '_', '.', ')', '-', '.', '.', ',', ',', ',', '(', ').', ',', '.', '.', '.', ',', ':', ',', "'", "',", "'", "',", "'", "',", "'", "'", ',', '.', "('", "',", "'", "',", "'", "'),", "('", "',", "'", "',", "'", "',", "'", "')", '.', ',', ',', ',', ',', '.', "'", "'", ',', '.', ',', ',', ',', '.', ',', ',', '.', '-', '-', ',', '.', ',', ',', ',', '.', "'", "'", ',', "'", "'", '.', '.', ',', ',', "'", "'", "'", "',", ',', ',', '.', ',', '.', ',', '(', '-', '-', ')', '.', ',', ',', "'", '.', '.', '(', ',', ')', '(', ',', ',', ',', ',', ',', ',', '.),', '-', '-', '(', ')', '.', ',', '-', "'", "'", ',', "'", "'", ',', "'", "'", ',', '[', '].', '-', ':', '(', '_', '.', ')', "'", '_', "',", "'", "',", "'", "',", "'", "'.", '(', ')', '(', ',', ',', '-', ').', ',', ':', ',', "'", '_', "'", ',', "'", "'", ',', "'", "'.", ',', ',', '-', "'", "'", '(', '-', "'", "'", ',', '-', '-', ',', "'", "'", "'", "'", ').', '-', '.', ',', ',', "'", "'", ',', '.', ',', ',', ',', "('", "',", "'", "'", '.).', '.', '(', '.', '.', ',', '),', ',', '.', ',', '.', '.', ',', '.', ',', ',', '.', ',', '.', '(', '_', '.', ')', '-', '.', ',', ',', '.', ':', ',', ',', '(', ').', ',', '(', ',', ').', ',', '.', ',', '(', ')', ',', '.', ',', '(', ').', ',', ',', '.', '.', ',', '(', '),', '.', '(', '),', '.', ',', '(', '),', ':', ',', '.', ',', '.', ',', ',', '.', ',', ',', '.', ',', ',', '(', ')', '(', ').', ',', ',', ',', '[', ']', '.', ',', ',', '.', ',', ',', ',', ',', '.', ',', ',', '.', ',', ',', '.', ',', '.', ',', '(', ').', ',', '.', ',', ',', '.', ',', '.', ',', ',', ',', ',', '.', '.', '-', '-', ',', ',', '.', '-', ',', ',', '.', ',', ',', ',', '.', ',', ',', '.', ',', ',', ',', ':', ',', ',', '(', '),', '.', ',', ',', '(', '),', '.', ',', ',', '(', ',', ').', ',', '-', ',', '.', ',', '.', '-', ',', '(', ')', '.', ',', ':', '(', ')', '(', ')', '[', '].', ',', '.', ',', '.', ',', "'", "'", '.', ',', '-', '-', '.', ',', ',', ',', '.', ',', '.', ',', ',', ',', '.', '.', '-', '-', '-', '-', '[', '].', ',', '-', '-', '.', ',', ',', ':', '(', ')', "'", "',", ',', "'", "'", "'", "'.", ',', ',', ',', '(', ',', '),', ',', '.', ',', "'", '.', '(', '),', ',', ':', "'", "'", "'", "'", '(', ',', ').', ',', '(', '),', ',', '.', "'", "'", '.', ',', "'", "'", ':', '-', '(', '),', ',', '.', '-', '-', '-', ',', ',', '.', ',', ',', ',', '.', ',', ':', ',', ',', ',', '.', ',', "'", ',', '.', ',', '.', ',', '.', '-', '.', ',', "'", "',", ',', ',', '.', "'", '_', "'", ',', ',', ',', '.', ',', '-', '.', "'", "'", '.', ',', "'", '_', "'", "'", "'", '.', ',', '-', ',', ',', '-', '.', ',', ',', ',', '.', '-', '-', ':', '(', "'", '_', "')", '(', 
"'", '_', "').", ',', ',', ',', ',', ',', '.', '-', '.', '(', '.', '),', '-', "('", '_', "')", '-', '-', '.', "('", '_', "')", ',', '(', '),', ',', '.', ',', "('", '_', "')", "'", '.', ',', '.', "'", '_', "'", ',', ',', ',', ',', '-', '-', ',', '(', ',', '),', '.', ',', "'", '_', "',", '-', '-', ',', '.', ',', ',', "'", '_', "'", ',', '.', ',', ',', '.', ',', ',', '(', ',', ').', ',', '.', ',', ',', ',', '-', '.', '.', '.', '.', ',', '.', ',', '.', '.', ',', ',', "'", '.', '.', "'", "'", ',', '.', ',', ',', ',', '.', ',', ':', ',', ',', ',', ',', '.', ',', '.', ',', ',', ',', ',', ',', ',', '-', '.', '.', '(', ')', ',', '.', '.', ':', "'", '.', ',', '.', ',', '(', ')', '-', '.', ',', '-', ',', "'", "'", '.', '.', '-', ',', "'", '.', "'", ',', '.', ',', ',', ',', '.', ',', ',', ',', ',', ',', '.', ',', ',', '.', '-', ',', ',', '.', ',', '[', '].', ',', "'", '.', ',', '(', ').', ',', '.', "'", "'", '-', ',', '(', ').', '.', '-', ':', ',', '.', '.', ',', '.', ',', ',', "'", "',", '.', "'", "'", "'", "'", ',', "'", '.', ',', '.', "'", "'", '(', "'", '),', '(', '.', '.', '),', ',', ',', '.', ',', '.', ',', "'", "'", '.', ',', '.', ',', ',', ',', ',', ',', '.', '.', '-', ',', ',', ',', '.', '(', ',', '.', '),', ',', '.', '[', ']', '.', ',', ':', "('", "',", "'", "',", "'", "'),", ',', "'", "'", '.', ',', ',', '.', ',', '.', ',', '.', '(', ')', ',', "'", "'", "'", '.', ',', ',', '-', ',', ',', '.', ',', ',', '.', ',', "'", '.', ',', ',', "'", "'", '.', '.', ',', "'", '(', ').', ',', '.', '(', ')', '.', '.', ',', ',', "'", "'", '[', '],', '.', '(', ')', ',', '.', ',', '-', ',', "'", "'", ',', '(', '[', '],', ').', ',', ',', ',', '.', ',', ',', ',', '.', ',', ',', '.', '-', ',', '-', '-', ',', ',', '.', ',', '-', '-', ',', ',', ',', ',', '-', '.', '(', ')', '[', ']', ':', '.', '.', '.', ',', '.', '[', ']', ':', '.', '.', ',', '.', '[', ']', ':', '.', '.', ',', '.', '[', ']', ':', '.', '.', '-', '-', '-', '.', ',', '.', '[', ']', ':', '.', '.', '-', '-', '.', ',', '.', '[', ']', ':', '.', '.', ',', '.', '[', ']', ':', '.', '.', '-', '-', '-', '-', '.', ',', '.', '[', ']', ':', '.', '.', '-', ',', '.', '[', ']', ':', '.', '.', '-', '-', '-', '-', '-', '-', '.', ',', '.', '(', ')', ':', '.', '.', ',', '.', ':', '.', ',', '.', ':', '.', '.', ',', '.', ':', '.', '.', '-', ',', '.', ':', '(', '.', '.)', ':', '-', '-', '-', ':', '-', '.', ',', '.', '.', '.', ',', ',', ',', '.', ',', '.', '-', ':', ':', ':', '.', '.', '.', '-', '.', '.', '.', '.', '.', '.', '.', '-', '(', ').', ',', '.', ',', ',', '-', ',', ',', '.', ',', ',', ',', ',', '.', '-', '[', '].', '(', ')', '(', ':', ').', '(', '.', ')', '[', '].', '.', ',', ':', '(', ')', ',', '(', ':', ').', '(', '.:', ').', ',', '(', '-', '),', '[', '].', '-', '(', ')', '(', ':', '-', '-', ';', '.', ').', ',', ',', ',', '[', '].', '.', '[', ']:', ',', ',', '(', '.', '.', ')', '.', ',', ',', '-', '.', ',', '.', ',', ',', '[', '].', ',', '-', '.', '(', ':', ')', '-', '(', ',', '),', '-', '(', '),', '-', '(', ')', '.', '-', '-', ',', '-', '.', '(', ')', ',', '-', '(', '.', '.', ',', ')', ',', '-', ',', '(', ').', '.', ',', ',', '-', '-', '[', '].', ',', ',', '[', '].', ',', ',', '-', ',', '-', '.', '[', ']:', ',', "'", "',", ',', ',', '.', '[', ']:', ',', "'", "'", '.', ',', ',', '(', ')', '(', '.).', "'", "'", "'", ',', '(', '.;', '.', ').', ',', ',', '-', '[', '].', '(', '.', ';', ')', '-', '(', ')', '(', ').', '(', ')', ',', '-', ',', '-', '.', '.', '(', ':', '),', '.', ',', ',', '-', '[', '].', ',', '(', ')', '[', '],', ',', ',', ',', ',', '[', '].', "'", 
"',", '(', '.).', ',', '(', '.', '.', ')', '(', '.).', '(', '.):', ',', '.', '.', ',', '(', ').', ',', '(', '.', ').', ',', '[', '].', ',', ',', ',', '(', '.', ').', '(', '.', ')', '(', '):', ',', '(', ',', '.)', '.', '.', ',', '-', '.', ',', '(', '.', ')', '[', '],', ',', '[', '].', ',', ',', '-', '(', ':', '-', ').', '(', '.).', ',', '[', '].', ',', '(', ':', '-', ').', '.', ',', ',', '-', '-', '(', '.).', '(', ',', ',', "'", "'-", ';', '[', '];', '.', ').', ',', ',', '[', ']', '.', ',', '.', '(', ')', '-', '.', ',', '-', ',', ',', '-', '[', '].', ',', ',', ',', '(', '.', '.', ').', '(', ')', ',', '(', '),', '(', ').', ',', ':', ',', ',', '.', ',', '.', '(', ',', '.', '.', '[', '])', ',', '-', ',', '.', '.', ',', ',', ',', '.', ',', ':', "'", "'", '(', ')', ',', '[', '].', ',', ',', '(', ',', '[', '])', '.', '-', ',', '.', ',', '-', ',', '-', '[', '].', ',', '(', ')', ',', '-', '.', ',', ',', ',', '(', ')', '-', '(', ':', ').', ',', ',', '-', '.', ',', '-', '.', ',', ',', ',', '.', '.', '"', ':', '."', ':', '.', '.', '.', '.', '.', '.', ':', ':', '.', '.', '-', '-', '-', '.', '.', ':', '.', ',', '.', '.', '"', ':', '."', ':', '.', '(', ').', '.', '.', '.', '.', ':', ':', '.', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '.', '?', '.', ':', '.', ',', ',', ',', ',', '.', '.', ',', ',', ',', '.', '.', '"', ',', '."', ':', '.', ',', '.', '-', '.', '.', ':', ':', '.', '-', '.', '.', ':', '.', ',', '.', ',', ',', '.', '.', '"', '."', ':', '.', '-', '.', ',', '.', '.', ':', ':', '.', '-', '.', '.', ':', '.', ',', '.', '.', '.', ':', '.', ',', ',', '.', '.', '.', ':', '-', '.', ',', ',', ',', '.', '.', '"', '."', ':', '.', ',', '.', '-', '.', '.', ':', ':', '.', '.', '_', '.', '.', ':', '.', ',', ',', '.', '.', '"', ':', '."', ':', '-', '.', ',', '.', '.', '.', ':', ':', '.', '-', '.', '.', ':', '.', ',', ',', '.', '.', '"', '."', ':', '.', ',', '.', '-', '.', '.', ':', ':', '.', '.', '-', '.', '.', ':', '.', ',', ',', '.', '.', '"', '-', '-', '."', ':', '.', ',', '.', '-', '.', '.', ':', ':', '.', '.', '_', '_', '_', '_', '.', '.', ':', '.', '[', ']', ':', '.', '.', ',', '.', '[', ']', ':', '.', '.', ',', '.', '[', ']', ':', '.', '.', '.', ',', '.', '[', ']', ':', '.', '.', '.', ',', '.', '[', ']', ':', '.', '.', '.', '?', ',', '.', '[', ']', ':', '.', '.', '-', '-', '-', ',', '.', '[', ']', ':', '.', '.', '-', '-', '-', ',', '.', '[', ']', ':', '.', '.', '.', '?', '_', '_', ',', '.', '[', ']', ':', '.', '.', '-', '-', ',', '.', '[', ']', ':', '.', '.', ',', '.', '[', ']', ':', '.', '.', ',', '.', '[', ']', ':', '.', '.', '.', '?', '_', ',', '.', '[', ']', ':', '.', '.', '.', '?', ',', '.', '.', ',', '.', '.', '(', ':', '-', '-', ')', '-', ':', '(', ',', ',', ')', ',', '...', ',', '...', ',', '...', ',', '...', '.', '(', ')', ',', '(', ')', '.', ',', ',', '(', ':', ').', '.', '(', ')', ',', '(', ';', ':', '-', '):', '.', '.', '...', '...', '.', '.', '(', '[', ']', '[', '])', ':', ',', ',', '.', '(', ').', '?', '?', '(', ',', '...', ',', '.)', ',', '(', '.', '.', '?', ',', ').', '?', ',', '.', ':', ':', ',', ',', '.', '(', '[', ']', '[', '])', ':', ':', ':', ':', ':', '"', '"', ':', '(', ').', ':', ':', '"', '"', '.', ':', ':', '"', '"', ':', ':', '"', '"', ':', '(', ').', ':', ':', '"', '"', ':', ':', '"', '"', ':', ':', '"', '"', ':', ',', ':', '(', ',', '.).', ':', ':', '"', '"', ':', '"', '"', ':', '.', '.', ',', '(', ':', '-', ')', '.', ':', '?', ',', ':', ':', ':', '?', ',', ':', ':', '.', ':', '.', ':', '-', ',', '.', ':', ':', '-', '.', ':', '-', ':', '-', '-', 
':', ':', ':', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', ':', ':', ':', ':', ':', ':', ':', "'", "',", ',', '.', '.', ',', ',', ',', '.', "'", "'", ',', ',', '.', '.', '(', ':', ').', ',', ',', ',', ',', ',', '(', ':', '-', ').', ',', '(', ')', '(', ';', '.', ').', ',', ',', '.', '(', ',', ')', ',', ',', ',', ',', ',', '(', ':', '-', ').', ',', ':', '-', ',', ',', ',', ',', ',', '-', '.', ',', '"', '"', ',', '.', ',', '"', '"', '.', '[...]', '(', ':', ')', '.', ',', '.', '(', ':', ')', ',', ',', '.', '(', ':', '-', ').', ',', '(', ':', '),', '.', '.', ',', '.', '(', ':', ')', ',', '(', ':', '-', ')', ',', ',', ',', ',', '.', ',', ',', ',', ',', "'", "'", '.', '(', '),', '.', ',', ',', '.', ',', '.', '.', "'", "',", ',', '(', '.', '.', ':', ',', ':', '),', '.', ',', "'", "'", '[', '].', "'", "'", '(', ':', '-', '),', ',', '(', ',', ').', ',', "'", "',", "'", "',", '(', ':', ').', '(', ':', '-', ')', "'", "'", "'", "'", "'", "':", ',', ',', "'", "'", ',', '.', '(', ':', '),', ',', ',', '-', ',', '(', ':', ').', '?', ',', ',', ',', ',', '(', ':', ').', ',', '(', ':', '),', ',', ',', '.', ',', ',', ',', ',', '(', ':', '-', ').', '.', ',', '(', '.', '.', ',', '),', '(', ':', ').', '(', ',', ',', ':', '-', ',', ':', ').', ',', ',', '.', ',', ',', ',', '.', ',', '(', ':', ')', '(', ':', ')', ',', ',', '.', ',', ',', ',', '(', ':', ').', ',', ',', ',', '(', ',', ',', ',', '.)', ',', ',', '(', ':', ').', ',', ',', '.', ',', '(', ':', ')', '[', '],', '.', ',', ',', ',', '(', ').', ',', '(', ':', ')', '.', '.', ',', '.', '(', ',', ',', ':', ',', ':', ')', ',', '(', ':', ').', ',', ',', '(', ':', ').', ',', ',', ',', ',', '(', ':', ',', ':', '-', ').', ',', ',', '(', ':', ')', ':', '.', ',', '.', ',', '(', ':', ',', ':', ').', '(', ':', ')', '(', ':', ').', ',', ',', '(', ':', '),', ',', '(', ':', ').', ',', '-', "'", "'", '.', '.', ',', ',', ',', ',', ':', '.', ':', ',', '(', ':', '-', '),', ',', '.', '(', ':', ')', ',', '[', '],', '[', '],', '.', ',', '(', ')', ',', ',', '.', ',', '.', ',', "'", "'", ',', '(', '):', ':', '(', ')', ',', '[', '].', ':', ',', '.', ':', ',', ',', '(', ')', ',', ',', ',', ',', ',', ',', '.', ',', '(', ',', ',', ':', ',', ').', ',', '(', '-', '-', ')', '[', '-', ']', '.', '(', ';', ')', '(', '),', '.', ',', ',', '.', ',', '.', '[', ']', '-', '[', '].', ',', '[', '],', '.', '[', ']', '(', '(', ':', ')', '),', '.', ',', ',', '(', ':', '),', ',', ',', '.', ',', ',', '.', '.', ':', ',', ',', ',', '(', '-', ':', ').', '.', ',', ',', ',', '.', ',', '(', ':', ')', ',', ',', ',', '-', ',', '.', ',', ',', ',', ',', '.', '-', '-', ',', '-', '-', '-', ',', '(', '.', '),', ':', '-', '-', ',', ',', ',', ',', '.,', '.', ',', '[', ']', ',', ',', ',', '.', '.', '[', '],', ',', '.', '-', '(', '.', '),', ',', ',', ',', ',', '.,', '.', '.', ',', '[', ']', ',', ',', ',', '(', ',', '[', ']).', ',', ',', '.', ',', "'", "'", ',', ':', ':', '?', ':', '?', ',', '?', ':', '?', '(', ')', '?', '?', ':', ',', '?', '.', '.', '.', ':', ',', ',', '(', '.', '),', ',', '(', '.', '),', ',', ',', "'", "'", ',', ',', '(', '.', ').', ',', '(', ')', ',', '.', ',', ',', '-,', '[', '].', '.', '.', ',', ':', '(', ':', ')', '(', ')', '.', ',', ':', '(', '),', ',', '.', ',', '(', '.', ')', ',', ',', ',', '(', '.', ')', ',', ',', '(', '.', ')', '.', ',', ',', ',', '.', ',', ':', ',', ',', ',', '?', '(', ')', '.', ',', ',', '.,', '.', '-.', ',', ',', '-', '-', '-', '(', '),', ',', ',', ',', ',', '.', ',', '.', ',', '.', ',', '-', '-', ',', '(', '.', '.', ',', ').', ',', ',', ',', 
'.', '(', '.', '.', '),', '.', '(', '),', '.', ',', '.', '.', ',', '.', ',', '.', '.', '.', ',', '.', ',', ',', ',', ',', '(', ',', ').', '(', ')', ',', ',', ',', '.', ',', ',', ',', '.', ',', ',', ',', '(', ':', ').', ',', '.', ':', '.', '(', ')', ',', "'", "'", "'", "':", "'", '[', ']', '.', '[', ']', '.', '.', ':', '-', '.', '-', '.', '-', "'", '[', '].', '.', ',', ':', '[', '].', ',', ',', ',', '.', ',', '(', ',', ',', ',', ').', ',', '("', "'", '",', ':', ')', '.', ',', ',', '-', ',', "'", "'", "'", "'.", '.', ',', '.', '(', ')', '.', ',', ',', '.', '(', '.', '.', ')', '.', ',', '.', '.', '.', "'", "'", ',', '.', ',', ',', ',', '(', ',', ',', ',', '.)', '.', ',', ',', '.', '(', ').', ',', ',', ',', ',', '.', ',', ',', '(', ')', '.', '(', '.', '.', ',', ',', ',', ',', '.).', ',', '.', ',', '(', ',', ')', '(', ')', '.', ',', '-', ',', '.', '[', ']', '.', ',', '.', ',', ',', ',', ',', '.', ',', '-', ',', '.', ',', ',', ',', ',', '.', ',', ',', '.', ',', ',', ',', '.', "'", "'", ':', '-', ',', '.', ',', '(', ',', ')', ',', '.', ',', ',', '.', ',', '.', ',', ',', ',', ',', '.', '.', ',', '(', '.', '.', '),', ',', '.', ',', ',', '.', ',', ',', '.', ',', '.', '.', '"', ':', '."', ',', ',', '.', '(', '.):', "'", ':', '.', ':', '.', '.', '.', ',', '.', '.', '"', '."', ':', '.', '(', ').', '-', '.', '.', '.', '.', ':', ':', '.', '.', '.', ':', '.', ',', ',', ',', ',', ',', '.', '.', '.', ':', '.', '.', ',', '.', '.', ':', ':', '.', '.', '.', '.', ':', '.', ',', '.', '.', ':', "'", "'.", ':', '.', ',', '.', '.', '.', '"', "'", "'", ':', '."', ':', '.', '(', ').', '.', '.', '.', '.', ':', ':', '.', '.', '?', '_', '.', '.', '.', ':', '.', ',', '-', ',', '.', '.', '"', '?', '."', ':', '.', '(', ')', '.', '.', '.', '.', ':', ':', '.', '.', '.', ':', '.', ',', '.', ',', '.', '.', ':', '.', ',', ':', '.', ',', '.', '.', '"', '."', ',', '.', ',', ',', ',', '.', '(', '.):', ':', '.', ':', '-', '.', '.', '.', ',', '.', '.', ':', '.', ':', '.', ',', '.', '.', ':', '.', ':', '.', '[', ']', ':', '.', '.', '.', '.', '.', '-', ',', '.', '[', ']', ':', '.', '.', '.', ',', '.', '[', ']', ':', '.', '.', '.', '.', '.', ',', '.', '[', ']', ':', '.', '.', ',', '.', '[', ']', ':', '.', '.', ',', '.', '[', ']', ':', '.', '-', '.', ',', '.', '[', ']', ':', '.', '.', '.', '.', '.', '-', ',', '.', '[', ']', ':', '.', '.', '.', ',', '.', '[', ']', ':', '.', '.', '.', ',', '.', '[', ']', ':', '.', '.', '.', ',', '.', '[', ']', ':', '.', '.', ',', '.', '.', ':', ',', '(', '):', '(', ')', ':', ',', '"', '"', '[', ']', '"', '"', '"', '"', '.', ',', ',', '"', '"', '-', '"', '"', ',', '.', '?', '.', ',', ',', '(', ':', ')', '____________________________________________________________________', '...', ':', '.', '"', '".', '"', '"', '.', '"', '".', '"', '"', '(', ',', ')', '.', '"', '"', ':', '"', '-', '",', '"', '-', '",', '"', '-', '"', '"', '-', '".', ',', '.', '.', ',', ',', '(', ':', ')', '____________________________________________________________________', ':', '-', ',', ',', ',', ',', ',', '-', '.', ',', '"', '"', ',', '.', ',', '"', '"', '.', '"', '"', ',', ';', ',', '"', '."', ',', '"', '"', ',', ',', ',', ',', '-', ',', ',', '"', "'", '."', '"', '"', ',', ',', '.', ',', "'", '.', ',', ',', ',', '(', ':', ')', ':', '(', ')', ':', '-', '-', '(', ')', ':', ',', ',', "'", ',', ',', '-', ':', ':', ',', '(', '[', '])', '____________________________________________________________________', '-', '-', ',', ',', '[', ']:', '-', '.', '(', '[', '])', '____________________________________________________________________', '-', 
'-', ',', '-', ':', ',', ',', '-:', ':', ':', '-', ':', ':', '-', '-:', '-', '-', '-:', ',', ',', ',', '(', '-', '[', '])', '____________________________________________________________________', '-', '((', '-', ')', ')', ':', "'", '-', '.', ':', '-', '.', "'", "',", '.', '(', '[', '])', ':', ':', '(', ',', ')', '(', '.', '.', ')', '(', '.', ')', '(', '.', '.', ')', '(', '.', '.', ')', '(', '.', '.', ')', '-', '(', ',', ')', '(', '.', '.', ')', '(', '.', '.', ')', '(', '.', ')', '(', '.', ')', '(', '.', ')', '(', ',', ')', '(', '.', '.', ')', '(', '.', '.', ')', '(', ')', '(', '.', '.', ')', '(', '.', ')', ':', '.', ',', '.', '.', ',', ',', '[', '].', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', '(', ')', ':', '(', ',', ')', '(', '.', ')', '(', '.', '.', ')', '(', '.', '.', ')', '(', '.', '.', ')', '(', '.', '.', ')', ',', '.', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', '),', '(', ')', ':', '-', '-', '[', ']:', '-', '.', ';', ',', ':', ':', ',', '.', ',', ':', ',', '.', ',', ',', ',', ',', ':', ',', '-', ',', ',', '-', '.', ',', ',', ':', ',', '.', ',', ':', ',', '.', ',', ':', ',', '-', '.', '(', '.', '.', ')', ':', ',', '-', '____________________________________________________________________', '-', '.', ',', ':', ',', '.', ':', '.', ',', '[', ']', ':', ',', ',', ',', '.', ',', '.', '.', ',', ',', ',', '.', '(', ',', ',', '.)', '.', '.', ',']

In [25]:
# b
pattern = r'''(?x)
    (?:\d+\.)?\d+\s?\$                  # Monetary amount like 2.40$
    | \$\s?(?:\d+\.)?\d+                # Monetary amount like $2.40
    | \d{4}\-\d{2}\-\d{2}               # Date like 2016-11-01
    | \d{1,2}\s[A-Z][a-z]{2,8}\s\d{4}   # Date like 2 March 1998
    | [A-Z][a-z]+(?:\s[A-Z][a-z]+)?     # Proper Names - TODO: don't match beginning of sentence
'''
testString = 'should match 3.50$ or 8 $ or 9$ or $2.40 or 2016-11-01 or 2 March 1998 or 19 January 2001 or Sam or United Nations'
print nltk.regexp_tokenize(testString, pattern)


['3.50$', '8 $', '9$', '$2.40', '2016-11-01', '2 March 1998', '19 January 2001', 'Sam', 'United Nations']
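
Note that every group in the pattern is non-capturing ((?:...)). nltk.regexp_tokenize is built on re.findall, which returns group contents instead of full matches as soon as a capturing group is present:

nltk.regexp_tokenize('$2.40', r'\$\s?(?:\d+\.)?\d+')   # ['$2.40']
nltk.regexp_tokenize('$2.40', r'\$\s?(\d+\.)?\d+')     # ['2.'] -- the capturing group wins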

In [26]:
print nltk.regexp_tokenize(corpusText, pattern)


['Web', 'Based', 'Assessment Beyond', 'Multiple', 'Choice', 'The Application', 'Technologies', 'Different Testing', 'Formats', 'Documentation', 'Master', 'Thesis', 'Linguistics', 'Web Technology', 'Faculty', 'Foreign Languages', 'Cultures', 'Philipps', 'Universit', 'Marburg', 'Julia Neumann', 'Naumburg', 'Germany', 'Marburg', 'Contents', 'List', 'Abbreviations', 'Introduction', 'User Guide', 'Overall Organization', 'Code', 'General Design', 'Java', 'Script Components', 'Implementation', 'Testing Formats', 'Crossword', 'Dynamic Multiple', 'Choice', 'Drag', 'Drop', 'Database Structure', 'General Features', 'Index Page', 'Contact Page', 'Color Changer', 'Inline Editing', 'Deletion', 'Exporting Tests', 'References', 'Appendix', 'Database Structure', 'Declaration', 'Authorship', 'List', 'Abbreviations', 'Asynchronous Java', 'Script', 'Cascading Style', 'Sheets', 'Document Object', 'Model', 'Hypertext Markup', 'Language', 'Joint Photographic', 'Experts Group', 'Model', 'View', 'Controller\nMy', 'Li', 'My', 'Improved', 'Hypertext Preprocessor', 'Portable Network', 'Graphics', 'Structured Query', 'Language', 'Scalable Vector', 'Graphics', 'Extensible Markup', 'Language', 'Introduction\nThis', 'The', 'Thus', 'These', 'Users', 'Modern', 'While', 'Java', 'Script', 'The', 'Moreover', 'Before', 'User Guide', 'The', 'This', 'Hovering', 'Apart', 'They', 'The', 'By', 'Clicking', 'To', 'Therefore', 'Instead', 'Home', 'Clicking', 'The', 'Short', 'In', 'Elements', 'This', 'Pressing', 'Enter', 'Elements', 'Clicking', 'When', 'Furthermore', 'Running', 'The', 'Check', 'While', 'Overall Organization', 'Code\nThe', 'Java', 'Script', 'The', 'Furthermore', 'On', 'Its', 'My', 'Li', 'Objects', 'Furthermore', 'Lastly', 'It', 'Another', 'These', 'The', 'Java', 'Script', 'Java', 'Script', 'Aside', 'For', 'These', 'Java', 'Script', 'The', 'The', 'Java', 'Script', 'These', 'Furthermore', 'This', 'Java', 'Script', 'Finally', 'Apart', 'This', 'Java', 'Script', 'It', 'Java', 'Script', 'As', 'Java', 'Script', 'Java', 'Script', 'Therefore', 'Java', 'Script', 'General Design', 'Java', 'Script Components', 'As', 'Java', 'Script', 'Model', 'View', 'Controller', 'The', 'Combining', 'Java', 'Script', 'The', 'Java', 'Script', 'Test', 'Test', 'Item', 'View', 'Control', 'These', 'View', 'In', 'Java', 'Script', 'Test', 'View', 'Control', 'The', 'Test', 'Item', 'Question', 'Item', 'Container', 'These', 'While', 'View', 'When', 'The', 'Whenever', 'Implementation', 'Testing Formats', 'While', 'Java', 'Script', 'Crossword\nFor', 'Answers', 'English', 'Once', 'The', 'Java', 'Script', 'Thus', 'The', 'Given', 'Then', 'For', 'If', 'Once', 'This', 'If', 'The', 'Thus', 'First', 'This', 'Second', 'Should', 'The', 'As', 'Web Worker', 'This', 'Before', 'Web Workers', 'In', 'Once', 'Java', 'Script', 'When', 'Apart', 'Simply', 'An', 'Upon', 'Dynamic Multiple', 'Choice\nWhile', 'As', 'Thus', 'This', 'As', 'By', 'It', 'For', 'In', 'As', 'For', 'Here', 'Simple', 'Apart', 'As', 'Java', 'Script', 'This', 'Lastly', 'Thus', 'When', 'This', 'Drag', 'Drop\nThe', 'While', 'The', 'The', 'Thus', 'Once', 'Elements', 'Dragenter', 'When', 'The', 'Finally', 'The', 'In', 'This', 'Another', 'Learners', 'In', 'Java', 'Script', 'Database Structure', 'Complementing', 'My', 'For', 'Appendix', 'The', 'On', 'The', 'This', 'The', 'Ds', 'Similarly', 'Ds', 'This', 'On', 'The', 'Each', 'Item', 'The', 'As', 'The', 'Thus', 'Lastly', 'The', 'This', 'Strictly', 'Storing', 'It', 'Therefore', 'General Features', 'Apart', 'The', 'Index Page', 'The', 'This', 'The', 'More', 'Assuming', 
'The', 'The', 'As', 'Therefore', 'First', 'Second', 'Java', 'Script', 'This', 'The', 'Contact Page', 'There', 'The', 'Apart', 'Internet', 'As', 'Color Changer', 'In', 'This', 'This', 'Changing', 'Java', 'Script', 'In', 'As', 'This', 'Local Storage', 'Each', 'Whenever', 'If', 'This', 'Inline Editing', 'Deletion\nAnother', 'While', 'Displaying', 'Therefore', 'For', 'The', 'View', 'Thus', 'When', 'Also', 'Following', 'These', 'As', 'Exporting Tests', 'Administrators', 'As', 'The', 'Basically', 'When', 'Download', 'Download', 'Print Test', 'Object', 'This', 'Next', 'Blob', 'This Blob', 'As', 'Object', 'Blob', 'The', 'If', 'Once', 'If', 'Data', 'This', 'In', 'The', 'It', 'First', 'Internet Explorer', 'Object', 'The', 'Second', 'Data', 'Blob', 'Chrome', 'Opera', 'Thus', 'Firefox', 'Printing', 'Chrome', 'Opera', 'Firefox', 'Again', 'As', 'This', 'Local Storage', 'Web Workers', 'Web', 'References\nInternet', 'Sources', '06 May 2015', '06 May 2015', '06 May 2015', '11 May 2015', '07 May 2015', '07 May 2015', '11 May 2015', 'Web', 'Element', 'Object', '12 May 2015', '11 May 2015', 'Further Internet', 'Sources', '07 May 2015', '07 May 2015', '07 May 2015', '07 May 2015', 'Appendix', 'Database Structure', 'Declaration', 'Authorship', 'Master', 'Thesis', 'Master', 'Arts', 'Web', 'Based', 'Assessment Beyond', 'Multiple', 'Choice', 'The Application', 'Technologies', 'Different Testing', 'Formats', 'No', 'All', 'The', 'Internet', 'Marburg', '19 May 2015', 'Markup Languages', 'Human Language', 'Technologies', 'Three Examples', 'Julia Neumann', 'Paper', 'Human Language', 'Technologies', 'Winter Term', 'Submission Date', '17 December 2013', 'Approved', 'Dr', 'Peter Franke', 'Philipps University', 'Marburg\nContents', 'List', 'Abbreviations', 'Introduction', 'Overview', 'Markup Language', 'Advantages', 'Applications', 'Languages', 'Conclusions', 'References', 'Appendix', 'Example', 'Example', 'Example', 'Example', 'List', 'Abbreviations', 'Artificial Intelligence', 'Markup Language', 'Document Type', 'Definition', 'Human Language', 'Technologies', 'Hypertext Markup', 'Language', 'Natural Language', 'Processing', 'Web Ontology', 'Language', 'Standard Generalized', 'Markup Language', 'World Wide', 'Web Consortium', 'Extensible Markup', 'Language', 'Path', 'Path Language', 'Extensible Stylesheet', 'Language Transformations', 'Introduction', 'Overview', 'The', 'In', 'Furthermore', 'But', 'Extensible Markup', 'Language', 'The', 'Lobin', 'Its', 'Markup Language', 'In', 'Markup', 'Lobin', 'However', 'This', 'Lobin', 'More', 'Advantages', 'Applications', 'The', 'Firstly', 'Secondly', 'Moreover', 'These', 'As', 'Schwartzbach', 'Languages', 'Considering', 'Artificial Intelligence', 'Markup Language', 'Web Ontology', 'Language', 'Extensible Stylesheet', 'Transformations', 'Our', 'It', 'The', 'Each', 'These', 'For', 'Furthermore', 'Other', 'These', 'Contemporary', 'Fialho', 'Silvervarg', 'As Bii', 'While', 'Silvervarg', 'It', 'Semantic Web', 'Web', 'Such', 'In', 'The', 'The', 'Class', 'By', 'To', 'Web', 'In', 'Androutsopoulos', 'Another', 'Sateli', 'Semantic Assistants', 'Witte', 'Gitzinger', 'Here', 'After', 'Being', 'Lobin', 'These', 'In', 'More', 'Lobin', 'The', 'This', 'Path', 'For', 'Gill', 'Similarly', 'Of', 'Sch', 'Weitz', 'Conclusions\nThe', 'Of', 'Using', 'As', 'Still', 'Other', 'Other', 'Furthermore', 'However', 'Moreover', 'Path', 'Still', 'Lobin', 'Although', 'Due', 'References\nAndroutsopoulos', 'Ion', 'Lampouras', 'Gerasimos', 'Galanis', 'Dimitros', 'Generating Natural', 'Language Descriptions', 
'Ontologies', 'Natural', 'System', 'In', 'Journal', 'Artificial Intelligence', 'Research', 'November', 'Available Online', 'Last', '30 November 2013', 'Bii', 'Patrick', 'Chatbot Technology', 'Possible Means', 'Unlocking Student', 'Potential', 'Learn How', 'Learn', 'In', 'Educational Research', 'Februar', 'Available Online', 'Last', '30 November 2013', 'Fialho', 'Pedro', 'Coheur', 'Lu', 'Curto', 'Costa', 'Pedro', 'Abad', 'Alberto', 'Meinedo', 'Hugo', 'Trancoso', 'Isabel', 'Meet', 'In', 'Proceedings', 'Annual Meeting', 'Association', 'Computational Linguistics', 'Sofia', 'Bulgaria', '9 August 2013', 'Available Online', 'Last', '28 November 2013', 'Gill', 'Alastair', 'Brockmann', 'Carsten', 'Oberlander', 'Jon', 'Perceptions', 'Alignment', 'Personality', 'Generated Dialogue', 'In', 'Proceedings', 'International Natural', 'Language Generation', 'Conference', 'May', '1 June 2012', 'Utica', 'Available Online', 'Last', '29 November 2013', 'Lobin', 'Henning', 'Computerlinguistik', 'Texttechnologie', 'Paderborn', 'Fink', 'Anders', 'Schwartzbach', 'Michael', 'An Introduction', 'Web Technologies', 'Harlow', 'Addison', 'Wesley', 'Sateli', 'Bahar', 'Cook', 'Gina', 'Witte', 'Ren', 'Smarter Mobile', 'Apps', 'Integrated Natural', 'Language Processing', 'Services', 'In', 'Proceedings', 'International Conference', 'Mobile Web', 'Information Systems', 'Paphos', 'Cyprus', '28 August 2013', 'Available Online', 'Last', '30 November 2013', 'Sch', 'Ulrich', 'Weitz', 'Benjamin', 'Combining', 'Outputs', 'Logical Document', 'Structure Markup', 'Technical Background', 'Contributed Task', 'In', 'Proceedings', 'Special Workshop', 'Rediscovering', 'Years', 'Discoveries', 'Jeju', 'Republic', 'Korea', '10 July 2012', 'Available Online', 'Last', '30 November 2013', 'Silvervarg', 'Annika', 'Arne', 'Iterative Development', 'Evaluation', 'Social Conversational', 'Agent', 'In', 'Proceedings', 'International Joint', 'Conference', 'Natural Language', 'Processing', 'Nagoya', 'Japan', '18 October 2013', 'Available Online', 'Last', '28 November 2013', 'Witte', 'Ren', 'Gitzinger', 'Thomes', 'Semantic Assistants', 'User', 'Centric Natural', 'Language Processing', 'Services', 'Desktop Clients', 'In', 'Proceedings', 'Asian Semantic', 'Web Conference', 'Bangkok', 'Thailand', '11 December 2008', 'Available', 'Online', 'Last', '30 November 2013', 'Internet Sources', '21 November 2013', '23 November 2013', '28 November 2013', '28 November 2013', '28 November 2013', '30 November 2013', '30 November 2013', 'Web', 'Ontology', 'Language', '30 November 2013', '29 November 2013', '29 November 2013', '30 November 2013', 'Procedural', '6 December 2013', '7 December 2013', 'Appendix\nThis', 'It', 'Example\nThe', 'Lobin', 'In', 'Within', 'Lobin', 'The', 'Lobin', 'Example', 'This', 'When', 'This', 'Do', 'Star Wars', 'No', 'The', 'Which', 'Why', 'Star Wars', 'The', 'The', 'It', 'Example\nThe', 'Class', 'Class', 'Of', 'Class', 'Thing', 'There', 'Thing', 'Thing', 'The', 'Object', 'Property', 'Color', 'Object', 'Property', 'Here', 'It', 'Color', 'Thing', 'Color', 'Thing', 'The', 'Example\nIn', 'Lobin', 'The', 'Hi', 'How', 'In', 'User', 'Chatbot', 'Hi', 'How', 'For', 'This', 'User', 'Parts', 'The', 'Path', 'Chatbot', 'The', 'An', 'Uber', 'Exceptional', 'Perspective', 'Borrowing', 'Corpus', 'Based Case', 'Study', 'German Loan', 'Morpheme', 'Present', 'Day English', 'Julia Neumann', 'Paper', 'The New', 'Media', 'Linguistics', 'Corpus Linguistics', 'Summer Term', 'Submission Date', '19 September 2014', 'Approved', 'Prof', 'Dr', 'Rolf Kreyer', 'Philipps 
University', 'Marburg\nContents', 'Abstract', 'Introduction', 'Theoretical', 'Empirical Background', 'General Aspects', 'Terminology', 'Adaptation', 'Loans', 'German Loans', 'English', 'Corpus Study', 'Example', 'German Loan', 'Morpheme', 'Choice', 'Corpus', 'Corpus Research', 'Orthography', 'Grammar', 'Semantics', 'Usage', 'Comparison', 'Donor Language', 'Conclusions', 'References', 'Appendix', 'Speaker Intuitions', 'Appendix', 'English Dictionary', 'Information', 'Appendix', 'Research Results', 'Table', 'Different Word', 'Classes', 'English', 'Table', 'Words Used', 'Most Commonly', 'English', 'Table', 'Different Word', 'Classes', 'German', 'List', 'Items Used', 'Most Commonly', 'German', 'Appendix', 'German Dictionary', 'Information', 'Confirmation', 'Authorship', 'Abstract\nLoan', 'The', 'For', 'German', 'English', 'Ten', 'Ten', 'The', 'German', 'Introduction\nOne', 'Haugen', 'While', 'English', 'German', 'German', 'English', 'Stanforth', 'However', 'German', 'German', 'English', 'Limbach', 'German', 'One', 'The', 'English', 'American', 'British English', 'English', 'Limbach', 'Thus', 'The', 'American', 'Recently', 'Relieved', 'American English', 'Limbach', 'One', 'In', 'As Stanforth', 'English', 'German', 'His', 'German', 'English', 'Stanforth', 'Accordingly', 'German', 'Pfeffer', 'Cannon', 'German', 'English', 'It', 'German', 'English', 'Such', 'Meier', 'Stubbs', 'German', 'Angst', 'English', 'German', 'Assuming', 'English', 'Corpus', 'German', 'However', 'Theoretical', 'Empirical Background', 'The', 'General Aspects', 'Terminology\nWhile', 'Haspelmath', 'Haugen', 'In', 'Concise Oxford', 'Dictionary', 'Linguistics', 'Although', 'Haugen', 'Furthermore', 'Haspelmath', 'Haugen', 'While', 'All', 'Haugen', 'Haspelmath', 'Why', 'There', 'Haspelmath', 'Thus', 'Meier', 'In', 'Stanforth', 'Adaptation', 'Loans\nNo', 'Haspelmath', 'Several', 'Pfeffer', 'Cannon', 'Haugen', 'However', 'First', 'For German', 'English', 'Meier', 'Pfeffer', 'Cannon', 'English', 'With', 'Haugen', 'It', 'German', 'English', 'English', 'German', 'Stanforth', 'As', 'With', 'Stubbs', 'German', 'English', 'German', 'One', 'Stanforth', 'In', 'Pfeffer', 'Cannon', 'German Loans', 'English\nWhile', 'German', 'English', 'German', 'English', 'It', 'Stubbs', 'Stanforth', 'English', 'Stubbs', 'Possible Reasons', 'English', 'Great Britain', 'United States', 'Stanforth', 'Thus', 'English', 'German', 'Stubbs', 'Pfeffer', 'Cannon', 'The', 'German', 'Stubbs', 'There', 'English', 'The', 'The', 'German', 'German', 'Stubbs', 'Stanforth', 'Estimates', 'German', 'English', 'Stanforth', 'Stubbs', 'In', 'German', 'English', 'Stanforth', 'English', 'Pfeffer', 'Cannon', 'Corpus Study', 'Based', 'German', 'Example', 'German Loan', 'Morpheme\nAs', 'German', 'English', 'German', 'This', 'First', 'Limbach', 'English', 'The', 'Stanforth', 'English', 'German', 'British National', 'Corpus', 'Second', 'Limbach', 'Furthermore', 'The', 'English', 'Appendix', 'The', 'The', 'English', 'Apart', 'Stubbs', 'Meier', 'For', 'English', 'All', 'Appendix', 'With', 'Additionally', 'Longman Dictionary', 'Contemporary English', 'Merriam', 'Webster Dictionary', 'Further', 'While', 'Oxford English', 'Dictionary', 'German', 'Limbach', 'German', 'Dictionary', 'Meier', 'An', 'Choice', 'Corpus\nThere', 'First', 'Onysko', 'Winter', 'Froemel', 'This', 'Second', 'English', 'Third', 'Stubbs', 'As', 'Ten', 'Ten', 'Web', 'This', 'Web', 'Jakub', 'It', 'Web', 'Furthermore', 'Sketch Engine', 'Ten', 'Ten', 'German', 'German', 'The', 'Ten', 'Ten', 'German', 'Another', 'Ten', 'Ten', 
'Jakub', 'Corpus Research', 'For', 'Sketch Engine', 'Corpus Query', 'Language', 'As', 'Ten', 'Ten', 'German', 'The', 'English', 'Orthography', 'To', 'Grammar', 'Which', 'Is', 'Semantics', 'What', 'Which', 'What', 'Comparison', 'Which', 'German', 'The', 'Orthography\nThe', 'German', 'German', 'German', 'These', 'German', 'English', 'Nothing', 'English', 'German', 'Grammar\nAs', 'It', 'Stanforth', 'Thus', 'It', 'In', 'Consequently', 'Apart', 'Is', 'Table', 'Appendix', 'Taking', 'Thus', 'These', 'Also', 'Jennifer', 'There', 'Australia', 'What', 'Even', 'English', 'English', 'Second', 'This', 'Indeed', 'In', 'Rather', 'Semantics', 'Usage\nAs', 'English', 'As', 'Sketch Engine', 'Python', 'Table', 'Appendix', 'Even', 'This', 'Limbach', 'As', 'This', 'Rafael', 'Christmas', 'Perennial', 'Duke', 'Mike Kzryzewksi', 'He', 'So', 'Or', 'The', 'Thus', 'No', 'Rumors', 'Rowling', 'Harry Potter', 'Limbach', 'Additionally', 'As', 'In Stanforth', 'German', 'English', 'Nazi Germany', 'Thus', 'German', 'Germany', 'Nazi', 'Hitler', 'German', 'Only', 'This', 'English', 'Comparison', 'Donor Language', 'To', 'German', 'In', 'Ten', 'Ten', 'Table', 'Appendix', 'The', 'German', 'These', 'Only', 'Accordingly', 'Appendix', 'Python', 'As', 'German', 'English', 'The', 'Duden Online', 'Appendix', 'Duden', 'However', 'English', 'Rather', 'In', 'German', 'English', 'German', 'Conclusions\nThe', 'English', 'Its', 'German', 'Furthermore', 'German', 'These', 'English', 'Of', 'Diachronic', 'German', 'Investigating', 'Furthermore', 'In', 'It', 'English', 'References\nHaspelmath', 'Martin', 'Lexical', 'Concepts', 'In Haspelmath', 'Martin', 'Tadmor', 'Uri', 'Loanwords', 'World', 'Languages', 'Comparative Handbook', 'Berlin', 'Walter', 'Gruyter', 'Haugen', 'Einar', 'The Analysis', 'Linguistic Borrowing', 'In', 'Language', 'April', 'June', 'Available Online', 'Last', '5 September 2014', 'Jakub', 'Milo', 'Kilgarriff', 'Adam', 'Kov', 'Vojt', 'Rychl', 'Pavel', 'Suchomel', 'The Ten', 'Ten Corpus', 'Family', 'In', 'Proceedings', 'International Corpus', 'Linguistics Conference', '26 July 2013', 'Lancaster', 'Available Online', 'Corpora', 'Ten', 'Ten', 'Last', '10 September 2014', 'Limbach', 'Jutta', 'Ausgewanderte', 'Eine Auswahl', 'Beitr', 'Ausschreibung', 'Ausgewanderte', 'Ismaning', 'Hueber', 'Meier', 'The Status', 'Foreign Words', 'English', 'The Case', 'Eight German', 'Words', 'In', 'American Speech', 'Summer', 'Available Online', 'Last', '5 September 2014', 'Onysko', 'Alexander', 'Winter', 'Froemel', 'Esme', 'Necessary', 'Exploring', 'In', 'Journal', 'Pragmatics', 'Available Online', 'Last', '5 September 2014', 'Pfeffer', 'Alan', 'Cannon', 'Garland', 'German Loanwords', 'English', 'An Historical', 'Dictionary', 'Cambridge', 'New York', 'Cambridge University', 'Press', 'Stanforth', 'Anthony', 'Functional', 'Stylistic Aspects', 'German Loans', 'English', 'In Flood', 'John', 'Salmon', 'Paul', 'Sayce', 'Oliver', 'Wells', 'Christopher', 'Das', 'Band', 'Sprache', 'Studies', 'German Language', 'Linguistic History', 'Memory', 'Leslie Seiffert', 'Stuttgart', 'Hans', 'Dieter Heinz', 'Akademischer Verlag', 'Stanforth', 'Anthony', 'Deutsche Einfl', 'Wortschatz', 'Geschichte', 'Gegenwart', 'Beitrag', 'Amerikanischen Englisch', 'Eichhoff', 'Niemeyer', 'Stubbs', 'Michael', 'Words', 'Phrases', 'Corpus Studies', 'Lexical Semantics', 'Oxford', 'Blackwell', 'Internet Sources', '9 September 2014', '10 September 2014', '10 September 2014', '8 September 2014', '8 September 2014', '8 September 2014', '8 September 2014', '10 September 2014', 'Corpora', 
'Ten', 'Ten', '10 September 2014', 'Sk', 'Corpus', 'Querying', '15 September 2014', 'September', 'Appendix', 'Speaker Intuitions', 'Speaker', 'English', 'Limbach', 'Englisch', 'In', 'Jugendsprache', 'Steigerungsform', 'Alle', 'Klassen', 'Jugend Gro', 'Wort', 'Sprache', 'Umlaute', 'Nutzen', 'Wie', 'Great Britain', 'Das', 'Befragten', 'Christian Fuchs', 'Berlin', 'Deutschland', 'Limbach', 'Britisches Englisch', 'Beim Zeitunglesen', 'Grossbritannien', 'Woerter', 'Fuer', 'Gro', 'Meist', 'Hin', 'Wort', 'Uberflieger', 'Bedeutung', 'Weg', 'Presse', 'Vor', 'Zusammensetzung', 'Dadurch', 'Bedeutung', 'Wortes', 'Gegenteil', 'Je', 'Zusammenhang', 'Angelika Mohr', 'London', 'Gro', 'Limbach', 'Amerkanisches Englisch', 'The', 'American', 'Recently', 'Relieved', 'American English', 'The', 'That', 'Additionally', 'Rumors', 'Rowling', 'Harry Potter', 'The', 'American', 'American', 'Nazi', 'Aryan', 'This American', 'Nietzschean', 'Robert Keeley', 'Worcester', 'Massachusetts', 'Limbach', 'Appendix', 'English Dictionary', 'Information\nEntries', 'English', 'Definitions', 'America', 'Bill Gates', 'Bohemians\nWord', 'Origin', 'German', 'Collins English', 'Dictionary', 'Pamela Lee', 'Longman Dictionary', 'Contemporary English', 'Full Definition', 'Variants', 'Origin', 'German', 'Old High', 'German', 'Merriam', 'Webster Dictionary', 'German', 'Oxford Dictionary', 'English', 'Appendix', 'Research Results', 'Table', 'Different Word', 'Classes', 'English', 'Table', 'Words Used', 'Most Commonly', 'English\nThe', 'Items', 'The', 'Connotations', 'Oxford Dictionary', 'English', 'Table', 'Different Word', 'Classes', 'German', 'List', 'Items Used', 'Most Commonly', 'German\nThe', 'Appendix', 'German Dictionary', 'Information\nEntry', 'Duden', 'Bildungen', 'Adjektiven', 'Verst', 'Gebrauch', 'Beispiel', 'Bildungen', 'Adjektiven', 'Eigenschaft', 'Beispiel', 'Bildungen', 'Adjektiven', 'Verben', 'Ma', 'Beispiele', 'Bildungen', 'Verben', 'Sache', 'Beispiel', 'Bildungen', 'Verben', 'Bedecken', 'Sicherstrecken', 'Beispiel', 'Bildungen', 'Substantiven', 'Endung', 'Sache', 'Oberseite', 'Beispiel', 'Bildungen', 'Verben', 'Wechseln', 'Stelle', 'Beispiel', 'Bildungen', 'Substantiven', 'Zuviel', 'Beispiel', 'Bildungen', 'Substantiven', 'Beispiel', 'Bildungen', 'Substantiven', 'Figur', 'Sache\nBeispiel', 'Confirmation', 'Authorship', 'All', 'The', 'Internet', 'Violation', 'Marburg', '19 September 2014']

Exercise 10)


In [27]:
sent = ['The', 'dog', 'gave', 'John', 'the', 'newspaper']
print [(w, len(w)) for w in sent]


[('The', 3), ('dog', 3), ('gave', 4), ('John', 4), ('the', 3), ('newspaper', 9)]

Exercise 11)


In [28]:
raw = 'Tres tristes tigres comen trigo en un trigal.'
raw.split('t')


Out[28]:
['Tres ', 'ris', 'es ', 'igres comen ', 'rigo en un ', 'rigal.']

Exercise 12)


In [29]:
for char in raw[:10]:
    print char


T
r
e
s
 
t
r
i
s
t

Exercise 13)


In [30]:
raw.split()


Out[30]:
['Tres', 'tristes', 'tigres', 'comen', 'trigo', 'en', 'un', 'trigal.']

In [31]:
raw.split(' ')


Out[31]:
['Tres', 'tristes', 'tigres', 'comen', 'trigo', 'en', 'un', 'trigal.']

In [32]:
sent = 'Tres\ttristes\ttigres\tcomen\ttrigo\ten\tun\ttrigal.'
sent.split()


Out[32]:
['Tres', 'tristes', 'tigres', 'comen', 'trigo', 'en', 'un', 'trigal.']

In [33]:
sent.split(' ')


Out[33]:
['Tres\ttristes\ttigres\tcomen\ttrigo\ten\tun\ttrigal.']

In [34]:
sent = 'Tres   tristes   tigres   comen   trigo   en   un   trigal.'
sent.split()


Out[34]:
['Tres', 'tristes', 'tigres', 'comen', 'trigo', 'en', 'un', 'trigal.']

In [35]:
sent.split(' ')


Out[35]:
['Tres',
 '',
 '',
 'tristes',
 '',
 '',
 'tigres',
 '',
 '',
 'comen',
 '',
 '',
 'trigo',
 '',
 '',
 'en',
 '',
 '',
 'un',
 '',
 '',
 'trigal.']

In [36]:
sent = 'Tres \ttristes\t\t\ttigres\t\t comen\t \t trigo en un trigal.'
sent.split()


Out[36]:
['Tres', 'tristes', 'tigres', 'comen', 'trigo', 'en', 'un', 'trigal.']

In [37]:
sent.split(' ')


Out[37]:
['Tres',
 '\ttristes\t\t\ttigres\t\t',
 'comen\t',
 '\t',
 'trigo',
 'en',
 'un',
 'trigal.']
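
# summary: split() with no argument splits on runs of arbitrary whitespace and
# never yields empty strings; split(' ') splits on every single space, so runs
# of spaces produce empty strings and tabs are not separators at all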

Exercise 14)


In [38]:
words = raw.split()
print words
words.sort()
print words


['Tres', 'tristes', 'tigres', 'comen', 'trigo', 'en', 'un', 'trigal.']
['Tres', 'comen', 'en', 'tigres', 'trigal.', 'trigo', 'tristes', 'un']

In [39]:
words = raw.split()
sorted(words)


Out[39]:
['Tres', 'comen', 'en', 'tigres', 'trigal.', 'trigo', 'tristes', 'un']

In [40]:
words


Out[40]:
['Tres', 'tristes', 'tigres', 'comen', 'trigo', 'en', 'un', 'trigal.']

In [4]:
# .sort() sorts the list in place (and returns None); sorted() returns a new sorted list

Exercise 15)


In [42]:
'3' * 7


Out[42]:
'3333333'

In [43]:
3 * 7


Out[43]:
21

In [44]:
int('3') * 7


Out[44]:
21

In [45]:
str(3) * 7


Out[45]:
'3333333'

Exercise 16)


In [5]:
montyTest



NameError                                 Traceback (most recent call last)
<ipython-input-5-5317fa815d20> in <module>()
----> 1 montyTest

NameError: name 'montyTest' is not defined

In [47]:
from test import montyTest
montyTest


Out[47]:
'Monty Python'

In [48]:
import test
test.montyTest


Out[48]:
'Monty Python'
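
These two cells assume a local module test.py whose relevant contents, reconstructed from the output, are simply:

# test.py
montyTest = 'Monty Python'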

Exercise 17)


In [49]:
words = ['some', 'superexcitingly', 'long', 'example', 'words']
for w in words:
    print '%6s' % w,


  some superexcitingly   long example  words

In [50]:
for w in words:
    print '%-6s' % w,


some   superexcitingly long   example words 

In [51]:
for w in words:
    print '%6s' % w


  some
superexcitingly
  long
example
 words
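
%6s only sets a minimum field width, which is why 'superexcitingly' is not cut off. Adding a precision would truncate (a hypothetical variant, not part of the exercise):

print '%6.6s' % 'superexcitingly'   # prints 'supere'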

Exercise 18)


In [52]:
myCorpus = load('corpus')
tokens = nltk.wordpunct_tokenize(myCorpus)

In [53]:
whWords = [w for w in tokens if w.startswith('wh') or w.startswith('Wh')]
print whWords[:50]


['who', 'While', 'which', 'which', 'which', 'whether', 'which', 'whether', 'which', 'whereas', 'when', 'where', 'when', 'When', 'which', 'While', 'which', 'which', 'which', 'which', 'which', 'who', 'which', 'while', 'which', 'where', 'while', 'which', 'while', 'While', 'When', 'Whenever', 'when', 'While', 'white', 'which', 'which', 'which', 'which', 'which', 'which', 'while', 'When', 'where', 'which', 'which', 'whole', 'While', 'which', 'where']

In [54]:
print sorted(set(whWords))


['What', 'When', 'Whenever', 'Which', 'While', 'Why', 'what', 'whatever', 'when', 'where', 'whereas', 'whether', 'which', 'while', 'white', 'who', 'whole', 'whose', 'why']

Exercise 19)


In [55]:
freqs = open('freqs.txt').readlines()
freqs


Out[55]:
['fuzzy 89\n', 'test 2312\n', 'foo 123\n', 'bar 34\n', 'baz 1']

In [56]:
splitted = [[line.split()[0], int(line.split()[1])] for line in freqs]
splitted


Out[56]:
[['fuzzy', 89], ['test', 2312], ['foo', 123], ['bar', 34], ['baz', 1]]
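
An equivalent sketch that avoids calling split() twice per line, using tuple unpacking (it yields tuples rather than the inner lists above):

pairs = [(word, int(freq)) for word, freq in (line.split() for line in freqs)]
# [('fuzzy', 89), ('test', 2312), ('foo', 123), ('bar', 34), ('baz', 1)]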

Exercise 20)


In [57]:
# extracts the title of the article of the day from a given Wikipedia main page
def find_topic(url, trigger):
    text = urllib.urlopen(url).read()
    index = text.rfind(trigger)
    text = text[index:]
    title_with_markup = re.findall(r'\<b\>.+?\<\/b\>', text)[0]
    soup = BeautifulSoup(title_with_markup)
    return soup.get_text()

# German Wikipedia:
print find_topic('https://de.wikipedia.org/wiki/Wikipedia:Hauptseite', '<span class="mw-headline" id="Artikel_des_Tages">Artikel des Tages</span>')

# English Wikipedia:
print find_topic('https://en.wikipedia.org/wiki/Main_Page', '<span class="mw-headline" id="From_today.27s_featured_article">From today\'s featured article</span>')

# Danish Wikipedia:
print find_topic('https://da.wikipedia.org/wiki/Forside', '<div style="padding-left: 38px; color:#333;">Ugens artikel</div>')


Maurice Harold Macmillan
Montreal Laboratory
Jan Palach
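
Searching the raw markup for a trigger string is fragile whenever the page layout changes; a selector-based sketch using BeautifulSoup directly (assuming the headline ids stay stable, which is not guaranteed):

def find_topic_bs(url, headline_id):
    soup = BeautifulSoup(urllib.urlopen(url).read(), 'html.parser')
    span = soup.find('span', id=headline_id)        # locate the section heading
    bold = span.find_next('b') if span else None    # first bold element after it
    return bold.get_text() if bold else None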

Exercise 21)


In [76]:
def unknown(url):
    content = getContentFromURL(url)
    lowercased = re.findall(r'[\s\(\[\{]([a-z]+)', content)
    words = nltk.corpus.words.words()
    return [w for w in lowercased if w not in words]
print unknown('https://en.wikipedia.org/wiki/Main_Page')


[u'mw', u'existing', u'mw', u'mw', u'articles', u'portals', u'scientists', u'escaped', u'joined', u'moved', u'arrived', u'became', u'negotiated', u'merged', u'moved', u'opened', u'reactors', u'larger', u'followed', u'fitz', u'rubiginosa', u'email', u'articles', u'occurred', u'became', u'youngest', u'goalscorer', u'published', u'disappeared', u'sprayed', u'recolonised', u'audiobooks', u'introducing', u'composers', u'parts', u'windows', u'improved', u'articles', u'dies', u'dies', u'songwriter', u'dies', u'crashes', u'including', u'members', u'adopts', u'settlements', u'forces', u'areas', u'deaths', u'knights', u'soldiers', u'captured', u'introduced', u'rules', u'players', u'called', u'elected', u'entitled', u'nanotechnology', u'tallest', u'completed', u'anniversaries', u'email', u'anniversaries', u'varieties', u'domesticated', u'carpio', u'purposes', u'ponds', u'gardens', u'tunggal', u'pictures', u'areas', u'projects', u'resources', u'activities', u'areas', u'questions', u'using', u'languages', u'librarians', u'volunteers', u'questions', u'subjects', u'updates', u'articles', u'releases', u'discussions', u'including', u'areas', u'issues', u'policies', u'projects', u'hosted', u'hosts', u'projects', u'software', u'coordination', u'textbooks', u'manuals', u'quotations', u'materials', u'activities', u'languages', u'contains', u'articles', u'largest', u'articles', u'articles', u'bokm', u'articles', u'nynorsk', u'categories', u'existing', u'tools', u'events', u'changes', u'changes', u'pages', u'projects', u'bokm', u'nynorsk', u'srpski', u'modified', u'terms', u'using', u'trademark', u'mw']

In [59]:
# mostly inflected/derived forms, abbreviations, and foreign words -- nltk.corpus.words only lists base forms

Exercise 22)


In [75]:
print unknown('http://news.bbc.co.uk/')


set([u'things', u'alerts', u'says', u'sheds', u'hats', u'recycling', u'implants', u'sites', u'services', u'businesses', u'inbox', u'allowed', u'stories', u'oldest', u'birds', u'issues', u'barcodes', u'languages', u'changing', u'waits', u'tv', u'pictures', u'seekers', u'claiming', u'officers', u'comments', u'children', u'convicted', u'condemns', u'actors', u'helps', u'stars', u'internet', u'bosses', u'weeks', u'fraudster', u'aliens', u'clues', u'haveyoursay', u'strokes', u'dies', u'earlier', u'chiefs', u'allowing', u'courts', u'doctors', u'arriving', u'benefits', u'died', u'wrestled', u'hours', u'interpretations', u'turbans', u'remarks', u'items', u'newspapers', u'introduces', u'believes', u'choices', u'descended', u'temperatures', u'scores', u'offenders', u'minutes', u'finds'])

In [74]:
def unknown(url):
    text = urllib.urlopen(url).read()
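    # strip <script> and <style> blocks first so their contents are not mistaken for page text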
    text = re.sub(r'\<script(?:.|\n)*?\<\/script\>', '', text)
    text = re.sub(r'\<style(?:.|\n)*?\<\/style\>', '', text)
    soup = BeautifulSoup(text, 'html.parser')
    content = soup.get_text()
    lowercased = re.findall(r'[\s\(\[\{]([a-z]+)', content)
    words = nltk.corpus.words.words()
    return set([w for w in lowercased if w not in words])

print unknown('http://www.bbc.com/news')


set([u'senators', u'named', u'videos', u'resigns', u'hats', u'crimes', u'hairstyle', u'officers', u'arriving', u'retires', u'punched', u'farewells', u'allowed', u'cows', u'choices', u'birds', u'issues', u'languages', u'tv', u'alerts', u'girls', u'alleged', u'comments', u'children', u'parties', u'filming', u'buns', u'condemns', u'actors', u'helps', u'descended', u'memes', u'has', u'decades', u'aliens', u'shows', u'kicked', u'haveyoursay', u'seconds', u'dies', u'approves', u'earlier', u'grossing', u'robots', u'hours', u'allowing', u'sites', u'scores', u'remarks', u'died', u'hits', u'migrants', u'turbans', u'sanctions', u'charges', u'introduces', u'believes', u'overworked', u'steps', u'temperatures', u'shares', u'minutes', u'finds', u'launched', u'inbox'])

Exercise 23)


In [62]:
sample_text = "I don't hate regular expressions."
nltk.regexp_tokenize(sample_text, r'n\'t|\w+')


Out[62]:
['I', 'don', 't', 'hate', 'regular', 'expressions']

In [63]:
# In [62] fails because of greedy matching: \w+ consumes 'don' before n't gets a chance; a lookahead stops \w+ in front of n't
print nltk.regexp_tokenize(sample_text, r'\w+(?=n\'t)|n\'t|\w+')
print nltk.regexp_tokenize('It doesn\'t split donald.', r'\w+(?=n\'t)|n\'t|\w+') # ?= lookahead assertion


['I', 'do', "n't", 'hate', 'regular', 'expressions']
['It', 'does', "n't", 'split', 'donald']

Exercise 24)


In [64]:
def encode(text):
    text = text.lower()
    trans = [('ate', '8'), ('e', '3'), ('i', '1'), ('o', '0'), ('l', '|'), ('s', '5'), (r'\.', '5w33t!')]
    for (key, value) in trans:
        text = re.sub(key, value, text)
    return text

print encode('Hello World!')
print encode('It is getting late.')


h3||0 w0r|d!
1t 15 g3tt1ng |85w33t!

In [65]:
def encode_enhanced(text):
    text = text.lower()
    trans = [('ate', '8'), ('e', '3'), ('i', '1'), ('o', '0'), ('l', '|'), (r'^s|(?<=\s)s', '$'), ('s', '5'), (r'\.', '5w33t!')]
    # (?<=\s) is a lookbehind assertion: it matches a word-initial s without consuming the preceding whitespace
    for (key, value) in trans:
        text = re.sub(key, value, text)
    return text
encode_enhanced('Should treat sea different from ass.')


Out[65]:
'$h0u|d tr3at $3a d1ff3r3nt fr0m a555w33t!'

Exercise 25)


In [66]:
# a
def piginizeWord(word):
    cons = re.findall(r'^[^aeiouAEIOU]*', word)
    return word[len(cons[0]):] + cons[0] + 'ay'
    
piginizeWord('string')


Out[66]:
'ingstray'

In [67]:
# b
def piginizeText(text):
    def helper(matchObj):
        return piginizeWord(matchObj.group(0))
    return re.sub(r'[A-Za-z]+', helper, text)
piginizeText('Some quiet string here that should be converted to Pig Latin at once.')


Out[67]:
'omeSay uietqay ingstray erehay atthay ouldshay ebay onvertedcay otay igPay atinLay atay onceay.'

In [68]:
# c
def piginizeWordImproved(word):
    cons = re.findall(r'^[^aeiouAEIOU]+(?=y)|^[^aeiouqAEIOUQ]*(?:qu)?(?:Qu)?[^aeiouqAEIOUQ]*', word)[0]
    remainder = word[len(cons):]
    if (word.istitle()):
        return remainder.title() + cons.lower() + 'ay'
    return remainder + cons + 'ay'

def piginizeText(text):
    def helper(matchObj):
        return piginizeWordImproved(matchObj.group(0))
    return re.sub(r'[A-Za-z]+', helper, text)
piginizeText('My quiet yellow stylish string that should be converted to Pig Latin at once.')


Out[68]:
'Ymay ietquay ellowyay ylishstay ingstray atthay ouldshay ebay onvertedcay otay Igpay Atinlay atay onceay.'

Exercise 26)


In [69]:
text = urllib.urlopen('https://tr.wikipedia.org/wiki/%C4%B0stanbul').read()
text = re.sub(r'\<script(?:.|\n)*?\<\/script\>', '', text)
text = re.sub(r'\<style(?:.|\n)*?\<\/style\>', '', text)
soup = BeautifulSoup(text, 'html.parser')
content = soup.get_text()
tokens = nltk.wordpunct_tokenize(content)
text = nltk.Text(tokens)
words = [w.lower() for w in text]

In [70]:
vowel_sequences = []
for word in words:
    vowels = ''.join(re.findall(r'[aeiou]', word))
    if (len(vowels) > 0):
        vowel_sequences.append(vowels)
print vowel_sequences[:50]


[u'iau', u'iiei', u'iau', u'iiei', u'aioei', u'aa', u'aiie', u'oo', u'eii', u'aa', u'eieei', u'ieee', u'eee', u'eiii', u'uuaa', u'ouu', u'uaa', u'aa', u'ua', u'aa', u'a', u'ie', u'aaa', u'ii', u'iau', u'aa', u'a', u'aaa', u'a', u'ooiaa', u'e', u'iau', u'i', u'ei', u'uaa', u'aaa', u'aa', u'e', u'aii', u'aaa', u'uei', u'iia', u'aei', u'e', u'i', u'aa', u'ee', u'i', u'eeei', u'oa']

In [71]:
bigrams = []
for vowel_seq in vowel_sequences:
    count = 0
    while (count + 1 < len(vowel_seq)):
        bigrams.append((vowel_seq[count], vowel_seq[count + 1]))
        count += 1
print bigrams[:50]


[(u'i', u'a'), (u'a', u'u'), (u'i', u'i'), (u'i', u'e'), (u'e', u'i'), (u'i', u'a'), (u'a', u'u'), (u'i', u'i'), (u'i', u'e'), (u'e', u'i'), (u'a', u'i'), (u'i', u'o'), (u'o', u'e'), (u'e', u'i'), (u'a', u'a'), (u'a', u'i'), (u'i', u'i'), (u'i', u'e'), (u'o', u'o'), (u'e', u'i'), (u'i', u'i'), (u'a', u'a'), (u'e', u'i'), (u'i', u'e'), (u'e', u'e'), (u'e', u'i'), (u'i', u'e'), (u'e', u'e'), (u'e', u'e'), (u'e', u'e'), (u'e', u'e'), (u'e', u'i'), (u'i', u'i'), (u'i', u'i'), (u'u', u'u'), (u'u', u'a'), (u'a', u'a'), (u'o', u'u'), (u'u', u'u'), (u'u', u'a'), (u'a', u'a'), (u'a', u'a'), (u'u', u'a'), (u'a', u'a'), (u'i', u'e'), (u'a', u'a'), (u'a', u'a'), (u'i', u'i'), (u'i', u'a'), (u'a', u'u')]
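
The index arithmetic above can be avoided: nltk.bigrams yields adjacent pairs directly, so an equivalent construction (same result, as a sketch) is:

# same bigram list, built with nltk.bigrams over each vowel sequence
bigrams = [bg for seq in vowel_sequences for bg in nltk.bigrams(seq)]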

In [72]:
vowels = ['a', 'e', 'i', 'o', 'u']
cfd = nltk.ConditionalFreqDist(bigrams)
cfd.conditions()


Out[72]:
[u'i', u'a', u'e', u'u', u'o']

In [73]:
cfd.tabulate(conditions=vowels,samples=vowels)


     a    e    i    o    u 
a 4813  786 1975  355  880 
e  506 1805 3821  245  207 
i 1658 2884 2539  297   59 
o 1048  215  221  287  672 
u  900  154  174   50  860 

Exercise 27)


In [81]:
import random
def laugh():
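    # the pool 'aehh ' contains h twice, so h is drawn about twice as often as a or e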
    raw = ''.join(random.choice('aehh ') for x in range(500))
    return ' '.join(raw.split())
laugh()


Out[81]:
'hahhhhehea hhaa ah ehahaahe hhhhheahhhahehe ahheheea hheaeheeah hh eehahaaahhhh h e hehe aehee ehheeh hh heae hheae e a ah eaheeehhh h hh h ehhh heeehhhehhhehahhahhh haee aheheeehhahe ahaeheee e ehhaeehaahaeeehahhehehee haah ah a haaheaea eh a h hea e hahheaahh aehaeeh ahh e ahhea eea hhhh eaa hhhe a hhhh aehhehhahhhehaea eh haa eeaeh hheahe ahaah hahe h hah hhhahheahh eh haehhheehh eh ehhh a ahahhhhhhh ehhhaa aa ehe heh ahahhhaeehhh hh hhehhheehhehhae ah ahae ehheh ee hhhhahha e'

Exercise 28)


In [82]:
# three words -> would be compatible with splitting on whitespace
# one compound word -> makes sense semantically; may be relevant for natural language understanding applications
# nine words -> makes sense phonetically; relevant for speech processing applications

Exercise 29)


In [83]:
def ari(category):
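    # Automated Readability Index:
    #   4.71 * (avg characters per word) + 0.5 * (avg words per sentence) - 21.43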
    words = nltk.corpus.brown.words(categories=category)
    sents = nltk.corpus.brown.sents(categories=category)
    av_wordlength = sum(len(w) for w in words) / len(words)
    av_sentlength = sum(len(s) for s in sents) / len(sents)
    return (4.71 * av_wordlength) + (0.5 * av_sentlength) - 21.43
print ari('lore')
print ari('learned')
print ari('government')
print ari('romance')


10.2547561971
11.9260070433
12.084303495
4.34922419804

Exercise 30)


In [85]:
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
text = 'New rules allowing Sikh police officers to wear turbans instead of traditional police hats have been introduced in New York, officials say. The New York Police Department said the turbans must be navy blue and have the NYPD insignia attached. Under the new rules, religious members of the force are also permitted to grow beards up to half-an-inch long. Sikh officers have until now worn turbans under their caps. Beards have not been permitted.'
tokens = nltk.wordpunct_tokenize(text)
print [porter.stem(t) for t in tokens]
print '\n\n'
print [lancaster.stem(t) for t in tokens]


[u'New', u'rule', u'allow', u'Sikh', u'polic', u'offic', u'to', u'wear', u'turban', u'instead', u'of', u'tradit', u'polic', u'hat', u'have', u'been', u'introduc', u'in', u'New', u'York', u',', u'offici', u'say', u'.', u'The', u'New', u'York', u'Polic', u'Depart', u'said', u'the', u'turban', u'must', u'be', u'navi', u'blue', u'and', u'have', u'the', u'NYPD', u'insignia', u'attach', u'.', u'Under', u'the', u'new', u'rule', u',', u'religi', u'member', u'of', u'the', u'forc', u'are', u'also', u'permit', u'to', u'grow', u'beard', u'up', u'to', u'half', u'-', u'an', u'-', u'inch', u'long', u'.', u'Sikh', u'offic', u'have', u'until', u'now', u'worn', u'turban', u'under', u'their', u'cap', u'.', u'Beard', u'have', u'not', u'been', u'permit', u'.']



['new', 'rul', 'allow', 'sikh', 'pol', 'off', 'to', 'wear', 'turb', 'instead', 'of', 'tradit', 'pol', 'hat', 'hav', 'been', 'introduc', 'in', 'new', 'york', ',', 'off', 'say', '.', 'the', 'new', 'york', 'pol', 'depart', 'said', 'the', 'turb', 'must', 'be', 'navy', 'blu', 'and', 'hav', 'the', 'nypd', 'insign', 'attach', '.', 'und', 'the', 'new', 'rul', ',', u'religy', 'memb', 'of', 'the', 'forc', 'ar', 'also', 'permit', 'to', 'grow', 'beard', 'up', 'to', 'half', '-', 'an', '-', 'inch', 'long', '.', 'sikh', 'off', 'hav', 'until', 'now', 'worn', 'turb', 'und', 'their', 'cap', '.', 'beard', 'hav', 'not', 'been', 'permit', '.']

In [86]:
# Porter preserves upper case and returns unicode strings; it also tends to produce longer (less aggressive) stems than Lancaster
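
To back the "longer stems" impression with a number, a rough check (our own, not part of the exercise) is the average stem length each stemmer produces on the same tokens:

# higher average = less aggressive stemming
print sum(len(porter.stem(t)) for t in tokens) / len(tokens)
print sum(len(lancaster.stem(t)) for t in tokens) / len(tokens)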

Exercise 31)


In [88]:
saying = ['After', 'all', 'is', 'said', 'and', 'done', ',', 'more', 'is', 'said', 'than', 'done', '.']
lengths = []
for w in saying:
    lengths.append(len(w))
lengths


Out[88]:
[5, 3, 2, 4, 3, 4, 1, 4, 2, 4, 4, 4, 1]

Exercise 32)


In [89]:
silly = 'newly formed bland ideas are inexpressible in an infuriating way'
# a
bland = silly.split()
print bland


['newly', 'formed', 'bland', 'ideas', 'are', 'inexpressible', 'in', 'an', 'infuriating', 'way']

In [91]:
# b
''.join(w[1] for w in bland)


Out[91]:
'eoldrnnnna'

In [92]:
# c
' '.join(bland)


Out[92]:
'newly formed bland ideas are inexpressible in an infuriating way'

In [96]:
# d
for w in sorted(bland):
    print w


an
are
bland
formed
ideas
in
inexpressible
infuriating
newly
way

Exercise 33)


In [97]:
# a
'inexpressible'.index('re')


Out[97]:
5

In [98]:
# b
words = ['this', 'is', 'a', 'dull', 'list', 'of', 'words']
words.index('dull')


Out[98]:
3

In [99]:
# c
bland[:bland.index('in')]


Out[99]:
['newly', 'formed', 'bland', 'ideas', 'are', 'inexpressible']

Exercise 34)


In [106]:
def convertNationality(adjective):
    if (adjective.endswith('dian') or adjective.endswith('ese')):
        return adjective[:-3] + 'a'
    elif (adjective.endswith('ian')):
        return adjective[:-1]
        
print convertNationality('Canadian')   
print convertNationality('Australian')
print convertNationality('Chinese')


Canada
Australia
China

Exercise 35)


In [123]:
pronouns = ['I', 'you', 'he', 'she', 'it', 'we', 'they']
corpus = ' '.join(nltk.corpus.webtext.words())
sample1 = re.findall(r'[aA]s best as (?:%s) can' % '|'.join(pronouns), corpus)
print sample1[:10]
print len(sample1)
sample2 = re.findall(r'[aA]s best (?:%s) can' % '|'.join(pronouns), corpus)
print sample2[:10]
print len(sample2)


[]
0
[u'as best you can']
1

Exercise 36)


In [126]:
print ' '.join(nltk.corpus.genesis.words('lolcat.txt')[:500])


Oh hai . In teh beginnin Ceiling Cat maded teh skiez An da Urfs , but he did not eated dem . Da Urfs no had shapez An haded dark face , An Ceiling Cat rode invisible bike over teh waterz . An Ceiling Cat sayed light Day An dark no Day . It were FURST !!! 1 An Ceiling Cat sayed , i can has teh firmmint wich iz funny bibel naim 4 ceiling , so wuz teh twoth day . An Ceiling Cat called no waterz urth and waters oshun . Iz good . An so teh threeth day jazzhands . An so teh furth day w00t . An so teh ... fith day . Ceiling Cat taek a wile 2 cawnt . An Ceiling Cat doed moar living stuff , mooes , An creepies , An otehr animuls , An did not eated tehm . An Ceiling Cat sayed , letz us do peeps like uz , becuz we ish teh qte , An let min p0wnz0r becuz tehy has can openers . So Ceiling Cat createded teh peeps taht waz like him , can has can openers he maed tehm , min An womin wuz maeded , but he did not eated tehm . An Ceiling Cat sed them O hai maek bebehs kthx , An p0wn teh waterz , no waterz An teh firmmint , An evry stufs . For evry createded stufs tehre are the fuudz , to the burdies , teh creepiez , An teh mooes , so tehre . It happen . Iz good . An Ceiling Cat sayed , Beholdt , teh good enouf for releaze as version 0 . 8a . kthxbai . An teh skyz an teh Urfs wur finishd , an al teh stufz in dem , an Ceiling Cat was liek al tired an stuf . Ceiling Cat blesd teh 7f day , an sed itz teh h0liez0rz ; cuz dats when he restd fum all his werk wich Ceiling Cat creatd an maed . Yay holy Caturday ! Iz how teh skyz an Urfs wur maed , wen Ceiling Cat pwnt . Urfs no can has plantz n treez n catnipz yet , cuz Ceiling Cat no can maek rainz , but iz ok for kittehs DUNT LYKEZ wetfurz . An ther wuznt ne man to mek farmz n stuf ; cuz teh clowds wur al happie an dint feel liek cryin , wich wuz ok to cuz umbrellaz wuznt inventd yut . An Ceiling Cat madez kitteh owt ov teh flore dust , an breathd ntew his nawstrils teh bref ov life , wich wuz sorta liek doin cpr on a mudpie , but it wuz al gud . An Ceiling Cat madez evry tre dat iz prity , an gud fur fud ; teh tre ov lief wuz in teh gardun to , an teh tre ov teh nawlej ov gud an evul . man askd Ceiling Cat to makez a kooki tree ,

In [137]:
def lolcat(word):
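    # substitutions apply in order, so each rule sees the output of the previous ones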
    word = re.sub(r'ight', 'iet', word)
    word = re.sub(r'^I$', 'ai', word)
    word = re.sub(r'(?<=[^aeiouAEIOU])i$', 'ai', word)
    word = re.sub(r'le$', 'el', word)
    def helper(matchObj):
        return 'e' + matchObj.group(1)
    word = re.sub(r'([^aeiouAEIOU])e$', helper, word)
    word = re.sub(r'(?<=[^aeiouAEIOU])er$', 'ah', word)
    word = re.sub(r'ou', 'ow', word)
    word = re.sub(r'Ou', 'Ow', word)
    word = re.sub(r'(?<=[^aeiouAEIOU])y$', 'eh', word)
    word = re.sub(r'th', 'f', word)
    word = re.sub(r'Th', 'F', word)
    word = re.sub(r'ing$', 'in', word)
    return word    
print lolcat('I')
print lolcat('hi')
print lolcat('right')
print lolcat('kite')
print lolcat('like')
print lolcat('over')
print lolcat('loud')
print lolcat('kitty')
print lolcat('three')
print lolcat('nothing')
print lolcat('little')


ai
hai
riet
kiet
liek
ovah
lowd
kitteh
free
nofin
littel

Exercise 37)


In [138]:
help(re.sub)


Help on function sub in module re:

sub(pattern, repl, string, count=0, flags=0)
    Return the string obtained by replacing the leftmost
    non-overlapping occurrences of the pattern in string by the
    replacement repl.  repl can be either a string or a callable;
    if a string, backslash escapes in it are processed.  If it is
    a callable, it's passed the match object and must return
    a replacement string to be used.


In [140]:
def clean(html):
    # remove html tags:
    text = re.sub(r'\<.*?\>', '', html)
    # normalize whitespace:
    text = re.sub(r'\s+', ' ', text)
    return text
clean('<span class="some class">A span    which  should<br> be cleaned</span>')


Out[140]:
'A span which should be cleaned'

Exercise 38)


In [142]:
# a
text = 'some text with long-\nterm and encyclo-\npedia'
words = re.findall(r'\w+\-\n\w+', text)
words


Out[142]:
['long-\nterm', 'encyclo-\npedia']

In [143]:
# b
for w in words:
    print re.sub('\n', '', w)


long-term
encyclo-pedia

In [145]:
# c
wordlist = set(nltk.corpus.words.words())   # build the lexicon once for fast membership tests
for w in words:
    word = re.sub('\n', '', w)
    parts = word.lower().split('-')
    if (parts[0] not in wordlist and parts[1] not in wordlist):
        print re.sub('\-', '', word)
    else:
        print word


long-term
encyclopedia

Exercise 39)


In [148]:
def soundex(name):
    first = name[0]
    # remove w & h
    encoded = first.lower() + re.sub('[wh]', '', name[1:].lower())
    # replace consonants with numbers
    encoded = re.sub(r'[bfpv]', '1', encoded)
    encoded = re.sub(r'[cgjkqsxz]', '2', encoded)
    encoded = re.sub(r'[dt]', '3', encoded)
    encoded = re.sub(r'l', '4', encoded)
    encoded = re.sub(r'[mn]', '5', encoded)
    encoded = re.sub(r'r', '6', encoded)
    # merge adjacent same digits into one
    count = 1
    while count < 7:
        encoded = re.sub(str(count) + '{2,}', str(count), encoded)
        count += 1
    # remove vowels
    encoded = encoded[0].upper() + re.sub('[aeiouy]', '', encoded[1:])
    # if first character is digit, replace it with the saved letter
    if (encoded[0].isdigit()):
        encoded = first.upper() + encoded[1:]
    # encoded must contain 3 digits -> fill it up with zeros if too short    
    if (len(encoded) < 4):
        encoded += '000'
    return encoded[:4]    
    
print soundex('Robert') #R163
print soundex('Rupert') #R163
print soundex('Rubin') #R150
print soundex('Ashcraft') #A261
print soundex('Ashcroft') #A261
print soundex('Tymczak') #T522 
print soundex('Pfister') #P236


R163
R163
R150
A261
A261
T522
P236

Exercise 40)


In [150]:
def ari(raw):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sents = sent_tokenizer.tokenize(raw)
    words = nltk.word_tokenize(raw)
    av_wordlength = sum(len(w) for w in words) / len(words)
    av_sentlength = sum(len(s) for s in sents) / len(sents)
    return (4.71 * av_wordlength) + (0.5 * av_sentlength) - 21.43
print ari(nltk.corpus.abc.raw("rural.txt"))
print ari(nltk.corpus.abc.raw("science.txt"))


68.5700668936
68.8154763376
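
These scores are implausibly high because the punkt tokenizer returns each sentence as a string, so len(s) counts characters per sentence rather than words. A sketch of the repaired version (word-tokenizing each sentence before measuring its length):

def ari_fixed(raw):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sents = [nltk.word_tokenize(s) for s in sent_tokenizer.tokenize(raw)]
    words = [w for s in sents for w in s]
    av_wordlength = sum(len(w) for w in words) / len(words)
    av_sentlength = sum(len(s) for s in sents) / len(sents)
    return (4.71 * av_wordlength) + (0.5 * av_sentlength) - 21.43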

Exercise 41)


In [152]:
words = ['attribution', 'confabulation', 'elocution', 'sequoia', 'tenacious', 'unidirectional']
# more elegant with a regular expression than with a nested list comprehension:
vsequences = set([''.join(re.findall(r'[aeiou]', word)) for word in words])
sorted(vsequences)


Out[152]:
['aiuio', 'eaiou', 'eouio', 'euoia', 'oauaio', 'uiieioa']

In [153]:
# nested list comprehension:
vsequences = set([''.join([char for char in word if char in 'aeiou']) for word in words])
sorted(vsequences)


Out[153]:
['aiuio', 'eaiou', 'eouio', 'euoia', 'oauaio', 'uiieioa']

Exercise 42)


In [163]:
from nltk.corpus import wordnet as wn
class IndexedText(object):
    def __init__(self, stemmer, text):
        self._text = text
        self._stemmer = stemmer
        self._index = nltk.Index((self._stem(word), i)
            for (i, word) in enumerate(text))

    def concordance(self, word, width=40):
        key = self._stem(word)
        wc = int(width/4) # words of context
        for i in self._index[key]:
            lcontext = ' '.join(self._text[i-wc:i])
            rcontext = ' '.join(self._text[i:i+wc])
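            # NB: wn.synsets(...)[0] assumes the token has at least one synset
            # and simply takes the first-listed (most frequent) sense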
            offset = '(WordNet Offset: ' + str(wn.synsets(self._text[i])[0].offset()) + ')'
            ldisplay = '%*s' % (width, lcontext[-width:])
            rdisplay = '%-*s' % (width, rcontext[:width])
            print ldisplay, rdisplay, offset
                
    def _stem(self, word):
        return self._stemmer.stem(word).lower()

porter = nltk.PorterStemmer()
grail = nltk.corpus.webtext.words('grail.txt')
text = IndexedText(porter, grail)
text.concordance('lie')


r king ! DENNIS : Listen , strange women lying in ponds distributing swords is no (WordNet Offset: 751944)
 beat a very brave retreat . ROBIN : All lies ! MINSTREL : [ singing ] Bravest of (WordNet Offset: 6756831)
       Nay . Nay . Come . Come . You may lie here . Oh , but you are wounded !    (WordNet Offset: 6756831)
doctors immediately ! No , no , please ! Lie down . [ clap clap ] PIGLET : Well   (WordNet Offset: 6756831)
ere is much danger , for beyond the cave lies the Gorge of Eternal Peril , which  (WordNet Offset: 6756831)
   you . Oh ... TIM : To the north there lies a cave -- the cave of Caerbannog -- (WordNet Offset: 6756831)
h it and lived ! Bones of full fifty men lie strewn about its lair . So , brave k (WordNet Offset: 6756831)
not stop our fight ' til each one of you lies dead , and the Holy Grail returns t (WordNet Offset: 6756831)

Exercise 43)


In [181]:
def guessLanguage(text):
    tokens = nltk.wordpunct_tokenize(text)
    text = nltk.Text(tokens)
    fdist_text = nltk.FreqDist(text)
    best_guess = ('', 0)
    for lang in nltk.corpus.udhr.fileids():
        if (lang.endswith('-Latin1')):
            fdist_lang = nltk.FreqDist(nltk.corpus.udhr.words(lang))
            intersection = list(set(fdist_text.keys()) & set(fdist_lang.keys()))
            dict_text = []
            dict_lang = []
            for word in intersection:
                dict_text.append((word, fdist_text[word]))
                dict_lang.append((word, fdist_lang[word]))
            spearman = nltk.spearman_correlation(dict_text, dict_lang)
            if (spearman != 0.0 and (best_guess[1] == 0 or spearman > best_guess[1])):
                best_guess = (lang[:-7], spearman)
    return best_guess[0]

help(nltk.spearman_correlation)
print guessLanguage('This is clearly an example of English text which should not be hard to recognize.')
print guessLanguage(u'Carapax (von gr. charax „Befestigungsanlage“, „Palisade“ und pagios „fest“; Plural: Carapaces) ist eine Bezeichnung für eine bei verschiedenen Tiergruppen (Taxa) unabhängig voneinander entstandene harte Bedeckung der Körperoberseite. Bei Schildkröten heißt der Carapax gemeinsprachlich Rückenschild oder Rückenpanzer, bei Krustentieren (Krebstieren in der Küche) ist er ein Teil der „Schale“. Viele Krebstiere (Crustacea) besitzen eine Hautfalte, die vom Kopfhinterrand (Segment der 2. Maxille) ausgeht; diese kann auch primär (z. B. Cephalocarida) oder sekundär (z. B. Asseln und Flohkrebse) fehlen, gehört also nicht zum Grundbauplan der Krebstiere. Vielfach ist die chitinöse Kopffalte durch eingelagerten Kalk panzerartig versteift, vor allem bei vielen Zehnfußkrebsen. Bedeckt diese Struktur als Rückenschild einige oder ggf. alle Rumpfsegmente, wird sie Carapax genannt. Der Carapax schließt also an den Kopf an, setzt sich über dessen Hinterrand hinaus fort und erstreckt sich mehr oder weniger weit über den Rumpf des Krebses. Je nach Ausbildung kann er auch den Kopf selbst umhüllen (z. B. bei den Muschelkrebsen) und mehr oder weniger weit auch seitlich herabgezogen sein.  – Zum Artikel …')


Help on function spearman_correlation in module nltk.metrics.spearman:

spearman_correlation(ranks1, ranks2)
    Returns the Spearman correlation coefficient for two rankings, which
    should be dicts or sequences of (key, rank). The coefficient ranges from
    -1.0 (ranks are opposite) to 1.0 (ranks are identical), and is only
    calculated for keys in both rankings (for meaningful results, remove keys
    present in only one list before ranking).

English
German_Deutsch
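
Strictly speaking, spearman_correlation expects (key, rank) pairs, while the code above hands it raw frequencies. A hedged refinement (the helper ranked is ours) would convert each distribution to ranks over the shared vocabulary first:

def ranked(fdist, shared):
    # rank words by corpus frequency, restricted to the shared vocabulary
    common = [w for (w, _) in fdist.most_common() if w in shared]
    return [(w, rank) for (rank, w) in enumerate(common)]

# then: nltk.spearman_correlation(ranked(fdist_text, intersection), ranked(fdist_lang, intersection))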

In [182]:
print guessLanguage(u'Dødsstraf eller livsstraf er henrettelse som straf for en forbrydelse. I de jurisdiktioner, der praktiserer dødsstraf, er den som regel forbeholdt et lille antal alvorlige forbrydelser, ofte overlagt mord og landsforræderi. I Kina praktiseres tillige dødsstraf for økonomisk kriminalitet og narkokriminalitet, og i Iran for homoseksualitet, ligesom der i visse områder kontrolleret af islamiske oprørsbevægelser gennemføres henrettelser baseret på en streng fortolkning af sharia. Mange lande har dødsstraf i den militære straffelov eller for forbrydelser begået i krigstid. I Danmark blev dødsstraf første gang afskaffet i den borgerlige straffelov den 15. april 1930. Loven trådte i kraft 15. april 1933. Dødsstraf blev på dette tidspunkt beholdt i den militære straffelov. I forbindelse med retsopgøret efter 2. verdenskrig genindførtes dødsstraffen (som kaldtes livsstraf) i 1945 for forbrydelser begået under besættelsen. Loven var en særlov og kendes som Landsforræderloven eller retteligen Straffelovstillægget og havde tilbagevirkende kraft for handlinger begået efter 9. april 1940. 46 personer blev på den baggrund henrettet af frivillige politifolk. Den 20. juli 1950 kl. 01:00 blev Ib Birkedal Hansen henrettet som den sidste i Danmark. (Læs mere..)')


Danish_Dansk

Exercise 44)


In [189]:
def novel_sense(word, text):
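    # for each occurrence of `word`, average the WordNet path similarity between
    # its senses and the senses of up to 20 content words on either side; the
    # occurrence with the lowest average similarity is reported as the oddest use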
    content_words = []
    stopwords = nltk.corpus.stopwords.words('english')
    count = 0
    for w in text:
        if (w.isalpha() and w not in stopwords):
            content_words.append((w, count))
        count += 1    
    count = 0
    oddest = False
    for w in content_words:
        if (w[0] == word):
            count_comparisons = 0
            overall_sim = 0
            for synset in wn.synsets(w[0]):
                # compare to words in context on left side:
                for index in range(1, min(21, count+1)):
                    context_word = content_words[count - index][0]
                    for context_synset in wn.synsets(context_word):
                        path_sim = synset.path_similarity(context_synset)
                        if (path_sim != None):
                            overall_sim += path_sim 
                            count_comparisons += 1
                # compare to words in context on right side:            
                for index in range(1, min(21, len(content_words)-count-1)):
                    context_word = content_words[count + index][0]
                    for context_synset in wn.synsets(context_word):
                        path_sim = synset.path_similarity(context_synset)
                        if (path_sim != None):
                            overall_sim += path_sim
                            count_comparisons += 1
            av_sim = overall_sim / count_comparisons
            if (oddest == False or oddest[1] > av_sim):
                oddest = (w[1], av_sim) # w[1] = original index of the word in the text
        count += 1
    if (oddest != False):    
        print text[max(0, oddest[0] - 50):min(oddest[0] + 50, len(text))]
        print 'Average Similarity: ', str(oddest[1])

novel_sense('love', nltk.corpus.gutenberg.words('austen-emma.txt'))


[u'Jane', u'Fairfax', u'therefore', u'that', u'he', u'would', u'have', u'preferred', u'the', u'society', u'of', u'William', u'Larkins', u'.', u'No', u'!--', u'she', u'was', u'more', u'and', u'more', u'convinced', u'that', u'Mrs', u'.', u'Weston', u'was', u'quite', u'mistaken', u'in', u'that', u'surmise', u'.', u'There', u'was', u'a', u'great', u'deal', u'of', u'friendly', u'and', u'of', u'compassionate', u'attachment', u'on', u'his', u'side', u'--', u'but', u'no', u'love', u'.', u'Alas', u'!', u'there', u'was', u'soon', u'no', u'leisure', u'for', u'quarrelling', u'with', u'Mr', u'.', u'Knightley', u'.', u'Two', u'days', u'of', u'joyful', u'security', u'were', u'immediately', u'followed', u'by', u'the', u'over', u'-', u'throw', u'of', u'every', u'thing', u'.', u'A', u'letter', u'arrived', u'from', u'Mr', u'.', u'Churchill', u'to', u'urge', u'his', u'nephew', u"'", u's', u'instant', u'return', u'.', u'Mrs']
Average Similarity:  0.12389011066