In [89]:
#exported directly from the IPython Notebook

#1 If you've done Excel, then you've done functional programming
#  Port the Excel work into a Python script

line1 = range(1, 9)
line2 = range(10, 50, 5)
#print to check that what we have matches the Excel data
print(line1, line2)

total = sum(line1) + sum(line2)

#has any state changed?
print(line1, line2)

#or
print(sum(range(1,9)) + sum(range(10,50,5)))

#has any state changed?
print(line1, line2)

#or 
from itertools import chain
print(sum(chain(range(1,9), range(10,50,5))))

#has any state changed?
print(line1, line2)
 
#2 discussion of the pros and cons of the above code; keep these principles in mind for the following (a small sketch is below)
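
#(a small illustrative sketch, not from the original notebook: the imperative
# version below mutates a running total, so there is intermediate state to track;
# the expression version computes the same value with nothing to mutate)
def imperative_total(rows):
    total = 0
    for row in rows:
        for cell in row:
            total += cell          # state changes on every iteration
    return total

def functional_total(rows):
    # a single expression: no rebinding or mutation along the way
    return sum(sum(row) for row in rows)

print(imperative_total([range(1, 9), range(10, 50, 5)]))   # 256
print(functional_total([range(1, 9), range(10, 50, 5)]))   # 256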


#3 brief overview of generators and comprehensions
#  intro to iterators terminology, list comps, and generators
iterable = range(20)
#the for loop pulls an iterator from the iterable; i is each item it produces
for i in iterable:
    print(i*2)

alpha_iter = "abcdefghijksmellonop"
for i in alpha_iter:
    print(i*2)

#but if we want it in a data structure that we can manipulate further
def get_double_letters(ltrs):
    doubles = list()
    for i in ltrs:
        doubles.append(i*2)
    return doubles

#list comprehensions
double_letters = [ltr*2 for ltr in alpha_iter]

#generator function
def gen_double_letters(ltrs):
    for i in ltrs:
        yield i*2
 
#won't work, by the way: yield is only legal inside a function body (SyntaxError here)
#for i in alpha_iter:
#    yield i*2        

print(get_double_letters(alpha_iter))
print(gen_double_letters(alpha_iter))

gen_object = gen_double_letters(alpha_iter)

#next() advances the generator one item at a time
next(gen_object)
next(gen_object)

#list() consumes whatever the generator has left
list(gen_object)

#gen expressions! write them backwards at first
def gen_dubs(ltrs):
    for i in (ltr*2 for ltr in ltrs):
        yield i
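
#quick sanity check (not in the original notebook): the list comprehension, the
#generator function, and the generator-expression version all produce the same items
assert list(gen_dubs(alpha_iter)) == double_letters
assert list(gen_double_letters(alpha_iter)) == double_letters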

#4 basic application: compare text-cleaning methods on larger text corpora converted into iterables for processing
#let's apply this to our sample text 

#just a quick test that we are in the proper directory with the proper files
with open("obama_09.txt") as fp:
    o_09 = fp.read()
with open("obama_14.txt") as fp:
    o_14 = fp.read()

#taken from an augmented nltk list and experience
stop_words = [' ', " '", '!', '"', '#', '&', "'", "'re", "'s", '(', ')', '*', '+', ',', '-', '--', '.', '...', '/', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '``', 'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'amp', 'an', 'and', 'any', 'are', 'as', 'at', 'b', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'c', 'can', 'd', 'did', 'do', 'does', 'doing', 'don', 'down', 'during', 'e', 'each', 'f', 'few', 'for', 'from', 'further', 'g', 'h', 'had', 'has', 'have', 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'itself', 'j', 'just', 'k', 'l', 'm', 'me', 'more', 'most', 'must', 'my', 'myself', 'n', "n't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'p', 'q', 'r', 'raquo', 's', 'same', 'she', 'should', 'so', 'some', 'such', 't', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'u', 'under', 'until', 'up', 'v', 'very', 'w', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'x', 'y', 'you', 'your', 'yours', 'yourself', 'yourselves', 'z', '|']
support_words = ['about', 'above', 'after', 'again', 'against', 'all', 'am', 'amp', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'did', 'do', 'does', 'doing', 'don', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'if', 'in', 'into', 'is', 'it', 'its', 'itself', 'just', 'me', 'more', 'most', 'must', 'my', 'myself', "n't", 'no', 'nor', 'not', 'now', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'raquo', 'same', 'she', 'should', 'so', 'some', 'such', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'you', 'your', 'yours', 'yourself', 'yourselves']


def imp_clean_text(text, remove_words=(stop_words, support_words)):
    with open(text) as fp:
        full_text = fp.read()
    full_text_ls = full_text.split()
    clean_text = list()
    stops, supports = remove_words
    for word in full_text_ls:
        if word not in stops and word not in supports:
            clean_text.append(word)
    return clean_text

#what the code will look like without introducing the concepts first
def func_clean_text(text, remove_words=(stop_words, support_words)):
    with open(text) as fp:
        full = fp.read()
    stops, supports = remove_words
    for i in (word for word in full.split() if word not in stops and word not in supports):
        yield i

imp_14 = imp_clean_text("obama_14.txt", remove_words=(stop_words, support_words))
func_14 = func_clean_text("obama_14.txt", remove_words=(stop_words, support_words))
print(len(imp_14))
print(len(list(func_14)))

#what will this give us, given what we remember about generators?
print(imp_14 == list(func_14))

#how about this?
print(imp_14 == list(func_clean_text("obama_14.txt", remove_words=(stop_words, support_words))))
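
#the False/True pair above comes down to generator exhaustion; a minimal
#illustration (names here are illustrative, not from the original notebook):
squares = (n*n for n in range(3))
print(list(squares))    # [0, 1, 4] -- the first pass consumes the generator
print(list(squares))    # []        -- nothing is left for a second pass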

#use the timeit module in the plain Python interpreter
#the %timeit magic command below is only for IPython users
#%timeit imp_clean_text("obama_09.txt")
#%timeit func_clean_text("obama_09.txt")
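
#a rough sketch of the same timing with the standard-library timeit module
#(number=10 is an arbitrary choice; func_clean_text is wrapped in list() so the
# generator is actually consumed during the measurement)
import timeit
print(timeit.timeit('imp_clean_text("obama_09.txt")',
                    setup='from __main__ import imp_clean_text', number=10))
print(timeit.timeit('list(func_clean_text("obama_09.txt"))',
                    setup='from __main__ import func_clean_text', number=10))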

        
        

#5 deeper dive application: cleaning and manipulating raw text data into consumable information for the end user
#quick itertools intro and begin text processing

#everything in itertools returns a lazy iterator, so it composes nicely with generators!

from itertools import chain
def func_clean_text(text, remove_words=(stop_words, support_words)):
    with open(text) as fp:
        full = fp.read()
    # chain() is lazy; a fresh chain is built (and scanned) for every word -- fine for a demo
    for i in (word for word in full.split() if word not in chain(*remove_words)):
        yield i


#heavier duty gen expressions

def gen_tokens(text):
    """ str -> gen 
    read the file and yield its whitespace-delimited tokens one at a time
    """
    with open(text) as fp:
        full = fp.read()
    for word in full.split():
        yield word

def gen_cleaner_words(text):
    """ str -> gen 
    strips punctuation marks that commonly 'attach' to words (trailing ',', '.', etc.);
    note that this function does not remove any words from the iterable
    """
    for i in gen_tokens(text):
        # Python 2 str.translate(None, chars) deletes every character in chars
        yield i.translate(None, "!@#$%^&*().,[]+=-_`~<>?:;")
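
#quick check of what translate is doing (Python 2 str.translate(None, chars)
#deletes every character in chars; this example is illustrative only)
print("(hello!),".translate(None, "!@#$%^&*().,[]+=-_`~<>?:;"))   # -> hello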

def gen_lowercase_tokens(text):
    """ str -> gen 
    maintain generator type for lowercase words, 
    useful for quick comparisons against certain stop word lists
    """
    for word in gen_tokens(text):
        yield word.lower()

def gen_ns_words(text):
    """ str -> gen 
    yield the non-stop-words; stop words need to be removed before support words
    """
    for i in (word for word in gen_cleaner_words(text) if word.lower() not in stop_words):
        yield i

def gen_nsup_words_only(text):
    """str -> gen
    generate words not in support words, mostly a utility function
    """
    for i in (word for word in gen_tokens(text) if word.lower() not in support_words):
        yield i
        
#note: for this module's purposes the stop word lists are assumed to be lowercase, hence the word.lower() comparisons
def gen_ns_and_nsup_words(text):
    """ str -> gen 
    generate words not in stop or support words list 
    """
    for i in (word for word in gen_ns_words(text) if word.lower() not in support_words):
        yield i
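
#a small sketch of how these generators compose: nothing is opened, read, or
#filtered until something iterates over the outermost generator
from itertools import islice
pipeline = gen_ns_and_nsup_words("obama_14.txt")   # no file I/O has happened yet
print(list(islice(pipeline, 5)))                   # pulling 5 items drives every stage lazily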

#Analysis functions
def get_text_len_and_content_len(text):
    """ str -> float, float
    return length of text list with stop_words omitted and length of 'content'
    defined as non 'support words' """ 
    num_ns_tokens = float(len(list(gen_ns_words(text))))
    num_content_tokens = float(len(list(gen_ns_and_nsup_words(text))))
    return (num_ns_tokens, num_content_tokens)

def get_content_percentage(text):
    """ str -> float
    (number of tokens with stop words and support words removed) / (total number of tokens).
    Note the difference between this ratio and the lexical diversity ratio,
    which only counts unique words.
    """  
    try: 
        result = float(len(list(gen_ns_and_nsup_words(text)))) / float(len(list(gen_tokens(text))))
        result = round(result, 2)
    except ZeroDivisionError:
        result = 0
    return result

#print(list(chain(stop_words, support_words)))


#test by print statement
f1 = func_clean_text("obama_14.txt")
#print(list(f1))
f2 = gen_ns_and_nsup_words("obama_14.txt")
#print(list(f2))

#print "up" in f1


([1, 2, 3, 4, 5, 6, 7, 8], [10, 15, 20, 25, 30, 35, 40, 45])
([1, 2, 3, 4, 5, 6, 7, 8], [10, 15, 20, 25, 30, 35, 40, 45])
256
([1, 2, 3, 4, 5, 6, 7, 8], [10, 15, 20, 25, 30, 35, 40, 45])
256
([1, 2, 3, 4, 5, 6, 7, 8], [10, 15, 20, 25, 30, 35, 40, 45])
0
2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
32
34
36
38
aa
bb
cc
dd
ee
ff
gg
hh
ii
jj
kk
ss
mm
ee
ll
ll
oo
nn
oo
pp
['aa', 'bb', 'cc', 'dd', 'ee', 'ff', 'gg', 'hh', 'ii', 'jj', 'kk', 'ss', 'mm', 'ee', 'll', 'll', 'oo', 'nn', 'oo', 'pp']
<generator object gen_double_letters at 0x1056fedc0>
4319
4319
False
True

In [7]:
get_text_len_and_content_len("obama_09.txt")


Out[7]:
(3200.0, 3200.0)

In [16]:
print len(o_09.split())
print len(list(gen_ns_words("obama_09.txt")))
print len(list(gen_ns_and_nsup_words("obama_09.txt")))


6260
3200
3200

In [13]:
get_content_percentage("obama_09.txt")


Out[13]:
0.51

In [14]:
get_text_len_and_content_len("obama_09.txt")


Out[14]:
(3200.0, 3200.0)

In [19]:
print len(list(gen_cleaner_words("obama_09.txt")))


6260

In [23]:
#test each function individually, e.g. gen_cleaner_words (the punctuation stripper)

In [24]:
print support_words


['about', 'above', 'after', 'again', 'against', 'all', 'am', 'amp', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'did', 'do', 'does', 'doing', 'don', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'if', 'in', 'into', 'is', 'it', 'its', 'itself', 'just', 'me', 'more', 'most', 'must', 'my', 'myself', "n't", 'no', 'nor', 'not', 'now', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'raquo', 'same', 'she', 'should', 'so', 'some', 'such', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'you', 'your', 'yours', 'yourself', 'yourselves']

In [26]:
def gen_nsup_words_only(text):
    """str -> gen
    generate words not in support words, mostly a utility function
    """
    for i in (word for word in gen_tokens(text) if word.lower() not in support_words):
        yield i

In [27]:
gen_nsup_words_only("obama_09.txt")


Out[27]:
<generator object gen_nsup_words_only at 0x1056d58c0>

In [28]:
len(list(gen_nsup_words_only("obama_09.txt")))


Out[28]:
3502

In [29]:
len(list(gen_ns_words("obama_09.txt")))


Out[29]:
3194

In [31]:
len(list(gen_ns_and_nsup_words("obama_09.txt")))


Out[31]:
3194

In [32]:
get_text_len_and_content_len("obama_09.txt")


Out[32]:
(3194.0, 3194.0)

In [35]:
from collections import Counter

In [41]:
ctr_obama_09 = Counter(gen_ns_and_nsup_words("obama_09.txt"))
ctr_obama_14 = Counter(gen_ns_and_nsup_words("obama_14.txt"))

In [68]:
#print(ctr_obama_09)
ctr_obama_09?

In [45]:
#now for a bit more comparative analysis with a basic data type

In [47]:
set_obama_09 = set(gen_ns_and_nsup_words("obama_09.txt"))
set_obama_14 = set(gen_ns_and_nsup_words("obama_14.txt"))

In [63]:
print len(list(gen_tokens("obama_09.txt")))
print len(set_obama_09)


6260
1436

In [64]:
print len(list(gen_tokens("obama_14.txt")))
print len(set_obama_14)


7226
1821

In [65]:
#NLP metric a la George Lakoff
def get_rough_keyword_ratio(text):
    return float(len(set(gen_ns_and_nsup_words(text)))) / float(len(list(gen_tokens(text)))) * 100

In [66]:
get_rough_keyword_ratio("obama_09.txt")


Out[66]:
22.93929712460064

In [67]:
get_rough_keyword_ratio("obama_14.txt")


Out[67]:
25.200664267921397

In [69]:
#a little fun with itertools
from itertools import *

In [71]:
ifilter(len() > 10, gen_ns_and_nsup_words("obama_09.txt"))


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-71-db60b2f69871> in <module>()
----> 1 ifilter(len() > 10, gen_ns_and_nsup_words("obama_09.txt"))

TypeError: len() takes exactly one argument (0 given)
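
#the TypeError above comes from calling len() with no argument; ifilter needs a
#callable to apply to each item -- a corrected sketch (not run in the original session):
from itertools import ifilter   # Python 2; in Python 3 use the built-in filter
long_words = ifilter(lambda w: len(w) > 10, gen_ns_and_nsup_words("obama_09.txt"))
print(list(long_words)[:10])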

In [76]:
#ctr_obama_09[:10]  raises a TypeError -- Counter objects can't be sliced
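
#a Counter can't be sliced, but most_common() gives the top entries directly
#(a sketch; the output was not recorded in the original session)
print(ctr_obama_09.most_common(10))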

In [77]:
#islice grabs the first 10 keys the Counter happens to yield (arbitrary order)
for i in islice(ctr_obama_09, 10):
    print(i)


personally
four
earmarks
hanging
saved
worked
whose
fouryear
teaching

In [83]:
#frozenset(ctr_obama_09)

In [87]:
#funny pairs a la Luis von Ahn's CAPTCHA?
#print(list(izip(set_obama_09, set_obama_14)))

In [88]:
#well, it's hard to apply most of these itertools recipes to text; they're mostly meant for numbers
#see http://pymotw.com/2/itertools/


  File "<ipython-input-88-e0c5541d846b>", line 2
    http://pymotw.com/2/itertools/
        ^
SyntaxError: invalid syntax

In [ ]: