In [84]:
def freq(word, doc):
    return doc.count(word)
 
 
def word_count(doc):
    return len(doc.split())
 
 
def tf(word, doc):
    return ( freq(word, doc) / float(word_count(doc)) ) 
 
 
def num_docs_containing(word, list_of_docs):
    count = 0
    for document in list_of_docs:
        if freq(word, document) > 0:
            count += 1
    return 1 + count
 
 
def idf(word, list_of_docs):
    num = len(list_of_docs)
    #print num
    den = float( num_docs_containing(word, list_of_docs) )
    #print den
    return math.log( num / den )

In [91]:
d0 = """Python is a 2000 made-for-TV horror movie directed by Richard
Clabaugh. The film features several cult favorite actors, including William
Zabka of The Karate Kid fame, Wil Wheaton, Casper Van Dien, Jenny McCarthy,
Keith Coogan, Robert Englund (best known for his role as Freddy Krueger in the
A Nightmare on Elm Street series of films), Dana Barron, David Bowe, and Sean
Whalen. The film concerns a genetically engineered snake, a python, that
escapes and unleashes itself on a small town. It includes the classic final
girl scenario evident in films like Friday the 13th. It was filmed in Los Angeles,
 California and Malibu, California. Python was followed by two sequels: Python
 II (2002) and Boa vs. Python (2004), both also made-for-TV films."""

d1 = """Python is a genus of
nonvenomous pythons found in Africa and Asia. Currently, 7 species are
recognised. A member of this genus, P. reticulatus, is among the longest
snakes known. This snake is also very scary and I never want to see one up close. 
Which is scariest, pythons, sharks or ninjas?"""

d2 = """The Colt Python is a .357 Magnum caliber revolver formerly
manufactured by Colt's Manufacturing Company of Hartford, Connecticut.
It is sometimes referred to as a "Combat Magnum". It was first introduced
in 1955, the same year as Smith & Wesson's M29 .44 Magnum. The now discontinued
Colt Python targeted the premium revolver market segment. Some firearm
collectors and writers such as Jeff Cooper, Ian V. Hogg, Chuck Hawks, Leroy
Thompson, Renee Smeets and Martin Dougherty have described the Python as the
finest production revolver ever made."""

#d3 = """I am a truly small text"""
d3 = """The fossil record of snakes is relatively poor because snake skeletons 
are typically small and fragile making fossilization uncommon. Fossils readily 
identifiable as snakes (though often retaining hind limbs) first appear in the 
fossil record during the Cretaceous period. The earliest known true snake 
fossils (members of the crown group Serpentes) come from the marine simoliophiids, 
the oldest of which is the Late Cretaceous (Cenomanian age) Haasiophis terrasanctus,
dated to between 112 and 94 million years old. Based on comparative anatomy, there 
is consensus that snakes descended from lizards. Pythons and boas—primitive 
groups among modern snakes—have vestigial hind limbs: tiny, clawed digits known as anal 
spurs, which are used to grasp during mating. The families Leptotyphlopidae 
and Typhlopidae also possess remnants of the pelvic girdle, appearing as horny projections 
when visible."""

d4 = """The potato is a starchy, tuberous crop from the perennial nightshade 
Solanum tuberosum L. The word "potato" may refer either to the plant itself 
or to the edible tuber. In the Andes, where the species is indigenous, there
are some other closely related cultivated potato species. Potatoes were introduced
outside the Andes region approximately four centuries ago, and have since 
become an integral part of much of the world's food supply. It is the world's 
fourth-largest food crop, following maize, wheat, and rice."""

corpus = [d0,d1,d2,d3,d4]

def all_freq(word):
    for i,doc in enumerate(corpus):
        print 'document',i, 'has the word "'+word+'":\t', freq(word,doc),'times out of',word_count(doc)
        
def all_tf(word):
    for i,doc in enumerate(corpus):
        print 'document',i, 'has TF for word "'+word+'":\t', tf(word,doc)

In [92]:
word = 'Python'
#all_freq(word)
print ''
all_tf(word)
print ''
print word,'idf 3 docs:\t',idf(word,corpus[:3]),'\tidf all docs:',idf(word,corpus)
print '\n'

word = 'the'
#all_freq(word)
print ''
all_tf(word)
print ''
print word,'idf 3 docs:\t',idf(word,corpus[:3]),'\tidf all docs:',idf(word,corpus)
print '\n'

word = 'truly'
#all_freq(word)
print ''
all_tf(word)
print ''
print word,'idf 3 docs:\t',idf(word,corpus[:3]),'\tidf all docs:',idf(word,corpus)
print '\n'


document 0 has TF for word "Python":	0.0327868852459
document 1 has TF for word "Python":	0.0192307692308
document 2 has TF for word "Python":	0.0348837209302
document 3 has TF for word "Python":	0.00763358778626
document 4 has TF for word "Python":	0.0

Python idf 3 docs:	-0.287682072452 	idf all docs: 0.0



document 0 has TF for word "the":	0.0245901639344
document 1 has TF for word "the":	0.0192307692308
document 2 has TF for word "the":	0.046511627907
document 3 has TF for word "the":	0.0610687022901
document 4 has TF for word "the":	0.132530120482

the idf 3 docs:	-0.287682072452 	idf all docs: -0.182321556794



document 0 has TF for word "truly":	0.0
document 1 has TF for word "truly":	0.0
document 2 has TF for word "truly":	0.0
document 3 has TF for word "truly":	0.0
document 4 has TF for word "truly":	0.0

truly idf 3 docs:	1.09861228867 	idf all docs: 1.60943791243



In [93]:
def treat(doc):
    doc = doc.lower()
    doc = doc.replace('.','')
    doc = doc.replace('-', ' ')
    doc = doc.replace(',','')
    doc = doc.replace('?','')
    return doc

In [94]:
from operator import itemgetter

res = {}
for doc in corpus:
    doc = treat(doc)
    for word in doc.split():
        word_idf = idf(word, corpus)
        word_tf = tf(word, doc)
        res[word] = word_idf * word_tf
final = sorted(res.items(), key=itemgetter(1))

for xx in final[::-1]:
    print xx


(u'magnum', 0.05614318299188722)
(u'colt', 0.05614318299188722)
(u'potato', 0.04363289199400738)
(u'film', 0.04363289199400738)
(u'andes', 0.03831995029605)
(u'this', 0.03524195122592905)
(u'genus', 0.03524195122592905)
(u'fossil', 0.03497292869748684)
(u'tuber', 0.03272466899550554)
(u'revolver', 0.03196363018165657)
(u'africa', 0.03095072908527116)
(u'asia', 0.03095072908527116)
(u'currently', 0.03095072908527116)
(u'wil', 0.025546633530700004)
(u'california', 0.025546633530700004)
(u'tv', 0.025546633530700004)
(u'cretaceous', 0.024571571182200002)
(u"world's", 0.02181644599700369)
(u'crop', 0.02181644599700369)
(u'films', 0.02181644599700369)
(u'four', 0.02181644599700369)
(u'food', 0.02181644599700369)
(u'for', 0.020270858085952012)
(u'solanum', 0.019159975148025)
(u'potatoes', 0.019159975148025)
(u'"combat', 0.01871439433062907)
(u"colt's", 0.01871439433062907)
(u"wesson's", 0.01871439433062907)
(u'leroy', 0.01871439433062907)
(u'smeets', 0.01871439433062907)
(u'dougherty', 0.01871439433062907)
(u'hartford', 0.01871439433062907)
(u'connecticut', 0.01871439433062907)
(u'company', 0.01871439433062907)
(u'jeff', 0.01871439433062907)
(u'hawks', 0.01871439433062907)
(u'm29', 0.01871439433062907)
(u'renee', 0.01871439433062907)
(u'manufacturing', 0.01871439433062907)
(u'cooper', 0.01871439433062907)
(u'martin', 0.01871439433062907)
(u'chuck', 0.01871439433062907)
(u'thompson', 0.01871439433062907)
(u'smith', 0.01871439433062907)
(u'hogg', 0.01871439433062907)
(u'magnum"', 0.01871439433062907)
(u'python', 0.017819498503464794)
(u'longest', 0.017620975612964523)
(u'very', 0.017620975612964523)
(u'want', 0.017620975612964523)
(u'nonvenomous', 0.017620975612964523)
(u'scary', 0.017620975612964523)
(u'scariest', 0.017620975612964523)
(u'recognised', 0.017620975612964523)
(u'one', 0.017620975612964523)
(u'never', 0.017620975612964523)
(u'see', 0.017620975612964523)
(u'found', 0.017620975612964523)
(u'reticulatus', 0.017620975612964523)
(u'ninjas', 0.017620975612964523)
(u'sharks', 0.017620975612964523)
(u'snakes', 0.015597728969953914)
(u'which', 0.013989171478994734)
(u'hind', 0.013989171478994734)
(u'typhlopidae', 0.013989171478994734)
(u'record', 0.013989171478994734)
(u'appear', 0.013989171478994734)
(u'old', 0.013989171478994734)
(u'fossils', 0.013989171478994734)
(u'during', 0.013989171478994734)
(u'group', 0.013989171478994734)
(u'nightmare', 0.012773316765350002)
(u'englund', 0.012773316765350002)
(u'kid', 0.012773316765350002)
(u'dien', 0.012773316765350002)
(u'william', 0.012773316765350002)
(u'wheaton', 0.012773316765350002)
(u'bowe', 0.012773316765350002)
(u'coogan', 0.012773316765350002)
(u'jenny', 0.012773316765350002)
(u'dana', 0.012773316765350002)
(u'zabka', 0.012773316765350002)
(u'freddy', 0.012773316765350002)
(u'malibu', 0.012773316765350002)
(u'whalen', 0.012773316765350002)
(u'van', 0.012773316765350002)
(u'angeles', 0.012773316765350002)
(u'david', 0.012773316765350002)
(u'keith', 0.012773316765350002)
(u'friday', 0.012773316765350002)
(u'mccarthy', 0.012773316765350002)
(u'robert', 0.012773316765350002)
(u'richard', 0.012773316765350002)
(u'karate', 0.012773316765350002)
(u'sean', 0.012773316765350002)
(u'street', 0.012773316765350002)
(u'krueger', 0.012773316765350002)
(u'barron', 0.012773316765350002)
(u'clabaugh', 0.012773316765350002)
(u'elm', 0.012773316765350002)
(u'casper', 0.012773316765350002)
(u'based', 0.012285785591100001)
(u'(cenomanian', 0.012285785591100001)
(u'haasiophis', 0.012285785591100001)
(u'serpentes)', 0.012285785591100001)
(u'leptotyphlopidae', 0.012285785591100001)
(u'species', 0.012162514851571207)
(u'region', 0.010908222998501846)
(u'integral', 0.010908222998501846)
(u'starchy', 0.010908222998501846)
(u'may', 0.010908222998501846)
(u'plant', 0.010908222998501846)
(u'largest', 0.010908222998501846)
(u'nightshade', 0.010908222998501846)
(u'following', 0.010908222998501846)
(u'other', 0.010908222998501846)
(u'edible', 0.010908222998501846)
(u'word', 0.010908222998501846)
(u'perennial', 0.010908222998501846)
(u'part', 0.010908222998501846)
(u'since', 0.010908222998501846)
(u'tuberosum', 0.010908222998501846)
(u'much', 0.010908222998501846)
(u'"potato"', 0.010908222998501846)
(u'approximately', 0.010908222998501846)
(u'cultivated', 0.010908222998501846)
(u'ago', 0.010908222998501846)
(u'become', 0.010908222998501846)
(u'maize', 0.010908222998501846)
(u'wheat', 0.010908222998501846)
(u'outside', 0.010908222998501846)
(u'tuberous', 0.010908222998501846)
(u'centuries', 0.010908222998501846)
(u'rice', 0.010908222998501846)
(u'related', 0.010908222998501846)
(u'where', 0.010908222998501846)
(u'either', 0.010908222998501846)
(u'fourth', 0.010908222998501846)
(u'closely', 0.010908222998501846)
(u'were', 0.010908222998501846)
(u'indigenous', 0.010908222998501846)
(u'supply', 0.010908222998501846)
(u'discontinued', 0.010654543393885524)
(u'sometimes', 0.010654543393885524)
(u'segment', 0.010654543393885524)
(u'such', 0.010654543393885524)
(u'caliber', 0.010654543393885524)
(u'same', 0.010654543393885524)
(u'44', 0.010654543393885524)
(u'production', 0.010654543393885524)
(u'ian', 0.010654543393885524)
(u'market', 0.010654543393885524)
(u'firearm', 0.010654543393885524)
(u'357', 0.010654543393885524)
(u'formerly', 0.010654543393885524)
(u'&', 0.010654543393885524)
(u'described', 0.010654543393885524)
(u'manufactured', 0.010654543393885524)
(u'targeted', 0.010654543393885524)
(u'collectors', 0.010654543393885524)
(u'referred', 0.010654543393885524)
(u'1955', 0.010654543393885524)
(u'premium', 0.010654543393885524)
(u'writers', 0.010654543393885524)
(u'finest', 0.010654543393885524)
(u'snake', 0.010220315327368387)
(u'member', 0.009823569687807515)
(u'7', 0.009823569687807515)
(u'close', 0.009823569687807515)
(u'2000', 0.0072721486656678975)
(u'scenario', 0.0072721486656678975)
(u'engineered', 0.0072721486656678975)
(u'includes', 0.0072721486656678975)
(u'several', 0.0072721486656678975)
(u'genetically', 0.0072721486656678975)
(u'actors', 0.0072721486656678975)
(u'concerns', 0.0072721486656678975)
(u'(2004)', 0.0072721486656678975)
(u'fame', 0.0072721486656678975)
(u'13th', 0.0072721486656678975)
(u'escapes', 0.0072721486656678975)
(u'vs', 0.0072721486656678975)
(u'evident', 0.0072721486656678975)
(u'ii', 0.0072721486656678975)
(u'directed', 0.0072721486656678975)
(u'final', 0.0072721486656678975)
(u'two', 0.0072721486656678975)
(u'classic', 0.0072721486656678975)
(u'films)', 0.0072721486656678975)
(u'features', 0.0072721486656678975)
(u'followed', 0.0072721486656678975)
(u'boa', 0.0072721486656678975)
(u'favorite', 0.0072721486656678975)
(u'(best', 0.0072721486656678975)
(u'sequels:', 0.0072721486656678975)
(u'movie', 0.0072721486656678975)
(u'girl', 0.0072721486656678975)
(u'series', 0.0072721486656678975)
(u'like', 0.0072721486656678975)
(u'horror', 0.0072721486656678975)
(u'town', 0.0072721486656678975)
(u'(2002)', 0.0072721486656678975)
(u'role', 0.0072721486656678975)
(u'unleashes', 0.0072721486656678975)
(u'filmed', 0.0072721486656678975)
(u'including', 0.0072721486656678975)
(u'both', 0.0072721486656678975)
(u'typically', 0.006994585739497367)
(u'mating', 0.006994585739497367)
(u'retaining', 0.006994585739497367)
(u'age)', 0.006994585739497367)
(u'fossilization', 0.006994585739497367)
(u'poor', 0.006994585739497367)
(u'when', 0.006994585739497367)
(u'skeletons', 0.006994585739497367)
(u'relatively', 0.006994585739497367)
(u'comparative', 0.006994585739497367)
(u'often', 0.006994585739497367)
(u'112', 0.006994585739497367)
(u'pelvic', 0.006994585739497367)
(u'snakes\u2014have', 0.006994585739497367)
(u'anatomy', 0.006994585739497367)
(u'making', 0.006994585739497367)
(u'grasp', 0.006994585739497367)
(u'girdle', 0.006994585739497367)
(u'94', 0.006994585739497367)
(u'uncommon', 0.006994585739497367)
(u'crown', 0.006994585739497367)
(u'earliest', 0.006994585739497367)
(u'families', 0.006994585739497367)
(u'modern', 0.006994585739497367)
(u'digits', 0.006994585739497367)
(u'tiny', 0.006994585739497367)
(u'lizards', 0.006994585739497367)
(u'readily', 0.006994585739497367)
(u'simoliophiids', 0.006994585739497367)
(u'period', 0.006994585739497367)
(u'dated', 0.006994585739497367)
(u'descended', 0.006994585739497367)
(u'appearing', 0.006994585739497367)
(u'spurs', 0.006994585739497367)
(u'identifiable', 0.006994585739497367)
(u'groups', 0.006994585739497367)
(u'boas\u2014primitive', 0.006994585739497367)
(u'(members', 0.006994585739497367)
(u'(though', 0.006994585739497367)
(u'projections', 0.006994585739497367)
(u'vestigial', 0.006994585739497367)
(u'clawed', 0.006994585739497367)
(u'terrasanctus', 0.006994585739497367)
(u'used', 0.006994585739497367)
(u'marine', 0.006994585739497367)
(u'because', 0.006994585739497367)
(u'anal', 0.006994585739497367)
(u'consensus', 0.006994585739497367)
(u'fragile', 0.006994585739497367)
(u'visible', 0.006994585739497367)
(u'possess', 0.006994585739497367)
(u'between', 0.006994585739497367)
(u'late', 0.006994585739497367)
(u'true', 0.006994585739497367)
(u'million', 0.006994585739497367)
(u'pythons', 0.006994585739497367)
(u'horny', 0.006994585739497367)
(u'oldest', 0.006994585739497367)
(u'limbs)', 0.006994585739497367)
(u'years', 0.006994585739497367)
(u'limbs:', 0.006994585739497367)
(u'remnants', 0.006994585739497367)
(u'introduced', 0.0060812574257856035)
(u'itself', 0.0060812574257856035)
(u'refer', 0.0060812574257856035)
(u'some', 0.0060812574257856035)
(u'there', 0.0060812574257856035)
(u'from', 0.0060812574257856035)
(u'made', 0.005939832834488264)
(u'was', 0.005939832834488264)
(u'by', 0.005939832834488264)
(u'year', 0.005939832834488264)
(u'up', 0.00429122214065788)
(u'los', 0.004054171617190403)
(u'cult', 0.004054171617190403)
(u'that', 0.0038994322424884785)
(u'come', 0.0038994322424884785)
(u'among', 0.0038994322424884785)
(u'first', 0.0038994322424884785)
(u'small', 0.0038994322424884785)
(u'known', 0.003406771775789462)
(u'have', 0.0026564708489786874)
(u'ever', 0.002594692457141974)
(u'his', 0.0017709805659857918)
(u'also', 0.001703385887894731)
(u'as', 0.0)
(u'it', 0.0)
(u'are', 0.0)
(u'now', 0.0)
(u'of', -0.004340989447475109)
(u'is', -0.0065114841712126635)
(u'and', -0.008681978894950218)
(u'or', -0.008681978894950218)
(u'on', -0.012525908482027415)
(u'in', -0.013022968342425327)
(u'to', -0.013022968342425327)
(u'an', -0.015193463066162882)
(u'v', -0.019080162920297575)
(u'p', -0.021037102706994763)
(u'the', -0.028216431408588212)
(u'l', -0.03906890502727598)
(u'i', -0.05609894055198603)
(u'a', -0.060773852264651526)



Assim temos uma métrica de relevância para termos contidos em nosso corpus!



Podemos também rapidamente escrever uma função geradora de n-grams


In [95]:
input_list = ['all', 'this', 'happened', 'more', 'or', 'less']

def find_ngrams(input_list, n):
  return zip(*[input_list[i:] for i in range(n)])

print find_ngrams(input_list, 2)


[(u'all', u'this'), (u'this', u'happened'), (u'happened', u'more'), (u'more', u'or'), (u'or', u'less')]

In [ ]: