In [84]:
def freq(word, doc):
    """Count how many times ``word`` occurs as a whole token of ``doc``.

    The document is split on whitespace with ``str.split()`` and only tokens
    exactly equal to ``word`` are counted. Matching is case-sensitive, and
    punctuation attached to a token is part of that token.

    Counting tokens instead of raw substrings keeps the count on the same
    basis as a whitespace-token total, and stops short words from matching
    inside longer ones (e.g. "the" inside "there").
    """
    return doc.split().count(word)
def word_count(doc):
    """Return the number of whitespace-separated tokens in ``doc``."""
    tokens = doc.split()
    return len(tokens)
def tf(word, doc):
    """Return the term frequency of ``word`` in ``doc`` as a float.

    The value is ``freq(word, doc)`` divided by ``word_count(doc)``.

    A document with no tokens yields 0.0 rather than raising
    ZeroDivisionError.
    """
    total = word_count(doc)
    if total == 0:
        # Nothing to divide by: an empty document contains no terms at all.
        return 0.0
    return freq(word, doc) / float(total)
def num_docs_containing(word, list_of_docs):
    """Return one more than the number of documents in which ``word`` occurs.

    A document is counted when ``freq(word, document)`` is positive. The
    result is offset by 1, so it is never zero: 1 is returned when no
    document matches.
    """
    matches = sum(1 for document in list_of_docs if freq(word, document) > 0)
    return 1 + matches
def idf(word, list_of_docs):
    """Return the inverse document frequency of ``word``.

    Computed as ``log(N / d)`` (natural logarithm), where ``N`` is
    ``len(list_of_docs)`` and ``d`` is
    ``num_docs_containing(word, list_of_docs)``.

    Raises:
        ValueError: raised by ``math.log`` when ``list_of_docs`` is empty,
            because the ratio is then 0.
    """
    # Imported locally so the function does not depend on an import made
    # elsewhere; none is visible in this part of the file.
    import math

    num = len(list_of_docs)
    den = float(num_docs_containing(word, list_of_docs))
    return math.log(num / den)
In [91]:
# Sample corpus: five short English texts used to exercise the helpers above.

# d0: text about a made-for-TV horror movie titled "Python".
d0 = """Python is a 2000 made-for-TV horror movie directed by Richard
Clabaugh. The film features several cult favorite actors, including William
Zabka of The Karate Kid fame, Wil Wheaton, Casper Van Dien, Jenny McCarthy,
Keith Coogan, Robert Englund (best known for his role as Freddy Krueger in the
A Nightmare on Elm Street series of films), Dana Barron, David Bowe, and Sean
Whalen. The film concerns a genetically engineered snake, a python, that
escapes and unleashes itself on a small town. It includes the classic final
girl scenario evident in films like Friday the 13th. It was filmed in Los Angeles,
California and Malibu, California. Python was followed by two sequels: Python
II (2002) and Boa vs. Python (2004), both also made-for-TV films."""
# d1: text about the snake genus Python.
d1 = """Python is a genus of
nonvenomous pythons found in Africa and Asia. Currently, 7 species are
recognised. A member of this genus, P. reticulatus, is among the longest
snakes known. This snake is also very scary and I never want to see one up close.
Which is scariest, pythons, sharks or ninjas?"""
# d2: text about the Colt Python revolver.
d2 = """The Colt Python is a .357 Magnum caliber revolver formerly
manufactured by Colt's Manufacturing Company of Hartford, Connecticut.
It is sometimes referred to as a "Combat Magnum". It was first introduced
in 1955, the same year as Smith & Wesson's M29 .44 Magnum. The now discontinued
Colt Python targeted the premium revolver market segment. Some firearm
collectors and writers such as Jeff Cooper, Ian V. Hogg, Chuck Hawks, Leroy
Thompson, Renee Smeets and Martin Dougherty have described the Python as the
finest production revolver ever made."""
# Disabled alternative for d3: a very short document, kept for experimentation.
#d3 = """I am a truly small text"""
# d3: text about the fossil record of snakes.
d3 = """The fossil record of snakes is relatively poor because snake skeletons
are typically small and fragile making fossilization uncommon. Fossils readily
identifiable as snakes (though often retaining hind limbs) first appear in the
fossil record during the Cretaceous period. The earliest known true snake
fossils (members of the crown group Serpentes) come from the marine simoliophiids,
the oldest of which is the Late Cretaceous (Cenomanian age) Haasiophis terrasanctus,
dated to between 112 and 94 million years old. Based on comparative anatomy, there
is consensus that snakes descended from lizards. Pythons and boas—primitive
groups among modern snakes—have vestigial hind limbs: tiny, clawed digits known as anal
spurs, which are used to grasp during mating. The families Leptotyphlopidae
and Typhlopidae also possess remnants of the pelvic girdle, appearing as horny projections
when visible."""
# d4: text about the potato; it does not mention pythons or snakes.
d4 = """The potato is a starchy, tuberous crop from the perennial nightshade
Solanum tuberosum L. The word "potato" may refer either to the plant itself
or to the edible tuber. In the Andes, where the species is indigenous, there
are some other closely related cultivated potato species. Potatoes were introduced
outside the Andes region approximately four centuries ago, and have since
become an integral part of much of the world's food supply. It is the world's
fourth-largest food crop, following maize, wheat, and rice."""
# Documents in index order; all_freq and all_tf read this module-level list.
corpus = [d0,d1,d2,d3,d4]
def all_freq(word):
for i,doc in enumerate(corpus):
print 'document',i, 'has the word "'+word+'":\t', freq(word,doc),'times out of',word_count(doc)
def all_tf(word):
for i,doc in enumerate(corpus):
print 'document',i, 'has TF for word "'+word+'":\t', tf(word,doc)
In [92]:
word = 'Python'
#all_freq(word)
print ''
all_tf(word)
print ''
print word,'idf 3 docs:\t',idf(word,corpus[:3]),'\tidf all docs:',idf(word,corpus)
print '\n'
word = 'the'
#all_freq(word)
print ''
all_tf(word)
print ''
print word,'idf 3 docs:\t',idf(word,corpus[:3]),'\tidf all docs:',idf(word,corpus)
print '\n'
word = 'truly'
#all_freq(word)
print ''
all_tf(word)
print ''
print word,'idf 3 docs:\t',idf(word,corpus[:3]),'\tidf all docs:',idf(word,corpus)
print '\n'
In [93]:
def treat(doc):
    """Return a normalised copy of ``doc`` ready for whitespace tokenising.

    The text is lower-cased, hyphens become spaces, and full stops, commas
    and question marks are removed. Other punctuation is left as is.
    """
    # Applied in this order: '.', '-', ',', '?'.
    substitutions = (('.', ''), ('-', ' '), (',', ''), ('?', ''))
    cleaned = doc.lower()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned
In [94]:
from operator import itemgetter

# Score the tokens of every document with TF-IDF and print them from the
# highest score to the lowest.
#
# IDF is computed against the *treated* documents: the tokens come from
# treated text (lower-cased, punctuation stripped), so looking them up in the
# raw corpus would miss every occurrence that differs only by case or by the
# punctuation treat() removes.
treated_corpus = [treat(raw_doc) for raw_doc in corpus]
idf_by_word = {}  # IDF depends only on the word, so compute it once per word
res = {}
for doc in treated_corpus:
    for word in doc.split():
        if word not in idf_by_word:
            idf_by_word[word] = idf(word, treated_corpus)
        word_idf = idf_by_word[word]
        word_tf = tf(word, doc)
        # NOTE: res is keyed by word only, so when a word occurs in several
        # documents the score from the last such document is the one kept.
        res[word] = word_idf * word_tf
final = sorted(res.items(), key=itemgetter(1))
for xx in reversed(final):
    print(xx)
Assim temos uma métrica de relevância para termos contidos em nosso corpus!
Podemos também rapidamente escrever uma função geradora de n-grams
In [95]:
# Sample token list used to demonstrate find_ngrams.
input_list = ['all', 'this', 'happened', 'more', 'or', 'less']
def find_ngrams(input_list, n):
    """Return the consecutive ``n``-item windows (n-grams) of ``input_list``.

    The sequence is sliced at offsets 0 .. n-1 and the slices are zipped
    together, so each resulting tuple holds ``n`` adjacent items. Windows that
    would run past the end are dropped because ``zip`` stops at the shortest
    slice. The return value is whatever ``zip`` produces (a list of tuples on
    Python 2).
    """
    shifted = [input_list[offset:] for offset in range(n)]
    return zip(*shifted)
# Show the bigrams (n = 2) of the sample token list.
print find_ngrams(input_list, 2)
In [ ]: