Just exploring WordNet
In [1]:
from nltk.corpus import wordnet as wn
import re
# Take the first synset WordNet returns for "computer"; the [0] index raises
# IndexError if the word has no synsets at all.
wnword = wn.synsets('computer')[0]
# Call form of print runs on both Python 2 and Python 3.
print(wnword)
In [3]:
# Fetch the definition text of the synset chosen in the previous cell
# (depends on `wnword` already being defined) and show it.
wndef = wnword.definition()
print(wndef)
In [4]:
# Split the definition into words: every maximal run of word characters
# (letters, digits, underscore) becomes one list entry and everything else is
# dropped. Same result as replacing non-word characters with spaces and
# splitting, but with a raw-string pattern so "\w" is not an invalid escape.
wndefList = re.findall(r"\w+", wndef)
print(wndefList)
In [5]:
# Look up the second word (index 1) of the definition above in WordNet and
# take the definition of its first synset. Raises IndexError if the
# definition has fewer than two words or the word has no synsets.
machineWord = wn.synsets(wndefList[1])[0].definition()
print(machineWord)
In [6]:
# Numeric offset (database ID) of one specific synset.
synOffset = wn.synset('dogshit.n.01').offset()
# str(synOffset).zfill(8) would give the zero-padded 8-character form.
print(synOffset)
In [7]:
# Build a lookup table from WordNet offset to Synset, restricted to nouns.
# Offsets are only unique within one part of speech (e.g. offset 1740 exists
# as a noun, a verb, an adjective and an adverb), so a table keyed by the bare
# offset over *all* synsets lets later parts of speech silently overwrite the
# noun entries. Every lookup in this notebook is for a noun ID, so only noun
# synsets go into the table.
syns = list(wn.all_synsets())
offsets_list = [(s.offset(), s) for s in syns if s.pos() == wn.NOUN]
offsets_dict = dict(offsets_list)
In [8]:
# Sanity check: look a synset up by its numeric offset (KeyError if absent).
print(offsets_dict[6611376])
Next, we read the computer science keywords from a file, then convert each old (WordNet 1.6) ID to its new (WordNet 3.0) ID and look up the corresponding synset. This cell is just for experimenting in the notebook.
In [9]:
# Exploratory, single-line version of the old-ID -> new-ID -> synset lookup.
# Depends on `offsets_dict` built in an earlier cell.
csWordsIndex = 1 #zero-based index of the line to read

# `with` closes each file as soon as its contents have been read.
with open('decode_wordnet/csWordnetWordsNouns.txt') as csWords:
    csWordsLineData = csWords.readlines()
print(len(csWordsLineData))

# The old ID is the 8 characters that start two positions after the first ':'.
idIndex = csWordsLineData[csWordsIndex].find(':')
if idIndex == -1:
    raise ValueError('no ":" found on line ' + str(csWordsIndex))
idToConvert = csWordsLineData[csWordsIndex][idIndex+2:idIndex+10]
print('old ID number: ' + idToConvert)

with open('decode_wordnet/wn16-30noun.txt') as convertIdHash:
    convertIdHashData = convertIdHash.read()

# Find the old ID in the mapping table. Only accept a match at the start of a
# line: the same digits can also occur as a *new* ID elsewhere in the table,
# and reading 9 characters past such a match would return garbage.
# NOTE(review): assumes every mapping line starts with the old ID followed by
# one separator character and the 8-character new ID - confirm against the file.
newIdIndex = convertIdHashData.find(idToConvert)
while newIdIndex > 0 and convertIdHashData[newIdIndex - 1] != '\n':
    newIdIndex = convertIdHashData.find(idToConvert, newIdIndex + 1)
if newIdIndex == -1:
    raise KeyError('old ID ' + idToConvert + ' not found in mapping table')
newId = convertIdHashData[newIdIndex+9:newIdIndex+17]
print('new ID number: ' + newId)

newCsWord = offsets_dict[int(newId)]
print(newCsWord)
This code generates a file containing the old ID (WordNet 1.6), the new ID (WordNet 3.0), the synset, and its definition for each keyword. The output is tab-separated so that it can be opened in Excel.
In [10]:
from nltk.corpus import wordnet as wn
import re
#Convert every keyword ID in the input file from WordNet 1.6 to WordNet 3.0 and
#write one tab-separated record per keyword: old ID, new ID, synset, definition.
#Depends on `offsets_dict` built in an earlier cell.

#read both input files; `with` closes them even if reading fails
with open('decode_wordnet/csWordnetWordsNouns.txt') as csWords: #the input file
    csWordsLineData = csWords.readlines()
with open('decode_wordnet/wn16-30noun.txt') as convertIdHash: #the hash table between wordnet versions
    convertIdHashData = convertIdHash.read()

#index the mapping table once (old ID -> new ID) instead of searching the whole
#text for every keyword; keying on the start of each line also stops an old ID
#from matching the same digits in the new-ID column of another line.
#NOTE(review): assumes each mapping line is "<8-char old ID><sep><8-char new ID>..."
#- confirm against the file. setdefault keeps the first entry for a repeated ID.
oldToNewId = {}
for mappingLine in convertIdHashData.splitlines():
    oldToNewId.setdefault(mappingLine[:8], mappingLine[9:17])

skippedIds = [] #IDs that could not be converted, reported at the end
with open('decode_wordnet/csWordnetWordsNounsOutput.TXT','w') as csWordnetWordsNounsOutput: #the output file
    for csWordsLine in csWordsLineData: #every line, including the last one
        idIndex = csWordsLine.find(':')
        if idIndex == -1: #no ID on this line
            continue
        idToConvert = csWordsLine[idIndex+2:idIndex+10] #find id to convert
        try:
            newId = oldToNewId[idToConvert]
            newCsWord = offsets_dict[int(newId)]
            newCsWordDef = newCsWord.definition()
            #write to file:
            csWordnetWordsNounsOutput.write(str(idToConvert) + '\t' + str(newId) + '\t' + str(newCsWord) + '\t' + newCsWordDef + '\n')
        except (KeyError, ValueError):
            #KeyError: ID missing from the mapping table or from offsets_dict;
            #ValueError: new ID is not a number (also covers encoding errors on write).
            #Skip the keyword, as before, but remember it instead of hiding it.
            skippedIds.append(idToConvert)
print('skipped ' + str(len(skippedIds)) + ' IDs that could not be converted')
Exploring requests module
In [11]:
import os

import requests

# SECURITY: the account name and password used to be hard-coded here. They are
# now read from the environment (KeyError if unset); the previously committed
# password should be treated as leaked and rotated.
aiOneUser = os.environ['AIONE_USER']
aiOnePassword = os.environ['AIONE_PASSWORD']

# NOTE(review): this is plain http, so the basic-auth credentials travel
# unencrypted - switch to https if the server supports it.
# The timeout stops the cell from hanging forever if the server is unreachable.
r = requests.get('http://aione.tritera.com', auth=(aiOneUser, aiOnePassword), timeout=30)
print(r.status_code)
print(r.headers['content-type'])
# r.text holds the response body
Wikipedia API - getting data from wikipedia using the module "wikipedia" is really easy!
In [14]:
import wikipedia
# Fetch the first sentence of the Wikipedia summary for the page title
# (title spelling fixed: "Virtual", not "Vitual").
wikiReturnText = wikipedia.summary("Virtual reality", sentences=1)
print(wikiReturnText)
Exploring mechanize module
In [80]:
import re
import mechanize
# Open the ai-One page with a mechanize browser and show its title.
URL = "http://aione.tritera.com/gui/goto.main.php#"
br = mechanize.Browser()
br.open(URL)
# This Request object is only constructed and printed; it is never sent.
request = mechanize.Request(URL)
print(br.title())
print(request)
This code grabs a definition from Wikipedia and inputs it into ai-One using the Selenium module:
In [17]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import wikipedia
# Send a one-sentence Wikipedia summary to the ai-One web GUI and collect the
# keywords it returns.
wikiReturnText = wikipedia.summary("computer", sentences=1) #input keyword
aiOneKeywordList = [] #initialize/clear keyword list
driver = webdriver.Firefox() #use Firefox
try:
    driver.get("http://aione.tritera.com/gui/goto.main.php#") #go to website
    elem = driver.find_element_by_link_text('Source') #find Source tab and store to element elem
    elem.click() #click Source tab
    elem2 = driver.find_element_by_id('sourceText') #find text element
    elem2.clear() #clear the contents of the textbox
    elem2.send_keys(wikiReturnText) #input definition
    driver.find_element_by_id("btn_find_all").click() #click
    # implicitly_wait does not pause here: it makes every later element
    # lookup retry for up to 10 seconds before giving up. That is what gives
    # the results time to appear, and also why the first missing keyword
    # below takes about 10 seconds to fail.
    driver.implicitly_wait(10)
    for i in range(0,20): #get up to 20 keywords (expect less)
        try:
            value1 = driver.find_element_by_id(".word"+str(i)+". ").get_attribute("value")
            aiOneKeywordList.append(str(value1))
        except Exception:
            # No element for this index (or its value could not be converted):
            # treat it as the end of the keyword list.
            break
finally:
    # Close the browser window even if one of the steps above failed.
    driver.close()
print(aiOneKeywordList)
This code grabs definitions from WordNet and puts them into ai-One:
In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import re
# Feed every WordNet definition from the converted-ID file into the ai-One web
# GUI and record, per synset, the keywords ai-One returns.
# Input records are tab-separated: old ID, new ID, synset, definition.
with open('decode_wordnet/csWordnetWordsNounsOutput.TXT','r') as wordnetDefs:
    wordnetDefsLineData = wordnetDefs.readlines()

csPrefix = '(computer science)'
wordnetSynsetList = [] #synset column of each usable record
wordnetDefsList = [] #matching definition, domain tag removed
for wordnetDefLine in wordnetDefsLineData: #every record, including the last one
    wordnetFields = wordnetDefLine.split('\t')
    if len(wordnetFields) < 4: #blank or malformed record, nothing to submit
        continue
    wordnetDefString = wordnetFields[3]
    if wordnetDefString.startswith(csPrefix):
        wordnetDefString = wordnetDefString[len(csPrefix) + 1:] #drop the tag and the space after it
    wordnetSynsetList.append(wordnetFields[2])
    wordnetDefsList.append(wordnetDefString)

driver = webdriver.Firefox() #use Firefox
try:
    # `with` guarantees the collected keywords are flushed and the file closed.
    with open('decode_wordnet/aiOneKeywords2.TXT','w') as aiOneKeywords:
        driver.get("http://aione.tritera.com/gui/goto.main.php#") #go to website
        for k in range(0,len(wordnetDefsList)):
            elem = driver.find_element_by_link_text('Source') #find Source tab and store to element elem
            elem.click() #click Source tab
            elem2 = driver.find_element_by_id('sourceText') #find text element
            elem2.clear() #clear the contents of the textbox
            elem2.send_keys(wordnetDefsList[k]) #input definition
            driver.find_element_by_id("btn_find_all").click() #click
            # Not a pause: makes later element lookups retry for up to 10 s,
            # which gives the results time to appear.
            driver.implicitly_wait(10)
            aiOneKeywordList = [] #initialize/clear keyword list
            for i in range(0,20): #get up to 20 keywords (expect less)
                try:
                    value1 = driver.find_element_by_id(".word"+str(i)+". ").get_attribute("value")
                    aiOneKeywordList.append(str(value1))
                except Exception:
                    # No element for this index: end of the keyword list.
                    break
            aiOneKeywords.write(wordnetSynsetList[k] + '\t' + str(aiOneKeywordList) + '\n')
finally:
    # Close the browser window even if a step above failed.
    driver.close()
In [28]:
from nltk.corpus import wordnet as wn
import re
# Collect the definitions of (up to the first 20) synsets of one keyword.
keyword = 'allocation'
keyword = keyword.replace(' ','_') #wordnet needs underscores not spaces
# One WordNet lookup, sliced to 20 entries, replaces 20 separate lookups that
# relied on a swallowed IndexError to detect the end of the list.
wnword = [synset.definition() for synset in wn.synsets(keyword)[:20]]
print(wnword)
Trying to pull everything together. Takes in a word and generates associated keywords and definitions.
In [6]:
#takes a keyword in and generates two files: a keyword and generated keywords; and a keyword definition file
import traceback
import sys
import re
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from nltk.corpus import wordnet as wn
# Crawl outward from a seed word: for each word in masterKeywordList, write
# its WordNet definitions to masterKeywordFile, submit each definition to the
# ai-One web GUI, and append every keyword ai-One returns that has not been
# seen before to masterKeywordList (recording keyword -> new keyword pairs in
# keywordTreeFile). The list grows while it is being processed.
masterKeywordList = ['allocation'] #initial word, in this case 'allocation'
seenKeywords = set(masterKeywordList) #same content as the list, for fast membership tests
csPrefix = '(computer science)'

# `with` closes both output files even if the crawl fails part-way.
with open('decode_wordnet/masterKeywordFile.TXT','w') as masterKeywordFile, \
        open('decode_wordnet/keywordTreeFile.TXT','w') as keywordTreeFile:
    driver = webdriver.Firefox() #use Firefox
    try:
        driver.get("http://aione.tritera.com/gui/goto.main.php#") #go to website
        time.sleep(10) #wait 10 sec for browser to load
        for masterKeywordListIndex in range(0,200): #do this for 200 words in masterKeywordList
            if masterKeywordListIndex >= len(masterKeywordList):
                break #ran out of words before reaching the limit
            keyword = masterKeywordList[masterKeywordListIndex]
            keyword = keyword.replace(' ','_') #wordnet needs underscores not spaces

            #store every definition of the word (at most the first 20 synsets)
            wordnetDefs = []
            for synset in wn.synsets(keyword)[:20]: #wordnet lookup
                wordnetDef = synset.definition()
                wordnetDefs.append(wordnetDef)
                masterKeywordFile.write(keyword + '\t' + str(wordnetDef) + '\n')

            for wordnetDef in wordnetDefs: #do this for each wordnet definition of the keyword
                #only strip the domain tag when it really is the prefix; a
                #definition that merely mentions computer science is left intact
                if wordnetDef.startswith(csPrefix):
                    wordnetDef = wordnetDef[len(csPrefix) + 1:] #drop the tag and the space after it
                elem = driver.find_element_by_link_text('Source') #find Source tab and store to element elem
                elem.click() #click Source tab
                time.sleep(1) #give some time after click
                elem2 = driver.find_element_by_id('sourceText') #find text element
                elem2.clear() #clear the contents of the textbox
                elem2.send_keys(wordnetDef) #input definition
                driver.find_element_by_id("btn_find_all").click() #click
                time.sleep(10) #wait 10 sec for page to load results
                for i in range(0,20): #get up to 20 keywords (expect less)
                    try:
                        value1 = driver.find_element_by_id('.word'+str(i)+'. ').get_attribute("value")
                        time.sleep(5)
                        returnedKeyword = str(value1)
                        if returnedKeyword not in seenKeywords: #store keyword if returned keyword not in master keyword list
                            seenKeywords.add(returnedKeyword)
                            masterKeywordList.append(returnedKeyword)
                            keywordTreeFile.write(keyword + '\t' + returnedKeyword + '\n')
                    except Exception:
                        #no element for this index: end of the returned keywords
                        break
    finally:
        #close the browser window even if a step above failed
        driver.close()
In [21]:
In [ ]: