Just exploring WordNet
from nltk.corpus import wordnet as wn
import re
# Exploratory cell: poke at WordNet through NLTK, then build offsets_dict,
# an offset -> Synset lookup table that the ID-conversion cells reuse.

# First sense of "computer".
wnword = wn.synsets('computer')[0]
print(wnword)

# Break its definition into bare words: every non-word character becomes a
# space, then the string is split on whitespace.
wndef = wnword.definition()
wndefList = re.sub(r"[^\w]", " ", wndef).split()
print(wndefList)

# Definition of the first sense of the second word of that definition.
machineWord = wn.synsets(wndefList[1])[0].definition()
print(machineWord)

# Numeric offset of a synset looked up by name.
synOffset = wn.synset('dogshit.n.01').offset()
print(synOffset)

# Index synsets by offset.  Offsets are only unique within one part of
# speech, so keying every synset by offset alone would let a verb, adjective
# or adverb overwrite the noun that shares its number.  The lookups in this
# notebook are for noun IDs (the mapping file is wn16-30noun.txt), so only
# nouns go into the table.
syns = list(wn.all_synsets())
offsets_list = [(s.offset(), s) for s in syns if s.pos() == 'n']
offsets_dict = dict(offsets_list)
print(offsets_dict[6611376])
Now we read the computer science keywords from a file, then convert each old ID to its new ID and look up the corresponding word. This code is just for experimenting in the notebook.
# Exploratory cell: convert the old ID on one line of the keyword file to
# its new ID and look up the synset.  Needs offsets_dict (offset -> Synset)
# to have been built already.
csWordsIndex = 1 #line number to read (zero-based index into the line list)

# Read the keyword file; the context manager closes the handle once read.
with open('decode_wordnet/csWordnetWordsNouns.txt') as csWords:
    csWordsLineData = csWords.readlines()
print(len(csWordsLineData))

# The old ID is the 8 characters that start two positions after the first ':'.
idIndex = csWordsLineData[csWordsIndex].find(':')
if idIndex == -1:
    # str.find returns -1 on a miss; slicing from there would yield garbage.
    raise ValueError('no ":" found on line %d of the keyword file' % csWordsIndex)
idToConvert = csWordsLineData[csWordsIndex][idIndex+2:idIndex+10]
print('old ID number: ' + idToConvert)

# Read the whole old-ID/new-ID mapping table as one string.
with open('decode_wordnet/wn16-30noun.txt') as convertIdHash:
    convertIdHashData = convertIdHash.read()

# The new ID is taken as the 8 characters that follow the old ID and one
# separator character.
# NOTE(review): find() matches the digits anywhere in the text, so an old ID
# that also occurs as a new ID earlier in the file would hit the wrong
# column - confirm against the mapping file's layout.
newIdIndex = convertIdHashData.find(idToConvert)
if newIdIndex == -1:
    raise ValueError('old ID %r not found in the mapping file' % idToConvert)
newId = convertIdHashData[newIdIndex+9:newIdIndex+17]
print('new ID number: ' + newId)

newCsWord = offsets_dict[int(newId)]
print(newCsWord)
This code generates a file containing the old ID (WordNet 1.6), the new ID (WordNet 3.0), and the Synset for each keyword. It then writes them to a file readable by Excel.
from nltk.corpus import wordnet as wn
import re
#read both input files up front; the context managers close the handles
with open('decode_wordnet/csWordnetWordsNouns.txt') as csWords: #the input file
    csWordsLineData = csWords.readlines()
with open('decode_wordnet/wn16-30noun.txt') as convertIdHash: #the hash table between wordnet versions
    convertIdHashData = convertIdHash.read()

#hash index and lookup word and write into file
#Each output row is: old ID, new ID, synset, definition (tab separated).
#Opening the output in a with-block guarantees it is flushed and closed,
#so a later reader of the file sees every row.
skippedLines = 0
with open('decode_wordnet/csWordnetWordsNounsOutput.TXT','w') as csWordnetWordsNounsOutput: #the output file
    for csWordsLine in csWordsLineData: #every line, including the last one
        idIndex = csWordsLine.find(':')
        idToConvert = csWordsLine[idIndex+2:idIndex+10] #find id to convert
        if idIndex == -1 or not idToConvert.strip():
            skippedLines += 1 #no ':' or nothing after it, so no ID on this line
            continue
        newIdIndex = convertIdHashData.find(idToConvert)
        if newIdIndex == -1:
            skippedLines += 1 #old ID is not in the conversion table
            continue
        newId = convertIdHashData[newIdIndex+9:newIdIndex+17]
        try: #some new IDs are not numeric or are missing from offsets_dict
            newCsWord = offsets_dict[int(newId)]
        except (ValueError, KeyError):
            skippedLines += 1
            continue
        newCsWordDef = newCsWord.definition()
        #write to file:
        csWordnetWordsNounsOutput.write(str(idToConvert) + '\t' + str(newId) + '\t' + str(newCsWord) + '\t' + newCsWordDef + '\n')
print('lines skipped: ' + str(skippedLines))
Exploring requests module
import requests
import os

# Credentials are read from the environment so that no secret lives in the
# source: set AIONE_USER and AIONE_PASSWORD before running this cell (a
# missing variable raises KeyError).
# NOTE: the password that used to be hard-coded on this line should be
# treated as leaked and rotated.
aioneUser = os.environ['AIONE_USER']
aionePassword = os.environ['AIONE_PASSWORD']

# GET with HTTP Basic auth; the timeout stops the cell from hanging forever
# if the server never answers.
r = requests.get('http://aione.tritera.com', auth=(aioneUser, aionePassword), timeout=10)
print(r.status_code)
print(r.headers['content-type'])
Wikipedia API - getting data from Wikipedia using the "wikipedia" module is really easy!
import wikipedia
# Fetch a one-sentence summary of the article.  The title was previously
# misspelled ("Vitual"), which left the result up to the library's
# suggestion lookup instead of naming the intended page.
wikiReturnText = wikipedia.summary("Virtual reality", sentences=1)
print(wikiReturnText)
Exploring mechanize module
import re
import mechanize
# Open the ai-One GUI page with mechanize and show its title.
URL = "http://aione.tritera.com/gui/goto.main.php#"
br = mechanize.Browser()
request = mechanize.Request(URL)
# The browser has to load a document before title() can be asked for;
# without this call no page has been fetched and the request is never sent.
br.open(request)
print(br.title())
print(request)
This code grabs a definition from Wikipedia and enters it into ai-One using the Selenium module:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import wikipedia
from selenium.common.exceptions import NoSuchElementException

# Feed a one-sentence Wikipedia summary into the ai-One web GUI and collect
# the keywords it reports.
wikiReturnText = wikipedia.summary("computer", sentences=1) #input keyword
driver = webdriver.Firefox() #use Firefox
try:
    driver.get("http://aione.tritera.com/gui/goto.main.php#") #go to website
    elem = driver.find_element_by_link_text('Source') #find Source tab and store to element elem
    elem.click() #click Source tab
    elem2 = driver.find_element_by_id('sourceText') #find text element
    elem2.clear() #clear the contents of the textbox
    elem2.send_keys(wikiReturnText) #input definition
    driver.find_element_by_id("btn_find_all").click() #click
    #implicitly_wait does not pause here: it makes each later element lookup
    #retry for up to 10 sec, which gives the results time to appear
    driver.implicitly_wait(10)
    aiOneKeywordList = [] #initialize/clear keyword list
    for i in range(0,20): #get up to 20 keywords (expect less)
        try:
            value1 = driver.find_element_by_id(".word"+str(i)+". ").get_attribute("value")
        except NoSuchElementException: #no element with this index: no more keywords
            break #stop at the first gap instead of waiting out every remaining lookup
        aiOneKeywordList.append(str(value1))
finally:
    driver.quit() #always close the browser, even if a step above failed
print(aiOneKeywordList)
This code grabs definitions from WordNet and enters them into ai-One.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import re
from selenium.common.exceptions import NoSuchElementException

# Read the tab-separated synset table, submit each definition to the ai-One
# web GUI, and write one "synset <TAB> keyword list" row per definition.
with open('decode_wordnet/csWordnetWordsNounsOutput.TXT','r') as wordnetDefs:
    wordnetDefsLineData = wordnetDefs.readlines()

# Column 2 holds the synset and column 3 its definition.  The two lists are
# kept in step so a row can be written for each definition.
wordnetSynsetNames = []
wordnetDefsList = []
for wordnetDefsLine in wordnetDefsLineData: #every line, including the last one
    fields = wordnetDefsLine.split('\t')
    if len(fields) < 4:
        continue #blank or truncated line: there is no definition column to read
    wordnetDefString = fields[3]
    if wordnetDefString.startswith( '(computer science)' ):
        wordnetDefString = wordnetDefString[19:] #drop the label and the space after it
    wordnetSynsetNames.append(fields[2])
    wordnetDefsList.append(wordnetDefString)

driver = webdriver.Firefox() #use Firefox
try:
    driver.get("http://aione.tritera.com/gui/goto.main.php#") #go to website
    #implicitly_wait is a driver-wide setting, not a pause: every element
    #lookup retries for up to 10 sec, which gives results time to appear
    driver.implicitly_wait(10)
    with open('decode_wordnet/aiOneKeywords2.TXT','w') as aiOneKeywords:
        for k in range(len(wordnetDefsList)):
            elem = driver.find_element_by_link_text('Source') #find Source tab and store to element elem
            elem.click() #click Source tab
            elem2 = driver.find_element_by_id('sourceText') #find text element
            elem2.clear() #clear the contents of the textbox
            elem2.send_keys(wordnetDefsList[k]) #input definition
            driver.find_element_by_id("btn_find_all").click() #click
            aiOneKeywordList = [] #initialize/clear keyword list
            for i in range(0,20): #get up to 20 keywords (expect less)
                try:
                    value1 = driver.find_element_by_id(".word"+str(i)+". ").get_attribute("value")
                except NoSuchElementException: #no element with this index: no more keywords
                    break #stop at the first gap instead of waiting out every remaining lookup
                aiOneKeywordList.append(str(value1))
            aiOneKeywords.write(wordnetSynsetNames[k] + '\t' + str(aiOneKeywordList) + '\n')
finally:
    driver.quit() #always close the browser, even if a step above failed
from nltk.corpus import wordnet as wn
import re
# Collect every WordNet definition of the keyword, capped at 20 senses.
keyword = 'allocation'
keyword = keyword.replace(' ','_') #wordnet needs underscores not spaces
# Iterating over the synsets themselves (instead of indexing 0..19) avoids an
# IndexError for words that have fewer than 20 senses.
wnword = [synset.definition() for synset in wn.synsets(keyword)[:20]]
print(wnword)
Trying to pull everything together. Takes in a word and generates associated keywords and definitions.
#takes a keyword in and generates two files: a keyword and generated keywords; and a keyword definition file
import traceback
import sys
import re
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from nltk.corpus import wordnet as wn
from selenium.common.exceptions import NoSuchElementException

MAX_KEYWORDS = 200 #how many entries of masterKeywordList get expanded
MAX_RESULTS = 20 #cap on definitions per word and on keywords per definition


def _submitDefinition(driver, definition):
    """Type *definition* into the Source tab and start the keyword search."""
    elem = driver.find_element_by_link_text('Source') #find Source tab and store to element elem
    elem.click() #click Source tab
    time.sleep(1) #give some time after click
    elem2 = driver.find_element_by_id('sourceText') #find text element
    elem2.clear() #clear the contents of the textbox
    elem2.send_keys(definition) #input definition
    driver.find_element_by_id("btn_find_all").click() #click
    time.sleep(10) #wait 10 sec for page to load results


def _readKeywords(driver, limit=MAX_RESULTS):
    """Return the keyword strings currently shown, at most *limit* of them."""
    keywords = []
    for i in range(limit):
        try:
            value1 = driver.find_element_by_id('.word'+str(i)+'. ').get_attribute("value")
            keywords.append(str(value1))
        except NoSuchElementException: #no element with this index: no more keywords
            break
        except Exception: #anything else: report it instead of hiding it, then stop
            print(traceback.format_exc())
            break
    return keywords


# Starting from one seed word, look up each word's WordNet definitions, feed
# every definition to ai-One, and queue any keyword not seen before.
masterKeywordList = ['allocation'] #initial word, in this case 'allocation'
driver = webdriver.Firefox() #use Firefox
try:
    driver.get("http://aione.tritera.com/gui/goto.main.php#") #go to website
    time.sleep(10) #wait 10 sec for browser to load
    #masterKeywordFile: keyword definition file; keywordTreeFile: keyword generation file
    with open('decode_wordnet/masterKeywordFile.TXT','w') as masterKeywordFile, \
            open('decode_wordnet/keywordTreeFile.TXT','w') as keywordTreeFile:
        for masterKeywordListIndex in range(MAX_KEYWORDS): #do this for 200 words in masterKeywordList
            if masterKeywordListIndex >= len(masterKeywordList):
                break #ran out of queued keywords before reaching the cap
            keyword = masterKeywordList[masterKeywordListIndex]
            keyword = keyword.replace(' ','_') #wordnet needs underscores not spaces
            #slicing the synset list avoids indexing past its end for words with few senses
            wordnetDefs = [synset.definition() for synset in wn.synsets(keyword)[:MAX_RESULTS]]
            for definition in wordnetDefs:
                masterKeywordFile.write(keyword + '\t' + str(definition) + '\n')
            for definition in wordnetDefs: #do this for each wordnet definition of the keyword
                #strip the label only when it really is the prefix; slicing 19
                #characters off a definition that merely mentions it elsewhere
                #would cut away unrelated text
                if definition.startswith('(computer science)'):
                    definition = definition[19:] #label plus the space after it
                _submitDefinition(driver, definition)
                for generated in _readKeywords(driver):
                    if generated not in masterKeywordList: #store keyword if returned keyword not in master keyword list
                        masterKeywordList.append(generated)
                        #NOTE(review): the original indentation was lost; this
                        #write is assumed to belong to the new-keyword branch,
                        #so each keyword is recorded once - confirm
                        keywordTreeFile.write(keyword + '\t' + generated + '\n')
finally:
    driver.quit() #always close the browser, even if a step above failed
