In [379]:
import nltk
# download the resources needed for tokenization, POS tagging and lemmatization
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")
In [380]:
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
In [381]:
wnl = WordNetLemmatizer()
In [382]:
# Lemmatizer demo: the default POS is noun, so 'left' only becomes 'leave'
# when it is explicitly tagged as a verb.
print(wnl.lemmatize('left'))             # 'left'
print(wnl.lemmatize('left', pos='v'))    # 'leave'
print(wnl.lemmatize('absolutely'))       # 'absolutely'
In [383]:
# See http://linusp.github.io/2016/01/21/lemmatization-survey.html
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the corresponding WordNet POS constant."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
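A quick check of the mapping (an illustrative snippet, not a cell from the original run): verb tags map to the WordNet verb class, while an unhandled tag such as 'IN' returns None, which the caller later replaces with the noun default.

print(get_wordnet_pos('VBD') == wordnet.VERB)   # True
print(get_wordnet_pos('IN'))                    # None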
In [384]:
def lemmatize_sentence(sentence):
    """Tokenize, POS-tag and lemmatize a sentence.

    Returns one [word, treebankTag, stemmedWord, lowerCasedStemmedWord] row per token.
    """
    res = []
    lemmatizer = WordNetLemmatizer()
    for word, pos in pos_tag(word_tokenize(sentence)):
        aWord = []
        # original word
        aWord.append(word)
        # Penn Treebank POS tag
        aWord.append(pos)
        # lemma (fall back to noun when the tag has no WordNet equivalent)
        wordnet_pos = get_wordnet_pos(pos) or wordnet.NOUN
        stemmedWord = lemmatizer.lemmatize(word, pos=wordnet_pos)
        aWord.append(stemmedWord)
        # lower-cased lemma
        lowerCasedStemmedWord = stemmedWord.lower()
        aWord.append(lowerCasedStemmedWord)
        # save the row
        res.append(aWord)
    return res
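A small sanity check of the row format (an illustrative cell, not part of the original run; the exact tags depend on the installed tagger model):

for row in lemmatize_sentence("The cats were running"):
    print(row)
# expected rows look like ['cats', 'NNS', 'cat', 'cat'] and ['were', 'VBD', 'be', 'be']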
In [610]:
# connect to the database
import pymysql
connection = pymysql.connect(host='140.116.112.164',
                             user='iim_project',
                             password='1qaz2wsx3eDC',
                             db='words_pos',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
cursor = connection.cursor()
cursor.execute('use words_pos')
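The connection and cursor are reused for every transcript file; once all files have been processed they should be released. A hypothetical cleanup cell (not part of the original run):

cursor.close()
connection.close()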
In [652]:
# Change the file name here for each transcript file to be processed
fileRoot = 'ES2002a.D.words'
In [653]:
f = open('../AMI Parsing/data/wordsCombined_txt/' + fileRoot + '.txt', 'r+')
In [654]:
text = f.read()
In [655]:
# See http://www.nltk.org/book/ch05.html
result = lemmatize_sentence(text)
In [656]:
fileTableName = fileRoot + '_POS'
print(fileTableName)
In [657]:
# Table names cannot be sent as query parameters, so the table name is formatted
# into the statement here (it is generated locally, not user input).
# Note: wordPosition is defined in the schema but not populated by the insert below.
sql_create = ("create table `%s` (id int(11) NOT NULL AUTO_INCREMENT, word char(16), "
              "wordOfClass char(11), stemmedWord char(11), lowerCasedStemmedWord char(16), "
              "wordPosition char(16), owner char(16), PRIMARY KEY(id)) "
              "ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin AUTO_INCREMENT=1") % fileTableName
print(sql_create)
In [658]:
cursor.execute(sql_create)
connection.commit()
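If the same file has to be re-processed, the create statement fails because the table already exists. A guard like the following, run before the create cell, would make it re-runnable (a hypothetical addition, not part of the original run):

cursor.execute("DROP TABLE IF EXISTS `%s`" % fileTableName)
connection.commit()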
In [659]:
# Only the values are parameterized; the table name is formatted in as above.
sql_insert = ("insert into `%s` (word, wordOfClass, stemmedWord, lowerCasedStemmedWord, owner) "
              "values(%%s, %%s, %%s, %%s, %%s)") % fileTableName
print(sql_insert)
In [660]:
for aWord in result:
    cursor.execute(sql_insert, (aWord[0], aWord[1], aWord[2], aWord[3], fileRoot))
connection.commit()
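Since result can contain thousands of rows for a long meeting, issuing one INSERT per word is slow; pymysql's executemany sends the whole batch in a single call. An alternative sketch (not what the original run used):

cursor.executemany(sql_insert, [(w[0], w[1], w[2], w[3], fileRoot) for w in result])
connection.commit()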
In [661]:
f.close()