In [1]:
import nltk

# Fetch the NLTK resources this notebook depends on:
# - 'wordnet' backs WordNetLemmatizer,
# - 'averaged_perceptron_tagger' backs nltk.pos_tag.
# download() is idempotent: it reports "already up-to-date" on re-runs.
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")


[nltk_data] Downloading package wordnet to /Users/makris/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/makris/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Out[1]:
True

In [2]:
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

In [3]:
# Shared lemmatizer instance used by the demo cell below.
wnl = WordNetLemmatizer()

In [4]:
# Lemmatizer demo
print wnl.lemmatize('left')
print wnl.lemmatize('left',pos='v')
print wnl.lemmatize('absolutely')


left
leave
absolutely

In [125]:
# refers to http://linusp.github.io/2016/01/21/lemmatization-survey.html
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to its WordNet POS constant.

    Only the first letter of the Treebank tag matters (J/V/N/R).
    Returns None for any tag outside those four families, including
    the empty string.
    """
    first_letter_to_wordnet = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # treebank_tag[:1] is '' for an empty tag, so .get() yields None,
    # matching the original if/elif fall-through.
    return first_letter_to_wordnet.get(treebank_tag[:1])

In [360]:
def lemmatize_sentence(sentence):
    """Tokenize, POS-tag and lemmatize ``sentence``.

    Returns a list with one 4-element list per token:
    [original word, Treebank POS tag, lemmatized word, lowercased lemma].
    Tags outside the J/V/N/R families are lemmatized as nouns
    (WordNet's default part of speech).
    """
    lemmatizer = WordNetLemmatizer()
    rows = []
    for token, treebank_tag in pos_tag(word_tokenize(sentence)):
        # Fall back to NOUN when the tag has no WordNet equivalent.
        wn_pos = get_wordnet_pos(treebank_tag) or wordnet.NOUN
        lemma = lemmatizer.lemmatize(token, pos=wn_pos)
        rows.append([token, treebank_tag, lemma, lemma.lower()])
    return rows

In [361]:
# connect to database
import pymysql
connection = pymysql.connect(host='140.116.112.164',
                             user='iim_project',
                             password='1qaz2wsx3eDC',
                             db='iim_project',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)

cursor = connection.cursor()
cursor.execute('use iim_project')


Out[361]:
0

In [362]:
# Base name of the AMI transcript to process; also used to build the table name.
fileRoot = 'ES2002a.A.words'

In [363]:
# Open the transcript for reading. The original used mode 'r+' (read/write),
# but the file is only ever read (f.read() below) and closed at the end,
# so read-only access is the safer mode.
f = open('../AMI Parsing/data/wordsCombined_txt/' + fileRoot + '.txt', 'r')

In [364]:
# Read the whole transcript into memory as one string.
text = f.read()

In [365]:
# refers to http://www.nltk.org/book/ch05.html
# Tokenize/tag/lemmatize the whole transcript; each element of `result`
# is [word, POS tag, lemmatized word, lowercased lemma].
result = lemmatize_sentence(text)

In [373]:
# Per-transcript table name, e.g. 'ES2002a.A.words_POS'.
fileTableName = fileRoot + '_POS'
print fileTableName


ES2002a.A.words_POS

In [374]:
sql_create = "create table `%s` (id int(11) NOT NULL AUTO_INCREMENT, word char(11), wordOfClass char(11), stemmedWord char(11), lowerCasedStemmedWord char(11), PRIMARY KEY(id)) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin AUTO_INCREMENT=1"
print sql_create


create table `%s` (id int(11) NOT NULL AUTO_INCREMENT, word char(11), wordOfClass char(11), stemmedWord char(11), lowerCasedStemmedWord char(11), PRIMARY KEY(id)) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin AUTO_INCREMENT=1

In [375]:
# Create the table. NOTE(review): pymysql escapes fileTableName as a string
# value, so the resulting identifier literally includes quote characters;
# this only "works" because the insert cell binds the name the same way.
# MySQL cannot parameterize identifiers — TODO confirm and switch both
# cells to a validated, string-formatted table name.
cursor.execute(sql_create, fileTableName)
connection.commit()

In [376]:
# Insert template: the first %s is the table name (bound as a value — see the
# NOTE(review) on the create cell about parameterizing identifiers), the
# remaining four are the per-token row values.
sql_insert = "insert into `%s` (word, wordOfClass, stemmedWord, lowerCasedStemmedWord) values(%s, %s, %s, %s)"
print sql_insert


insert into `%s` (word, wordOfClass, stemmedWord, lowerCasedStemmedWord) values(%s, %s, %s, %s)

In [377]:
# Persist one row per token. Each element of `result` is a 4-element list
# [word, POS tag, lemma, lowercased lemma]; the table name is bound as the
# first parameter to match the create cell. The "Data truncated" warning in
# the recorded output came from the narrow char(11) columns in the schema.
for word, pos_label, lemma, lemma_lower in result:
    cursor.execute(sql_insert, (fileTableName, word, pos_label, lemma, lemma_lower))
connection.commit()


/Users/makris/anaconda/lib/python2.7/site-packages/pymysql/cursors.py:297: Warning: Data truncated for column 'lowerCasedStemmedWord' at row 1
  self._do_get_result()

In [378]:
# Release the transcript file handle opened earlier.
f.close()

In [ ]: