notebook.community

Edit and run



In [60]:

    
#coding=utf-8
# Topic Parser
# Description:
#    而每一份中會有數個會議檔案，用小寫編號 eg. ES2002a, ES2002b...
#    每個檔案中會有數個人講話，用大寫編號 eg. A, B, C...
# Usage:
#    輸入欲分析的檔案，一直按指令即可



In [61]:

    
import xml.etree.ElementTree as ET



In [130]:

    
filePrefix = 'ES2015b'



In [131]:

    
fileRoot = filePrefix + '.topic'



In [132]:

    
fileName= '../AMI_corpus_resources/topics/' + fileRoot + '.xml'



In [133]:

    
tree = ET.parse(fileName)









    



---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-133-d1dcd6c4118d> in <module>()
----> 1 tree = ET.parse(fileName)

/Users/makris/anaconda/lib/python2.7/xml/etree/ElementTree.pyc in parse(source, parser)
   1180 def parse(source, parser=None):
   1181     tree = ElementTree()
-> 1182     tree.parse(source, parser)
   1183     return tree
   1184 

/Users/makris/anaconda/lib/python2.7/xml/etree/ElementTree.pyc in parse(self, source, parser)
    645         close_source = False
    646         if not hasattr(source, "read"):
--> 647             source = open(source, "rb")
    648             close_source = True
    649         try:

IOError: [Errno 2] No such file or directory: '../AMI_corpus_resources/topics/ES2015b.topic.xml'



In [134]:

    
root = tree.getroot()



In [135]:

    
idName = '{http://nite.sourceforge.net/}id'



In [136]:

    
import re # regrssion expression



In [137]:

    
# find complete matched content
def findTotalWord(totalContent):
    pattern = 'id\((\w+\.\w\.\w+)\)'
    matched = re.findall(pattern, totalContent)
    return matched



In [121]:

    
# print word indexes
def getWordIndex(matched):
    wordList = []
    if len(matched) == 1:
        wordList.append(matched[0])
    else:
        # parse the word index
        start = matched[0]
        end = matched[1]
        pattern = '\d+'
        startIndexes = re.findall(pattern, start)
        endIndexes = re.findall(pattern, end)
        startWordIndex = startIndexes[1]
        endWordIndex = endIndexes[1]

        # find the prefix # eg. ES2002a.B.words
        prefixPattern = '\D+\d+\D\.\D+\.\D+'
        prefix = re.match(prefixPattern, start)
        prefixStr = prefix.group(0)
        
        # conbine the prefix and the word index
        for index in range(int(startWordIndex), int(endWordIndex) + 1):
             wordList.append(prefixStr + str(index))
    return wordList



In [122]:

    
# content
totalContents = []
for topic in root.iter('topic'):
    aTopicDic = {}
    
    topicDescription = topic.get('other_description')
    topicID = topic.get(idName)  # get: find attribute's content
    
    # check the description and ID exists
    if topicDescription:
        aTopicDic['topicDescription'] = topicDescription
    else:
        aTopicDic['topicDescription'] = "No Topic"
    if topicID:
        aTopicDic['topicID'] = topicID
    else:
        aTopicDic['topicID'] = "No TopicID"
    
    # join matched indexes of words
    matchWordsIndexes = []
    for child in topic.iter('{http://nite.sourceforge.net/}child'):
        href = child.get('href')
        totalWordStr = findTotalWord(href)
        
        tempWordIndexes = getWordIndex(totalWordStr)
        for wordIndex in range(len(tempWordIndexes)):
            matchWordsIndexes.append(tempWordIndexes[wordIndex])
        
    # store the match words indexes to aTopicDic
    aTopicDic['wordsIndexes'] = matchWordsIndexes
    
    # store retrieved dict to array
    totalContents.append(aTopicDic)



In [123]:

    
print totalContents[0]









    



{'topicDescription': 'No Topic', 'wordsIndexes': ['ES2015a.A.words0', 'ES2015a.A.words1', 'ES2015a.A.words2', 'ES2015a.A.words3', 'ES2015a.A.words4', 'ES2015a.A.words5', 'ES2015a.A.words6', 'ES2015a.A.words7', 'ES2015a.A.words8', 'ES2015a.A.words9', 'ES2015a.A.words10', 'ES2015a.A.words11', 'ES2015a.A.words12', 'ES2015a.A.words13', 'ES2015a.A.words14', 'ES2015a.A.words15', 'ES2015a.A.words16', 'ES2015a.A.words17', 'ES2015a.A.words18', 'ES2015a.A.words19', 'ES2015a.A.words20', 'ES2015a.A.words21', 'ES2015a.A.words22', 'ES2015a.A.words23', 'ES2015a.A.words24', 'ES2015a.A.words25', 'ES2015a.A.words26', 'ES2015a.A.words27', 'ES2015a.D.words0', 'ES2015a.D.words1', 'ES2015a.D.words2', 'ES2015a.D.words3', 'ES2015a.D.words4', 'ES2015a.D.words5', 'ES2015a.D.words6', 'ES2015a.D.words7', 'ES2015a.D.words8', 'ES2015a.D.words9', 'ES2015a.D.words10', 'ES2015a.D.words11', 'ES2015a.D.words12', 'ES2015a.D.words13', 'ES2015a.D.words14', 'ES2015a.D.words15', 'ES2015a.A.words28', 'ES2015a.A.words29', 'ES2015a.A.words30', 'ES2015a.A.words31', 'ES2015a.A.words32', 'ES2015a.A.words33', 'ES2015a.A.words34', 'ES2015a.A.words35', 'ES2015a.A.words36', 'ES2015a.A.words37', 'ES2015a.A.words38', 'ES2015a.A.words39', 'ES2015a.C.words0', 'ES2015a.B.words0', 'ES2015a.B.words1', 'ES2015a.B.words2', 'ES2015a.D.words16', 'ES2015a.A.words40', 'ES2015a.A.words41', 'ES2015a.A.words42', 'ES2015a.A.words43', 'ES2015a.D.words17', 'ES2015a.A.words44', 'ES2015a.A.words45', 'ES2015a.A.words46', 'ES2015a.A.words47', 'ES2015a.A.words48', 'ES2015a.A.words49', 'ES2015a.A.words50', 'ES2015a.A.words51', 'ES2015a.A.words52', 'ES2015a.A.words53', 'ES2015a.A.words54', 'ES2015a.A.words55', 'ES2015a.A.words56', 'ES2015a.A.words57', 'ES2015a.A.words58', 'ES2015a.A.words59', 'ES2015a.A.words60', 'ES2015a.A.words61', 'ES2015a.A.words62', 'ES2015a.A.words63', 'ES2015a.A.words64', 'ES2015a.A.words65', 'ES2015a.A.words66', 'ES2015a.A.words67', 'ES2015a.A.words68', 'ES2015a.A.words69', 'ES2015a.A.words70', 'ES2015a.A.words71', 'ES2015a.A.words72', 'ES2015a.A.words73', 'ES2015a.A.words74', 'ES2015a.A.words75', 'ES2015a.A.words76', 'ES2015a.A.words77', 'ES2015a.A.words78', 'ES2015a.A.words79', 'ES2015a.A.words80', 'ES2015a.A.words81', 'ES2015a.B.words3', 'ES2015a.B.words4', 'ES2015a.B.words5', 'ES2015a.C.words1', 'ES2015a.B.words6', 'ES2015a.B.words7', 'ES2015a.B.words8'], 'topicID': 'ES2015a.topic.elana.3'}



In [124]:

    
aTopic = totalContents[0]
aTopic['topicDescription']









    Out[124]:





'No Topic'



In [125]:

    
# connect to database
import pymysql
connection = pymysql.connect(host='140.116.112.164',
                             user='iim_project',
                             password='1qaz2wsx3eDC',
                             db='topic_to_what_words',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)

cursor = connection.cursor()



In [126]:

    
# create table
sql_create = "create table `%s` (id int(11) NOT NULL AUTO_INCREMENT, topic varchar(255), relatedWordPos varchar(255), topicID varchar(255), PRIMARY KEY(id)) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin AUTO_INCREMENT=1" 
print sql_create









    



create table `%s` (id int(11) NOT NULL AUTO_INCREMENT, topic varchar(255), relatedWordPos varchar(255), topicID varchar(255), PRIMARY KEY(id)) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin AUTO_INCREMENT=1



In [127]:

    
fileTableName = fileRoot + '_to_words'
print fileTableName









    



ES2015a.topic_to_words



In [128]:

    
cursor.execute(sql_create, fileTableName)
connection.commit()



In [129]:

    
sql_insert = "insert into `%s` (topic, relatedWordPos, topicID) values(%s, %s, %s)"

for index in range(len(totalContents)):
    aTopicDic = totalContents[index]
    
    topicDescription = aTopicDic['topicDescription']
    topicID = aTopicDic['topicID']
    wordsInTopic = aTopicDic['wordsIndexes']
    for wordPos in wordsInTopic:
        cursor.execute(sql_insert,(fileTableName, topicDescription, wordPos, topicID))
        
connection.commit()



In [ ]:



In [ ]: