In [5]:
'''
Created on Oct 27, 2016

@author: svanhmic
'''

import re
import os 
import fileinput
import gzip
import csv
import sys
from xml.dom import expatbuilder


CSVFILES = "/home/svanhmic/workspace/Python/Erhvervs/data/regnskabsdata/testcsv"
NEWPATH = "/home/svanhmic/workspace/Python/Erhvervs/data/regnskabsdata/testXML"
       
def getSecondContexts(path):
    """ 
    First atempt to make a parser for unit and context references. 
    
    Input
        path: The path to the diretory where the files are stored. 
    
    Output
        dokArr: A dictionary with contextsrefs as index and values as the translation
    """
    files = os.listdir(path)
    contextRe = re.compile(r"<\w*:*context id.*>.*</.*context>",flags=re.MULTILINE)
    contextStart = re.compile(r"<(\w+:|\w*)context id.*>",flags=re.MULTILINE)
    contextEnd = re.compile(r"</[\w\W]*context>")

    dokArr = {}
    for file in files:
        doesPrint = False
        dokString = "<xbrl>"
        with open(path+"/"+file) as f:
            for line in f:
                start = contextStart.search(line)
                end = contextEnd.search(line)
                conRe = contextRe.findall(line)
                if start and not end:
                    doesPrint = True
                    #print("start: "+str(start.group()))
                elif conRe:
                    dokString += conRe[0]
                    #print("conre: "+dokString)
                    continue
                if doesPrint:
                    if start:
                        #print("start: "+str(start.group()))
                        dokString += start.group()
                    else:
                        dokString += line
                if not start and end:
                    doesPrint = False
                    #print("end: "+str(end.group()))
        dokArr[file] = re.sub("[\n\s\t]+"," ", dokString+"</xbrl>",flags=re.MULTILINE )
    #print(dokString)
    return dokArr

def getContextRef(docPath):
    """ 
    Extrats context references fra an xml document, and returns a dictionary with contexts
    
    Input
        docPath: The path to the diretory where the files are stored. 
    
    Output
        contexDic: A dictionary with contextsrefs as index and values as the translation
    """
    
    try:
        xbrlDok = expatbuilder.parse(docPath, False)
        #print(xbrlDok)
        #print(xbrlDok.documentElement)
        pref =""
        if xbrlDok.documentElement.tagName != xbrlDok.documentElement.localName:
            pref = str(re.match("\w+:", xbrlDok.documentElement.tagName).group())
              
        contexts = xbrlDok.documentElement.getElementsByTagName(pref+"context")
        contextDic = {}
        for i in contexts:
            contextDic[i.getAttribute("id")] = i.getElementsByTagName(pref+"identifier")[0].firstChild.nodeValue
            #print(i.getAttribute("id"))
            #print(i.getElementsByTagName(pref+"identifier")[0].firstChild.nodeValue)
        #print(contexts)
        return contextDic
    except:
        print("Well this is embarresing")
        return None
    
def getUnitRef(docPath):
    """ 
    Extrats unit references fra an xml document, and returns a dictionary with contexts
    
    Input
        docPath: The path to the diretory where the files are stored. 
    
    Output
        unitDic: A dictionary with contextsrefs as index and values as the translation
    """
    
    try:
        unitDic = {}
        xbrlDok = expatbuilder.parse(docPath, False)
        pref=""
        if xbrlDok.documentElement.tagName != xbrlDok.documentElement.localName:
            pref = str(re.match("\w+:", xbrlDok.documentElement.tagName).group())
        units = xbrlDok.documentElement.getElementsByTagName(pref+"unit")
        for i in units:
            unitDic[i.getAttribute("id")] = i.getElementsByTagName(pref+"measure")[0].firstChild.nodeValue
        return unitDic
    except:
        print("Well this is embarresing")
        return None
    
def replaceUnitsAndContexts(docPath,csvPath):
    """ 
    Transforms a context- and unit-references in csv file, such the real units and contexts are saved in the csv-file. 
    
    Input
        docPath: The path to the directory where the xml-files are stored. 
        csvPath: The path to the directory where the csvfiles are stored.
    
    Output
        contexDic: A dictionary with context refs as index and values as the translation
    """
    
    unitDict = getUnitRef(docPath) # Get unit references for document
    contextDict = getContextRef(docPath) # Get Context references for document
    #print(contextDict)
    try:
        newRows = []
        fieldNames = [] 
        with open(csvPath) as csvfile:
            file = csv.DictReader(csvfile)
            fieldNames = [re.sub("\\ufeff","",fieldName) for fieldName in file.fieldnames]
            #fieldNames = file.fieldnames
            print(fieldNames)
            for row in file:
                if row["unitRef"] != "":
                    if row["unitRef"] == None:
                        print(row["unitRef"])
                    row["unitRef"] = unitDict[row["unitRef"]]      
                if row["contextRef"] != "":
                    if row["contextRef"] == None:
                        print(row["contextRef"])
                    row["contextRef"] = contextDict[row["contextRef"]]
                if None in row.keys():
                    #print(row.keys())
                    secondDim = [row["Dimensions"],row[None][0]]
                    #print(secondDim)
                    row["Dimensions"] = secondDim
                    del row[None]
                newRows.append(row)
        with open(csvPath,"w+") as outputcsv:
            outputFile = csv.DictWriter(outputcsv,fieldnames=fieldNames)
            outputFile.writeheader()
            outputFile.writerows(newRows)
    except OSError:
        print("the file was not found")
    except KeyError:
        print("the file is already processed")
        
        
def postProcessing(docPath,csvPath,checkFile):
    """ 
    Wrapper for replaceUnitsAndContexts such that a naive "version" control can be made and parallel processing can be initiated.
    
    Input
        docPath: The path to the directory where the xml-files are stored. 
        csvPath: The path to the directory where the csvfiles are stored.
        checkFile: A file that monitors if a file has been updated with context and unitRefs
    
    Output
    """
    with open(checkFile,'w+') as checkList:
        checkDict = {}
        for line in checkList.read():
            print(line)
            
    replaceUnitsAndContexts(NEWPATH+"/2011-09-2733961871.xml",CSVFILES+"/2011-09-2733961871.xml.csv")

In [7]:
if __name__ == '__main__': 
    files = os.listdir(NEWPATH)
    replaceUnitsAndContexts(NEWPATH+"/2011-09-2733961871.xml",CSVFILES+"/2011-09-2733961871.xml.csv")


['Name', 'Dec', 'Prec', 'Lang', 'unitRef', 'contextRef', 'EntityIdentifier', 'Start', 'End/Instant', 'Value', 'Dimensions']

In [ ]: