In [1]:
#Python3
from xml.etree import ElementTree
import os
import urllib.request
import time
import csv

In [2]:
# Prefix -> namespace URI map for the ARES XML responses. Pass it as the
# `namespaces` argument of ElementTree find*/findall so that paths can be
# written as './/dtt:ic' instead of spelling out the full '{uri}tag' form.
nmsp = dict([
    # data types, schema version 1.0.4
    ('dtt', 'http://wwwinfo.mfcr.cz/ares/xml_doc/schemas/ares/ares_datatypes/v_1.0.4'),
    # answer envelope, schema version 1.0.1
    ('are', 'http://wwwinfo.mfcr.cz/ares/xml_doc/schemas/ares/ares_answer/v_1.0.1'),
    # data types, schema version 1.0.3
    ('D', 'http://wwwinfo.mfcr.cz/ares/xml_doc/schemas/ares/ares_datatypes/v_1.0.3'),
])

In [3]:
# Processed batches: work out which ARES change batches still need processing.
#
# `names` maps the ARES source number (the `cislo_zdroje` query parameter) to
# the file-name prefix used for that source's bookkeeping and output files.
names = {'2': 'firmy', '4': 'osvc'}

# Results of this cell: batch numbers (as strings) that the service lists but
# that are not yet recorded in the corresponding <prefix>_zprac.csv file.
davky_osvc = []
davky_firmy = []


for key, prefix in names.items():
    # Batch numbers recorded by earlier runs (first column of the CSV).
    # newline='' is what the csv module expects for files it reads; `if row`
    # skips blank lines, which csv.reader yields as empty lists and which
    # would otherwise raise IndexError on row[0].
    with open('/Users/jancibulka/DEVEL/DATA/ares-zmeny/data/' + prefix + '_zprac.csv', 'r', newline='') as dvk:
        reader = csv.reader(dvk, delimiter=',', quotechar='"')
        davky_zprac = [row[0] for row in reader if row]

    # Batch numbers reported by the service. The response is parsed inside a
    # `with` block so the HTTP connection is closed as soon as parsing ends.
    with urllib.request.urlopen('http://wwwinfo.mfcr.cz/cgi-bin/ares/darv_zm.cgi?cislo_zdroje=' + key + '&cislo_davky_od=2&cislo_davky_do=3') as url:
        doc = ElementTree.parse(url)
    # Elements without text are ignored: they carry no batch number to request.
    davky_akt = [node.text for node in doc.findall('.//dtt:C_davky', nmsp) if node.text]

    # Keep the service's order; the set only makes the membership test cheap.
    zpracovane = set(davky_zprac)
    davky = [davka for davka in davky_akt if davka not in zpracovane]

    if key == '2':
        davky_firmy = davky
    else:
        davky_osvc = davky

In [5]:
# FIRMY (companies)
#
# For every pending batch in `davky_firmy`: download the batch, look up each
# listed identification number whose `p` attribute is 'N' in the darv_or.cgi
# register service, and append one row per subject to out_firmy.csv. Once a
# batch has been fully written, its number is appended to firmy_zprac.csv so
# that later runs skip it.

# `with` guarantees the output file is flushed and closed even when a request
# or XML parse fails part-way through; newline='' is what the csv module
# expects for files it writes.
with open('/Users/jancibulka/DEVEL/DATA/ares-zmeny/data/out_firmy.csv', 'a', newline='') as out:
    outwriter = csv.writer(out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    for davka in davky_firmy:
        # Parse inside `with` so the HTTP response is closed right away.
        with urllib.request.urlopen('http://wwwinfo.mfcr.cz/cgi-bin/ares/darv_zm.cgi?cislo_zdroje=2&cislo_davky_od={0}&cislo_davky_do={0}'.format(davka)) as url:
            doc = ElementTree.parse(url)

        for node in doc.findall('.//dtt:ic', nmsp):
            # .get() instead of attrib['p']: an entry without the attribute is
            # skipped rather than aborting the whole run with KeyError.
            if node.get('p') != 'N':
                continue

            ic = (node.text or '').strip()
            if not ic:
                # No identification number to look up.
                continue

            with urllib.request.urlopen('http://wwwinfo.mfcr.cz/cgi-bin/ares/darv_or.cgi?ico=' + ic) as addr:
                bulk = ElementTree.parse(addr)

            # findtext() returns `default` when the element is missing and ''
            # when it is present but empty, so .strip() never receives None.
            firma = bulk.findtext('.//D:OF', default='', namespaces=nmsp).strip()
            ulice = bulk.findtext('.//D:NU', default='', namespaces=nmsp).strip()
            cis_domu = bulk.findtext('.//D:CD', default='', namespaces=nmsp).strip()
            obec = bulk.findtext('.//D:N', default='', namespaces=nmsp).strip()
            psc = bulk.findtext('.//D:PSC', default='', namespaces=nmsp).strip()
            zapsano = bulk.findtext('.//D:DZOR', default='', namespaces=nmsp).strip()

            # One "<F> - <J> <P> <DN>" string per CSO element of the response.
            statut = []
            for statutar in bulk.findall('.//D:CSO', nmsp):
                fce = statutar.findtext('.//D:F', default='', namespaces=nmsp).strip()
                jmeno = statutar.findtext('.//D:J', default='', namespaces=nmsp).strip()
                prijmeni = statutar.findtext('.//D:P', default='', namespaces=nmsp).strip()
                nar = statutar.findtext('.//D:DN', default='', namespaces=nmsp).strip()
                statut.append(fce + ' - ' + jmeno + ' ' + prijmeni + ' ' + nar)

            # `statut` is passed as a list on purpose: csv stringifies it with
            # str(), which keeps the column format of rows already in the file.
            outwriter.writerow([ic, firma, ulice, cis_domu, obec, psc, zapsano, statut])
            # Pause between lookups to avoid hammering the service.
            time.sleep(0.75)

        # Record the batch only after all of its rows have been written.
        with open('/Users/jancibulka/DEVEL/DATA/ares-zmeny/data/firmy_zprac.csv', 'a', newline='') as dvk:
            writer = csv.writer(dvk, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow([davka])

In [12]:
# Display the OSVC batch numbers that are still waiting to be processed.
davky_osvc


Out[12]:
['236']

In [19]:
# OSVC (self-employed persons)
#
# For every pending batch in `davky_osvc`: download the batch, look up each
# listed identification number whose `p` attribute is 'N' in the darv_std.cgi
# service, and append one row per subject to out_osvc.csv. Once a batch has
# been fully written, its number is appended to osvc_zprac.csv so that later
# runs skip it.

# `with` guarantees the output file is flushed and closed even when a request
# or XML parse fails part-way through; newline='' is what the csv module
# expects for files it writes.
with open('/Users/jancibulka/DEVEL/DATA/ares-zmeny/data/out_osvc.csv', 'a', newline='') as out:
    outwriter = csv.writer(out, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    for davka in davky_osvc:
        # Parse inside `with` so the HTTP response is closed right away.
        with urllib.request.urlopen('http://wwwinfo.mfcr.cz/cgi-bin/ares/darv_zm.cgi?cislo_zdroje=4&cislo_davky_od={0}&cislo_davky_do={0}'.format(davka)) as url:
            doc = ElementTree.parse(url)

        for node in doc.findall('.//dtt:ic', nmsp):
            # .get() instead of attrib['p']: an entry without the attribute is
            # skipped rather than aborting the whole run with KeyError.
            if node.get('p') != 'N':
                continue

            ic = (node.text or '').strip()
            if not ic:
                # No identification number to look up.
                continue

            with urllib.request.urlopen('http://wwwinfo.mfcr.cz/cgi-bin/ares/darv_std.cgi?ico=' + ic) as addr:
                bulk = ElementTree.parse(addr)

            # findtext() returns `default` when the element is missing and ''
            # when it is present but empty, so .strip() never receives None.
            firma = bulk.findtext('.//are:Obchodni_firma', default='', namespaces=nmsp).strip()
            ulice = bulk.findtext('.//dtt:Nazev_ulice', default='', namespaces=nmsp).strip()
            cis_domu = bulk.findtext('.//dtt:Cislo_domovni', default='', namespaces=nmsp).strip()
            obec = bulk.findtext('.//dtt:Nazev_obce', default='', namespaces=nmsp).strip()
            psc = bulk.findtext('.//dtt:PSC', default='', namespaces=nmsp).strip()
            adresa_text = bulk.findtext('.//dtt:Adresa_textem', default='', namespaces=nmsp).strip()
            zapsano = bulk.findtext('.//are:Datum_vzniku', default='', namespaces=nmsp).strip()

            outwriter.writerow([ic, firma, ulice, cis_domu, obec, psc, adresa_text, zapsano])
            # Pause between lookups to avoid hammering the service.
            time.sleep(0.75)

        # Record the batch only after all of its rows have been written.
        with open('/Users/jancibulka/DEVEL/DATA/ares-zmeny/data/osvc_zprac.csv', 'a', newline='') as dvk:
            writer = csv.writer(dvk, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow([davka])