In [360]:
import ssl
import urllib2

from bs4 import BeautifulSoup as bs

# download a page and return its parsed soup
def load_page(url):
    # the site's SSL certificate fails verification, so skip the check
    # http://stackoverflow.com/questions/27835619/ssl-certificate-verify-failed-error
    context = ssl._create_unverified_context()
    response = urllib2.urlopen(url, context=context)
    html = response.read()
    soup = bs(html, 'lxml')  # name the parser explicitly to avoid the bs4 warning
    return soup
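
# example usage (network call, so left commented out):
#   soup = load_page('https://www.kejaksaan.go.id/berkas-dakwaan.php?hal=1&unt=1')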

def remove_double_whitespace(text):
    return '\n'.join(' '.join(line.split()) for line in text.split('\n'))
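
# illustrative check of the expected behaviour:
assert remove_double_whitespace('a   b \n  c  d') == 'a b\nc d'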

def convert_htmltable_to_dict(table_body):
    # http://stackoverflow.com/questions/23377533/python-beautifulsoup-parsing-table
    rows = table_body.find_all('tr')
    data = []
    for i_row, row in enumerate(rows):
        cols = row.find_all('td')
        parsed_cols = []
        for i_ele, ele in enumerate(cols):
            # past the two header rows (i_row > 1), the last column holds the
            # detail link, so keep its href instead of its text
            if (i_ele == len(cols) - 1) and i_row > 1:
                parsed_cols.append(ele.find('a').get('href'))
            else:
                parsed_cols.append(remove_double_whitespace(ele.text.strip()))
        data.append(parsed_cols)
    return data
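
# despite the name, the result is a list of row-lists, not a dict; for the
# listing page it looks like (cf. the pprint output further down):
#   [[u'Total Data Berkas Dakwaan ...'],
#    [u'No. Urut', u'No. Perkara', ...],
#    [u'1', u'PDM-34/JKTUT/2014', ..., './berkas-dakwaan.php?idd=368&unt=1']]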

def find_between(s, first, last):
    # substring of s between the first occurrences of first and last
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
        return s[start:end]
    except ValueError:
        return ""

def find_table(tags_soup):
    # placeholder: locate and return the table soup
    pass

def parse_html(page_soup):
    # placeholder: find the table in the page soup and parse it
    pass

class PageLoadError(Exception):
    """Error when loading page"""

# Berkas: one case-file row from the listing table
class Berkas(object):
    base_url = 'https://www.kejaksaan.go.id'
    
    def __init__(self, data):
        self.nomor_urut = data[0]
        self.nomor_perkara = data[1]
        self.nama_terdakwa = data[2]  # TODO parse the alias
        self.tanggal_dakwaan = data[3]  # TODO convert to a python date
        self.wilayah_hukum = data[4]
        self.detail_link = data[5]

        # the scraped link starts with './', so drop the dot before joining
        self.url = self.base_url + self.detail_link.lstrip('.')

    def convert_htmltable_to_dict(self, table_body):
        # like the module-level helper, but the detail table has no link
        # column, so every cell is kept as plain text
        # http://stackoverflow.com/questions/23377533/python-beautifulsoup-parsing-table
        rows = table_body.find_all('tr')
        data = []
        for row in rows:
            cols = row.find_all('td')
            data.append([remove_double_whitespace(ele.text.strip()) for ele in cols])
        return data

    def load_page(self):
        self.page_html = load_page(self.url)
        
    def load_detail(self):
        self.load_page()
        # the detail table is the one whose cells mention 'Nomor Perkara'
        cell = filter(lambda x: 'Nomor Perkara' in x.text, self.page_html.find_all('td'))
        table = cell[2].parent.parent
        self.table_dict = self.convert_htmltable_to_dict(table)
        
class Pidana(object):
    base_url = 'https://www.kejaksaan.go.id'
    api_url = '/berkas-dakwaan.php?'
    
    def __init__(self):
        self.max_page = None
        self.total_berkas = None
    
    def load_page(self, page_num=None):
        self.daftar_berkas = []

        url = self.base_url + self.api_url + self.api_attribute
        url_template = url + "&hal=%s"

        if page_num is None:
            # page 1 first; parse_page() fills in max_page and total_berkas
            url_dest = url_template % str(1)
            self.parse_page(url_dest)

            # then crawl the remaining pages, unless max_page is still unknown
            if self.max_page is not None:
                for x in range(2, self.max_page + 1):
                    url_dest = url_template % str(x)
                    self.parse_page(url_dest)
            else:
                raise PageLoadError('max page could not be determined')
        else:
            url_dest = url_template % str(page_num)
            self.parse_page(url_dest)
        # TODO raise error when page not found
            
    def parse_page(self, url):
        page_html = load_page(url)

        # the listing table is the one whose cell mentions "Total Data Berkas"
        filter_result = filter(lambda x: "Total Data Berkas" in x.text, page_html.find_all('td'))
        table = filter_result[1].parent.parent
        dict_result = convert_htmltable_to_dict(table)

        # the first row reads e.g. "Total Data Berkas Dakwaan Pidana Umum : 221 perkara"
        if self.total_berkas is None:
            self.total_berkas = int(find_between(dict_result[0][0], ':', 'perkara').strip())

        # the ">>" pagination link points at the last page, e.g. "...hal=12&..."
        if self.max_page is None:
            last_page_url = filter(lambda x: ">>" in x.text, page_html.find_all('a', {'class': 'mn2'}))
            self.max_page = int(find_between(last_page_url[0].get('href'), 'hal=', '&'))

        # rows 0 and 1 are the total line and the column headers; the rest are berkas
        for y, x in enumerate(dict_result):
            if y > 1:
                self.daftar_berkas.append(Berkas(x))

class PidanaUmum(Pidana):
    api_attribute = 'unt=1'

    def __init__(self):
        super(PidanaUmum, self).__init__()

class PidanaKhusus(Pidana):
    api_attribute = 'unt=2'
    
    def __init__(self):
        super(PidanaKhusus, self).__init__()
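
# intended top-level usage (network-dependent, so left commented out):
#   pu = PidanaUmum()
#   pu.load_page(30)   # fetch a single listing page
#   pu.load_page()     # or crawl pages 1 .. max_page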

In [358]:
pu = PidanaUmum()

In [359]:
pu.load_page(30)

In [63]:
phtml = load_page('https://www.kejaksaan.go.id/berkas-dakwaan.php?hal=1&unt=1')

In [296]:
last_page_url = filter(lambda x:">>" in x.text, phtml.find_all('a',{'class':'mn2'}))
find_between(last_page_url[0].get('href'),'hal=','&')


Out[296]:
'12'

In [64]:
filter_result = filter(lambda x:"Total Data Berkas" in x.text, phtml.find_all('td'))

In [141]:
table = filter_result[1].parent.parent
#filter_result[1].parent.name
dict_result = convert_htmltable_to_dict(table)
from pprint import pprint
pprint(dict_result[:3])


[[u'Total Data Berkas Dakwaan\nPidana Umum :\n221 perkara'],
 [u'No. Urut',
  u'No. Perkara',
  u'Nama Terdakwa',
  u'Tanggal Dakwaan',
  u'Wilayah Hukum',
  u'Aksi'],
 [u'1',
  u'PDM-34/JKTUT/2014',
  u'ADE SUNARYO BIN AJU',
  u'29-01-2014',
  u'KEJAKSAAN NEGERI JAKARTA UTARA',
  './berkas-dakwaan.php?idd=368&unt=1']]

In [284]:
daftar_berkas = []
for y, x in enumerate(dict_result):
    if y > 1:  # skip the total line and the header row
        daftar_berkas.append(Berkas(x))

berkas = daftar_berkas[0]
berkas.load_page()

print daftar_berkas[:2]


[<__main__.Berkas object at 0xae2c1e4c>, <__main__.Berkas object at 0xae2c132c>]

In [270]:
berkas.url


Out[270]:
'https://www.kejaksaan.go.id/berkas-dakwaan.php?idd=368&unt=1'

In [274]:
cell = filter(lambda x: 'Nomor Perkara' in x.text, berkas.page_html.find_all('td'))
table = cell[2].parent.parent
table_dict = berkas.convert_htmltable_to_dict(table)

In [275]:
table_dict


Out[275]:
[[u'Nomor Perkara', u'PDM-34/JKTUT/2014'],
 [u'Jenis Dakwaan', u'Pidana Umum'],
 [u'Wilayah Hukum', u'KEJAKSAAN NEGERI JAKARTA UTARA'],
 [u'Nama JPU',
  u'JUNIATI TINA MELINDA|230027078|Jaksa Fungsional|III/c (Jaksa Pratama)#MALINI SIANTURI|230022872|Jaksa Fungsional|III/d (Jaksa Muda)#'],
 [u'Tanggal Dakwaan', u'29-01-2014'],
 [u'Summary Dakwaan',
  u'TINDAK PIDANA PENCURIAN DENAGN PEMBERTAN YANG\nTERJADI PADA HARI SELASA TANGGAL 18 JUNI 2013\nSEKIRA PUKUL 07.30 WIB BERTEMPAT JL. PADEMANGAN\nIIA GG.3 NO.21A RT.006/006 KEL. PADEMANGAN KEC.\nPADEMANGAN JAKARTA UTARA'],
 [u'Tuntutan', ''],
 [u'Identitas tersangka/terdakwa'],
 [u'Terdakwa 1', ''],
 [u'Nama', u'ADE SUNARYO BIN AJU'],
 [u'Tempat/tanggal lahir', u'CIANJUR / 02-06-1980'],
 [u'Jenis Kelamin', u'LAKI-LAKI'],
 [u'Warga negara', u'Indonesia'],
 [u'Tempat tinggal',
  u'JL. GUNG BATU RT.14/004 KEL. SINARGALIH KEC. SINDANG BARAT KAB. CIANJUR JAWA BARAT'],
 [u'Agama', u'ISLAM'],
 [u'Pekerjaan', u'WIRASASTA'],
 [u'Pendidikan', u'SMA/SLTA'],
 [u'Amar Putusan PN',
  u'PIDANA BADAN 1 TAHUN 6 BULAN 0 HARI BIAYA PERKARA SEBESAR Rp 5000'],
 [u'Status', u'KEKUATAN HUKUM TETAP']]

In [279]:
berkas.load_page()
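
Putting the pieces together: a minimal end-to-end sketch (assuming the site
still serves the same markup; the sleep is my addition, just basic politeness
toward the server, and was not part of the original run):

In [ ]:
import time

pu = PidanaUmum()
pu.load_page()  # crawl listing pages 1 .. max_page
print '%d berkas collected (site reports %d)' % (len(pu.daftar_berkas), pu.total_berkas)

for berkas in pu.daftar_berkas[:3]:  # first few only while testing
    berkas.load_detail()
    print berkas.nomor_perkara, berkas.table_dict[0]
    time.sleep(1)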