In this notebook we create draft code for treating the data as supplied in raw form by the "Dados Abertos" initiative: http://dados.prefeitura.sp.gov.br/dataset/diario-oficial-da-cidade-de-sao-paulo
We need to unzip all the files and then add the date and data-type information to each text file. The data type will be useful when we run a clustering algorithm, and embedding the date in the file itself lets us treat each file individually, without caring which folder it came from. It is actually very surprising that the data provided in raw form does not already give us that information intrinsically...
In [1]:
import zipfile
In [2]:
# Define a couple of helper functions for formatting paths
def format_file(path, folder, file):
    return path + "/" + folder + "/" + file

def format_folder(path, folder):
    return path + "/" + folder

# Little helper function to get immediate subdirectories
import os
def get_immediate_subdirectories(a_dir):
    return [name for name in os.listdir(a_dir)
            if os.path.isdir(os.path.join(a_dir, name))]
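As a quick standalone sanity check (a sketch of my own, not part of the original notebook), the helpers simply join path segments with forward slashes:

```python
# Hypothetical standalone demo of the path helpers defined above.
def format_file(path, folder, file):
    return path + "/" + folder + "/" + file

def format_folder(path, folder):
    return path + "/" + folder

print(format_folder("../raw", "2017"))         # ../raw/2017
print(format_file("../raw", "2017", "a.zip"))  # ../raw/2017/a.zip
```

On POSIX systems this is equivalent to `os.path.join`, which would also be a reasonable choice here.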
In [3]:
# All the data to be processed is kept here, in zipped format
relative_root_path = "../raw"
processed_data_path = "../raw/processed"
folders = get_immediate_subdirectories(relative_root_path)
for folder in folders:
    files = os.listdir(format_folder(relative_root_path, folder))
    for file in files:
        with zipfile.ZipFile(format_file(relative_root_path, folder, file), "r") as zip_ref:
            # The context manager closes the archive; no explicit close() is needed
            zip_ref.extractall(format_folder(processed_data_path, file))
Now that all the files have been extracted, we need to put them in a structure. The first step is to get the date and type (licitacao or publicacao) information and add it to each document.
In [4]:
# First let's define a couple of helper functions to prepend data to a text file:
def line_prepender(filename, line):
    with open(filename, 'r+') as f:
        content = f.read()
        f.seek(0, 0)
        f.write(line.rstrip('\r\n') + '\n' + content)

def format_line(date, data_type):
    # "06 " completes the 4-digit date taken from the folder name with the year
    return date + "06 " + data_type + " "
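To illustrate how the prepending works, here is a throwaway demonstration on a temporary file (a standalone sketch that redefines the two helpers above, so it can run outside the notebook):

```python
import os
import tempfile

def line_prepender(filename, line):
    # Same logic as the notebook's helper: read everything, rewind, rewrite.
    with open(filename, 'r+') as f:
        content = f.read()
        f.seek(0, 0)
        f.write(line.rstrip('\r\n') + '\n' + content)

def format_line(date, data_type):
    return date + "06 " + data_type + " "

# Demonstrate on a throwaway file.
fd, path = tempfile.mkstemp(suffix=".txt")
with os.fdopen(fd, 'w') as f:
    f.write("original body\n")

line_prepender(path, format_line("0101", "Licitacao"))
with open(path) as f:
    print(f.read())  # first line is now "010106 Licitacao "
os.remove(path)
```

Note that `line_prepender` reads the whole file into memory, which is fine for these small text files but would not scale to very large ones.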
In [5]:
dates = get_immediate_subdirectories(processed_data_path)
for date_uf in dates:
    # The date is the first 4 digits of the name of the topmost folder in each structure
    date = date_uf[:4]
    path_to_date = format_folder(processed_data_path, date_uf)
    second_level = get_immediate_subdirectories(path_to_date)
    for directoryL2 in second_level:
        # Sadly the data structure is very dirty, with the order of folders
        # being swapped around in some cases, so we need to do two loops.
        new_date = format_folder(path_to_date, directoryL2)
        if directoryL2 == "Licitacao" or directoryL2 == "Publicacao":
            data_type = directoryL2
            for filename in os.listdir(new_date):
                if filename.endswith(".txt") or filename.endswith(".TXT"):
                    line = format_line(date, data_type)
                    line_prepender(format_folder(new_date, filename), line)
        else:
            third_level = get_immediate_subdirectories(new_date)
            for directoryL3 in third_level:
                if directoryL3 == "Licitacao" or directoryL3 == "Publicacao":
                    data_type = directoryL3
                    for filename in os.listdir(format_folder(new_date, directoryL3)):
                        if filename.endswith(".txt") or filename.endswith(".TXT"):
                            # Prepend each file with the date (adding the year) and the data type (licitacao or publicacao)
                            line = format_line(date, data_type)
                            line_prepender(format_folder(format_folder(new_date, directoryL3), filename), line)
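The two nested loops above work around the inconsistent folder ordering. An alternative sketch (my suggestion, not the notebook's code) uses `os.walk` to traverse the whole tree and tag any `.txt` file that sits under a `Licitacao` or `Publicacao` folder, regardless of nesting depth:

```python
import os

def find_tagged_txt_files(root):
    """Return (path, data_type) for every .txt file under a
    Licitacao or Publicacao folder, at any depth below root."""
    results = []
    for dirpath, dirnames, filenames in os.walk(root):
        # Look for the data-type folder anywhere in the current path
        parts = dirpath.split(os.sep)
        data_type = next((p for p in parts if p in ("Licitacao", "Publicacao")), None)
        if data_type is None:
            continue
        for filename in filenames:
            if filename.lower().endswith(".txt"):
                results.append((os.path.join(dirpath, filename), data_type))
    return results
```

The files found this way could then be fed to `line_prepender` exactly as above, removing the need for the explicit two-level special-casing.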
In [ ]: