Table of Contents

Converting .mht files (web pages saved in MIME HTML format) to CSV files

Load libraries


In [ ]:
import pandas as pd
import os
import sys
import mimetypes
import email
import glob

Store your .mht files in the same folder as this notebook.

The cell below gets the file names.


In [ ]:
# Collect the paths of all .mht files located next to this notebook.
search_pattern = os.path.join(os.path.curdir, '*.mht')
mht_files = glob.glob(search_pattern)

The next cell parses the .mht files, splits them into parts by content type (HTML, JPEG, etc.) and writes each part to disk.


In [ ]:
# Split every .mht file into its MIME parts and write each part to disk.
for filepath in mht_files:
    # Get the base name of the file, e.g. ./31521derp.mht -> 31521derp
    filename_base = os.path.split(filepath)[-1].split('.mht')[0]
    # Open in binary mode and use the binary parser: .mht containers can hold
    # non-text payloads, and text-mode reading may raise UnicodeDecodeError
    # (or silently mangle bytes) depending on the platform's default encoding.
    with open(filepath, 'rb') as f:
        msg = email.message_from_binary_file(f)
    # Walk the full MIME tree; start counting at 1 for human-friendly output.
    for i, part in enumerate(msg.walk(), start=1):
        print('chunk %d is type: %s' % (i, part.get_content_type()))
        if part.get_content_maintype() == 'multipart':
            # multipart nodes are containers with no payload of their own
            print('content type is multipart, skipping chunk %d' % i)
            continue
        # decode=True returns None when the part has no decodable body;
        # writing None would crash, so skip such parts explicitly.
        payload = part.get_payload(decode=True)
        if payload is None:
            print('chunk %d has no decodable payload, skipping' % i)
            continue
        # guess_extension returns None for unknown MIME types, which would
        # otherwise produce filenames like "..._part-001None"; fall back to .bin.
        ext = mimetypes.guess_extension(part.get_content_type()) or '.bin'
        filename = filename_base + '_part-%03d%s' % (i, ext)
        filename = os.path.join(os.path.curdir, filename)
        print(filename)
        with open(filename, 'wb') as fp:
            fp.write(payload)

Get the names of the extracted part files that contain only HTML content.


In [ ]:
# Gather the extracted part files that contain HTML (.htm / .html).
html_pattern = os.path.join(os.path.curdir, '*part*.htm*')
html_files = glob.glob(html_pattern)

In [ ]:
# Show the discovered HTML part files (bare last expression uses rich display).
html_files

Loop over the HTML files, clip the unnecessary data and store the result as CSV files.


In [ ]:
# Convert each extracted HTML part back into one transposed CSV file per source.
for filepath in html_files:
    # Recover the original base name, e.g. ./31521derp_part-002.htm -> 31521derp
    filename_base = os.path.split(filepath)[-1].split('_')[0]
    # pd.read_html returns a list of DataFrames, one per <table> in the page;
    # thousands='' disables the default thousands-separator stripping.
    input_html = pd.read_html(filepath, thousands='')
    # the data of interest appears every three dataframes, starting from index
    # two, the end is at -6 to clip the unnecessary data at the end.
    # processed_html = input_html[2:-6:3]
    # This filter seems to work better: keep a table only when its first cell
    # (column 0, row 0) contains a decimal separator (',').
    # NOTE(review): x[0][0] is chained indexing and assumes the tables have a
    # column labelled 0 (header-less read) — verify against the actual data.
    processed_html = [x for x in input_html if ',' in str(x[0][0])]
    # Take the first row of each kept table as a Series (drops the table index).
    processed_html_values = [x.iloc[0] for x in processed_html]
    # Combine the per-table rows side by side into a single DataFrame.
    df_processed_data = pd.concat(processed_html_values, axis=1)
    # DEPRECATED: the index is only needed if you need the first table.
    # add the index: the values of the first column of any (here the first) df
    # in processed_html
    #df_processed_data.index = processed_html[0][0].values
    # write to file:
    #filepath_output = os.path.join(os.path.curdir, filename_base + '.csv')
    #df_processed_data.to_csv(filepath_output, encoding='utf-8')
    # Write the transposed table to <base>_transposed.csv next to the notebook.
    filepath_output = os.path.join(os.path.curdir, filename_base + '_transposed.csv')
    df_processed_data.T.to_csv(filepath_output, encoding='utf-8')

In [ ]:


In [ ]: