Table of Contents

Converting .mht files (web pages saved in MIME HTML format) to CSV files

Load libraries


In [ ]:
import pandas as pd
import os
import sys
import mimetypes
import email
import glob

Store your .mht files in the same folder as this notebook.

The cell below gets the file names.


In [ ]:
# Collect the paths of all .mht files located next to this notebook.
search_pattern = os.path.join(os.path.curdir, '*.mht')
mht_files = glob.glob(search_pattern)

The next cell parses the .mht files, splits them into parts by content type (HTML, JPEG, etc.) and writes each part to disk.


In [ ]:
# Split every .mht file into its MIME parts and write each part to disk.
for filepath in mht_files:
    # Get the base name of the file, e.g. ./31521derp.mht -> 31521derp
    filename_base = os.path.split(filepath)[-1].split('.mht')[0]
    # Open in binary mode and use the binary parser: .mht containers can hold
    # non-text payloads, and text-mode reading may raise UnicodeDecodeError
    # (or silently mangle bytes) depending on the platform's default encoding.
    with open(filepath, 'rb') as f:
        msg = email.message_from_binary_file(f)
    # Walk the full MIME tree; start counting at 1 for human-friendly output.
    for i, part in enumerate(msg.walk(), start=1):
        print('chunk %d is type: %s' % (i, part.get_content_type()))
        if part.get_content_maintype() == 'multipart':
            # multipart nodes are containers with no payload of their own
            print('content type is multipart, skipping chunk %d' % i)
            continue
        # decode=True returns None when the part has no decodable body;
        # writing None would crash, so skip such parts explicitly.
        payload = part.get_payload(decode=True)
        if payload is None:
            print('chunk %d has no decodable payload, skipping' % i)
            continue
        # guess_extension returns None for unknown MIME types, which would
        # otherwise produce filenames like "..._part-001None"; fall back to .bin.
        ext = mimetypes.guess_extension(part.get_content_type()) or '.bin'
        filename = filename_base + '_part-%03d%s' % (i, ext)
        filename = os.path.join(os.path.curdir, filename)
        print(filename)
        with open(filename, 'wb') as fp:
            fp.write(payload)

Get the names of the extracted part files that contain only HTML content.


In [ ]:
# Gather the extracted part files that contain HTML (.htm / .html).
html_pattern = os.path.join(os.path.curdir, '*part*.htm*')
html_files = glob.glob(html_pattern)

In [ ]:
# Show the discovered HTML part files (bare last expression uses rich display).
html_files

Loop over the HTML files, clip the unnecessary data and store the result as CSV files.


In [ ]:
# Convert each extracted HTML part back into one transposed CSV file per source.
for filepath in html_files:
    # Recover the original base name, e.g. ./31521derp_part-002.htm -> 31521derp
    filename_base = os.path.split(filepath)[-1].split('_')[0]
    # pd.read_html returns a list of DataFrames, one per <table> in the page;
    # thousands='' disables the default thousands-separator stripping.
    input_html = pd.read_html(filepath, thousands='')
    # the data of interest appears every three dataframes, starting from index
    # two, the end is at -6 to clip the unnecessary data at the end.
    # processed_html = input_html[2:-6:3]
    # This filter seems to work better: keep a table only when its first cell
    # (column 0, row 0) contains a decimal separator (',').
    # NOTE(review): x[0][0] is chained indexing and assumes the tables have a
    # column labelled 0 (header-less read) — verify against the actual data.
    processed_html = [x for x in input_html if ',' in str(x[0][0])]
    # Take the first row of each kept table as a Series (drops the table index).
    processed_html_values = [x.iloc[0] for x in processed_html]
    # Combine the per-table rows side by side into a single DataFrame.
    df_processed_data = pd.concat(processed_html_values, axis=1)
    # DEPRECATED: the index is only needed if you need the first table.
    # add the index: the values of the first column of any (here the first) df
    # in processed_html
    #df_processed_data.index = processed_html[0][0].values
    # write to file:
    #filepath_output = os.path.join(os.path.curdir, filename_base + '.csv')
    #df_processed_data.to_csv(filepath_output, encoding='utf-8')
    # Write the transposed table to <base>_transposed.csv next to the notebook.
    filepath_output = os.path.join(os.path.curdir, filename_base + '_transposed.csv')
    df_processed_data.T.to_csv(filepath_output, encoding='utf-8')

In [ ]:


In [ ]: