Converting MHT files (HTML in MIME format) to CSV files
Load the libraries
In [ ]:
import pandas as pd
import os
import sys
import mimetypes
import email
import glob
Store your .mht files in the same folder as this notebook.
The cell below collects their file names.
In [ ]:
mht_files = glob.glob(os.path.join(os.path.curdir, '*.mht'))
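As a quick sanity check (not part of the original notebook), the cell below prints how many .mht files glob found; an empty list usually means the files are in a different folder.
In [ ]:
# sanity check: show what glob found (assumes mht_files from the cell above)
print('found %d mht files' % len(mht_files))
for name in mht_files:
    print(name)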
The next cell parses the MHT files, splits them into parts by content type (HTML, JPEG, etc.) and writes each part to disk.
In [ ]:
for filepath in mht_files:
    # get the name of the file, e.g. ./31521derp.mht -> 31521derp
    filename_base = os.path.split(filepath)[-1].split('.mht')[0]
    # open mht file
    with open(filepath, 'r') as f:
        msg = email.message_from_file(f)
        # loop over the parts in the file
        for i, part in enumerate(msg.walk(), start=1):
            print('chunk %d is type: %s' % (i, part.get_content_type()))
            if part.get_content_maintype() == 'multipart':
                print('content type is multipart, skipping chunk %d' % i)
                continue
            ext = mimetypes.guess_extension(part.get_content_type())
            filename = filename_base + '_part-%03d%s' % (i, ext)
            filename = os.path.join(os.path.curdir, filename)
            print(filename)
            with open(filename, 'wb') as fp:
                fp.write(part.get_payload(decode=True))
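If you want to see what a single MHT file contains before writing anything to disk, the minimal inspection sketch below (not in the original notebook) only prints each part's content type and decoded size; it assumes mht_files is non-empty, and you can change the index to inspect another file.
In [ ]:
# inspect the MIME structure of the first mht file without writing output
# (assumption: mht_files[0] exists)
with open(mht_files[0], 'r') as f:
    sample_msg = email.message_from_file(f)
for i, part in enumerate(sample_msg.walk(), start=1):
    # multipart containers have no decodable payload, so guard against None
    payload = part.get_payload(decode=True)
    size = len(payload) if payload is not None else 0
    print('part %d: %s (%d bytes)' % (i, part.get_content_type(), size))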
Get the names of the extracted files that contain only HTML content.
In [ ]:
html_files = glob.glob(os.path.join(os.path.curdir, '*part*.htm*'))
In [ ]:
html_files
Loop over the files, clip the unnecessary data and store the CSV files.
In [ ]:
for filepath in html_files:
    filename_base = os.path.split(filepath)[-1].split('_')[0]
    # read in the html, the result is a list of pandas dataframes
    input_html = pd.read_html(filepath, thousands='')
    # the data of interest appears in every third dataframe, starting at index
    # two; the end is at -6 to clip the unnecessary data at the end:
    # processed_html = input_html[2:-6:3]
    # this seems to work better, because it checks whether a decimal
    # separator (,) exists in the string
    processed_html = [x for x in input_html if ',' in str(x[0][0])]
    # take the (first) row of each dataframe as a Series
    processed_html_values = [x.iloc[0] for x in processed_html]
    # concat the Series into one dataframe
    df_processed_data = pd.concat(processed_html_values, axis=1)
    # DEPRECATED: the index is only needed if you need the first table.
    # add the index: the values of the first column of any (here the first)
    # dataframe in processed_html
    #df_processed_data.index = processed_html[0][0].values
    # write to file:
    #filepath_output = os.path.join(os.path.curdir, filename_base + '.csv')
    #df_processed_data.to_csv(filepath_output, encoding='utf-8')
    # write transposed to file:
    filepath_output = os.path.join(os.path.curdir, filename_base + '_transposed.csv')
    df_processed_data.T.to_csv(filepath_output, encoding='utf-8')
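To verify the result, the sketch below (an addition, not part of the original notebook) reads one of the transposed CSV files back with pandas and shows its shape and first rows; it reuses filename_base from the last loop iteration, so adapt the path if you want to check a different file.
In [ ]:
# read back the last written csv as a quick check
# (assumption: filename_base still holds the base name from the last iteration)
check_path = os.path.join(os.path.curdir, filename_base + '_transposed.csv')
df_check = pd.read_csv(check_path, index_col=0)
print(df_check.shape)
df_check.head()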