In [1]:
import re
# Base name (without extension) of the Tabula export; it is reused later to
# derive the name of the corrected output file.
filename = 'tabula-Actelion_transparency-report-2015'
# Read every physical line of the raw CSV. The context manager closes the
# handle as soon as the read finishes (or fails) instead of leaking it for
# the lifetime of the kernel.
with open(filename+'.csv', 'r') as file:
    content = file.readlines()
# Preview the first ten raw lines.
content[:10]
Out[1]:
In [18]:
# Pattern: an optional '.', exactly two digits, an optional closing double
# quote, then end of string. Written as a raw string so that `\.` and `\d`
# reach the regex engine verbatim instead of being treated as (invalid)
# Python string escapes, which newer interpreters warn about.
twodigits = re.compile(r'\.?\d{2}"?$')
# Sanity check; twodigits.match('14"') matches as well.
twodigits.match('.14')
Out[18]:
In [21]:
c = 0  # number of records produced by the most recent join_broken_lines() call


def join_broken_lines(line_list):
    """Reassemble records that were split across several physical lines.

    Consecutive lines are concatenated (without their line terminator) until
    one of them ends a record. A line ends a record when it is non-empty and
    either its last character is a double quote or its last three characters
    match the module-level ``twodigits`` pattern.

    Args:
        line_list: iterable of strings, typically the result of
            ``file.readlines()``; a trailing ``'\\n'`` on each item is optional.

    Returns:
        A list of strings, one per reassembled record, each prefixed with its
        zero-based position in the list followed by ``") "``.

    Side effects:
        Sets the module-level ``c`` to the number of records returned.
    """
    global c
    full_lines = []
    pending = ''
    for line in line_list:
        # Strip only a real line terminator. Slicing off the last character
        # unconditionally would eat data from a final line that has no
        # trailing newline and shift the terminator checks by one character.
        text = line[:-1] if line.endswith('\n') else line
        pending += text
        if text and (text[-1] == '"' or twodigits.match(text[-3:])):
            # Number from the size of this call's output so that repeated
            # calls always start at 0, whatever `c` held before.
            full_lines.append(str(len(full_lines)) + ") " + pending)
            pending = ''
    # Keep trailing text that never reached a terminator instead of silently
    # dropping it; whitespace-only leftovers are ignored.
    if pending.strip():
        full_lines.append(str(len(full_lines)) + ") " + pending)
    c = len(full_lines)
    return full_lines
result = join_broken_lines(content)
# Preview the first five reassembled records, one per output line.
print("\n".join(result[:5]))
In [22]:
# Serialise the reassembled records, one per line (no trailing newline).
newcontent = "\n".join(result)
# NOTE(review): every item in `result` still starts with the "N) " counter
# added by join_broken_lines -- confirm that prefix is wanted in the CSV.
# The context manager flushes and closes the output even if write() raises.
with open(filename + '-corrected.csv', 'w') as fp:
    fp.write(newcontent)
In [ ]: