In [1]:
import csv
# Load the listing of source files. Each entry of `files` is one parsed row of
# files.csv; later cells use the first column of each row as a file name.
# The handle is opened in a `with` block so it is closed as soon as the rows
# have been read, instead of being left open for the life of the kernel.
with open('files.csv') as fh:
    r = csv.reader(fh)
    files = list(r)
len(files)
Out[1]:
In [2]:
# Preview the first two rows parsed from files.csv.
files[:2]
Out[2]:
In [85]:
# Relative path of the directory that holds the per-source CSV files.
DATA_DIR = 'data'

# Header names for the combined output file, in column order.
FIELDS = [
    "CSV Source date",
    "Country",
    "Data as of",
    "Case definition",
    "Number of cases (Cumulative)",
    "Number of cases (In past 21 days)",
    "Number of deaths (cumulative)",
]
In [12]:
# Read the first listed file on its own to inspect its shape before the full
# pass. The file is opened in a `with` block so the handle is closed once the
# rows are in memory; `testfile` is therefore exhausted after this cell.
with open('%s/%s.csv' % (DATA_DIR, files[0][0])) as fh:
    testfile = csv.reader(fh)
    data = list(testfile)
# Number of columns in the first row of that file.
len(data[0])
Out[12]:
In [119]:
def _clean_cell(cell):
    """Normalise one raw CSV cell to the string form stored in ``rows``.

    * Integer-like cells (surrounding whitespace and embedded spaces ignored)
      are returned in canonical ``str(int(...))`` form.
    * A cell that is just '-' becomes the empty string.
    * Anything else is returned with surrounding whitespace stripped.
    """
    text = cell.strip()
    try:
        return str(int(text.replace(' ', '')))
    except ValueError:
        return '' if text == '-' else text


skipped_rows = []  # stripped cells taken from the first two rows of every file
datas = []         # first column of each `files` entry, in processing order
rows = []          # one list per data row: [source name] + cleaned cells

# Files are processed in the reverse of their order in `files`.
for f in reversed(files):
    # Read the whole file inside `with` so each handle is closed immediately
    # rather than leaking one open file per iteration.
    with open('%s/%s.csv' % (DATA_DIR, f[0])) as fh:
        _data = list(csv.reader(fh))

    # The first two rows of each file are set aside (kept only for inspection
    # in `skipped_rows`); everything after them is treated as data.
    data = _data[2:]
    for i in _data[:2]:
        skipped_rows += [_i.strip() for _i in i]
    datas.append(f[0])

    # Prefix every data row with the name of the file it came from.
    for da in data:
        rows.append([f[0]] + [_clean_cell(d) for d in da])

# Sanity check: total rows, files processed, files listed.
(len(rows), len(datas), len(files))
Out[119]:
In [120]:
# Inspect the first assembled row (source file name followed by cleaned cells).
rows[0]
Out[120]:
In [121]:
# Average number of data rows per listed file.
# NOTE(review): `/` floors for ints on Python 2 and is true division on
# Python 3 -- which interpreter ran this is not visible here.
len(rows) / len(files)
Out[121]:
In [122]:
# Preview the first 20 assembled rows.
rows[:20]
Out[122]:
In [123]:
# Which distinct row lengths occur? (Set order is arbitrary.)
list({len(entry) for entry in rows})
Out[123]:
In [124]:
# First example of a 5-element row (source name + 4 cells).
# Raises IndexError if no row has that length.
[r for r in rows if len(r) == 5][0]
Out[124]:
In [125]:
# First example of a 6-element row (source name + 5 cells).
# Raises IndexError if no row has that length.
[r for r in rows if len(r) == 6][0]
Out[125]:
In [126]:
# First example of a 7-element row (source name + 6 cells).
# Raises IndexError if no row has that length.
[r for r in rows if len(r) == 7][0]
Out[126]:
In [127]:
# Rebind `data` to the first assembled row (it previously held a file's raw
# rows) and show its first two elements.
data = rows[0]
data[:2]
Out[127]:
In [128]:
# Show the whole row currently bound to `data`.
data
Out[128]:
In [129]:
# Show everything after the first two elements of `data`.
data[2:]
Out[129]:
In [130]:
# Pad every assembled row out to seven columns (the number of names in FIELDS)
# and render it as one comma-joined line.
#
#   5 elements -> blanks inserted at output positions 2 and 5
#   6 elements -> blank inserted at output position 2
#   7 elements -> used as-is
#
# Any other length is rejected explicitly. Previously the three independent
# `if`s had no fallback, so such a row silently re-appended the line built for
# the previous row (or raised NameError if it was the very first row).
#
# NOTE(review): cells are joined with a bare ',' and are not quoted, so a cell
# that itself contains a comma would shift columns in the output.
csv_rows = []
for data in rows:
    if len(data) == 5:
        cells = data[:2] + [''] + [data[2], data[3]] + [''] + [data[4]]
    elif len(data) == 6:
        cells = data[:2] + [''] + data[2:]
    elif len(data) == 7:
        cells = data
    else:
        raise ValueError(
            'unexpected row length %d (expected 5, 6 or 7): %r' % (len(data), data)
        )
    csv_rows.append(','.join(cells))
# Every input row must have produced exactly one output line.
len(csv_rows) == len(rows)
Out[130]:
In [131]:
# Write the combined file: one header line built from FIELDS, then one line per
# entry of csv_rows. `write` is used because each call emits a single string;
# `writelines` expects an iterable of lines and only worked here because a
# string iterates character by character.
with open('full_data.csv', 'w') as f:
    f.write('%s\n' % ','.join(FIELDS))
    for line in csv_rows:
        f.write('%s\n' % line)
In [132]:
# Half the number of entries in skipped_rows. Note that skipped_rows is a flat
# list of individual cells taken from the first two rows of each file, not a
# list of rows, so this is not a count of files or of skipped rows.
len(skipped_rows) / 2
Out[132]:
In [133]:
# Distinct cell values seen in the set-aside leading rows across all files
# (set order is arbitrary).
list(set(skipped_rows))
Out[133]:
In [ ]: