In [1]:
import csv
# Load the index of source files. Each entry is a [date, url] pair (see the
# preview in the next cell). The file is opened in a `with` block so the
# handle is closed as soon as the rows are read; newline='' is what the csv
# module expects so that it can do its own newline handling.
with open('files.csv', newline='') as index_file:
    files = list(csv.reader(index_file))
len(files)


Out[1]:
283

In [2]:
# Peek at the first two index entries to confirm the [date, url] layout.
files[:2]


Out[2]:
[['2016-01-20',
  'http://apps.who.int/gho/data/view.ebola-sitrep.ebola-summary-20160120?lang=en'],
 ['2016-01-13',
  'http://apps.who.int/gho/data/view.ebola-sitrep.ebola-summary-20160113?lang=en']]

In [85]:
# Directory holding one downloaded CSV per index entry, named "<date>.csv".
DATA_DIR = 'data'

# Header of the consolidated output file: the source date followed by the
# widest (7-column) layout seen in the downloaded files.
FIELDS = [
    "CSV Source date",
    "Country",
    "Data as of",
    "Case definition",
    "Number of cases (Cumulative)",
    "Number of cases (In past 21 days)",
    "Number of deaths (cumulative)",
]

In [12]:
# Read the CSV for the first index entry as a sample and report how many
# columns its first row has. The handle is closed once the rows are in memory.
with open('%s/%s.csv' % (DATA_DIR, files[0][0]), newline='') as sample_file:
    testfile = csv.reader(sample_file)
    data = list(testfile)
len(data[0])


Out[12]:
6

In [119]:
# Parse every downloaded CSV into one flat list of cleaned rows.
#
# skipped_rows: every stripped cell from the first two rows of each file
#               (kept so the dropped header text can be audited afterwards)
# datas:        the date of each file processed, in processing order
# rows:         cleaned data rows, each prefixed with its file's date
skipped_rows = []
datas = []
rows = []
# The index is walked in reverse; with the listing shown above (newest entry
# first) this yields the oldest report first.
for f in reversed(files):
    source_date = f[0]
    # Read the whole file inside a `with` block so each of the per-date
    # handles is closed immediately instead of being left to the garbage
    # collector.
    with open('%s/%s.csv' % (DATA_DIR, source_date), newline='') as csv_file:
        _data = list(csv.reader(csv_file))

    # The first two rows of every file are treated as headers and set aside.
    for header_row in _data[:2]:
        skipped_rows.extend(cell.strip() for cell in header_row)
    datas.append(source_date)

    for raw_row in _data[2:]:
        cleaned = []
        for cell in raw_row:
            text = cell.strip()
            try:
                # Integers may contain inner spaces (presumably thousands
                # separators - TODO confirm); normalise them to plain digits.
                cleaned.append(str(int(text.replace(' ', ''))))
            except ValueError:
                # Non-numeric cell: a lone '-' becomes an empty value, any
                # other text (e.g. 'Not reported') is kept as-is.
                cleaned.append('' if text == '-' else text)
        rows.append([source_date] + cleaned)
(len(rows), len(datas), len(files))


Out[119]:
(4854, 283, 283)

In [120]:
# First cleaned row: the source date followed by the cleaned cells of the row.
rows[0]


Out[120]:
['2014-11-12', 'Guinea', 'Confirmed', '1612', '313', '934']

In [121]:
# Average number of data rows contributed by each source file.
len(rows) / len(files)


Out[121]:
17.151943462897528

In [122]:
# Preview the first 20 cleaned rows to compare the layouts of successive files.
rows[:20]


Out[122]:
[['2014-11-12', 'Guinea', 'Confirmed', '1612', '313', '934'],
 ['2014-11-12', 'Guinea', 'Probable', '208', '12', '208'],
 ['2014-11-12', 'Guinea', 'Suspected', '58', 'Not reported', '0'],
 ['2014-11-12', 'Guinea', 'Total', '1878', '325', '1142'],
 ['2014-11-12', 'Liberia', 'Confirmed', '2553', '335', 'Not reported'],
 ['2014-11-12', 'Liberia', 'Probable', '1687', '131', 'Not reported'],
 ['2014-11-12',
  'Liberia',
  'Suspected',
  '2582',
  'Not reported',
  'Not reported'],
 ['2014-11-12', 'Liberia', 'Total', '6822', '466', '2836'],
 ['2014-11-12', 'Sierra Leone', 'Confirmed', '4523', '1197', '960'],
 ['2014-11-12', 'Sierra Leone', 'Probable', '79', '14', '174'],
 ['2014-11-12', 'Sierra Leone', 'Suspected', '766', 'Not reported', '35'],
 ['2014-11-12', 'Sierra Leone', 'Total', '5368', '1211', '1169'],
 ['2014-11-14', 'Guinea', 'Confirmed', '1647', '958'],
 ['2014-11-14', 'Guinea', 'Probable', '208', '208'],
 ['2014-11-14', 'Guinea', 'Suspected', '64', '0'],
 ['2014-11-14', 'Guinea', 'Total', '1919', '1166'],
 ['2014-11-14', 'Liberia', 'Confirmed', '2562', 'Not reported'],
 ['2014-11-14', 'Liberia', 'Probable', '1716', 'Not reported'],
 ['2014-11-14', 'Liberia', 'Suspected', '2600', 'Not reported'],
 ['2014-11-14', 'Liberia', 'Total', '6878', '2812']]

In [123]:
# Distinct row lengths across all cleaned rows, i.e. how many different
# column layouts the source files use.
row_widths = {len(entry) for entry in rows}
list(row_widths)


Out[123]:
[5, 6, 7]

In [124]:
# Example of the 5-column layout: the first cleaned row with exactly 5 values.
five_col_rows = [entry for entry in rows if len(entry) == 5]
five_col_rows[0]


Out[124]:
['2014-11-14', 'Guinea', 'Confirmed', '1647', '958']

In [125]:
# Example of the 6-column layout: the first cleaned row with exactly 6 values.
six_col_rows = [entry for entry in rows if len(entry) == 6]
six_col_rows[0]


Out[125]:
['2014-11-12', 'Guinea', 'Confirmed', '1612', '313', '934']

In [126]:
# Example of the 7-column layout: the first cleaned row with exactly 7 values.
seven_col_rows = [entry for entry in rows if len(entry) == 7]
seven_col_rows[0]


Out[126]:
['2014-12-03',
 'Guinea',
 '30 November 2014',
 'Confirmed',
 '1929',
 '306',
 '1117']

In [127]:
# Take the first cleaned row as a sample and show its leading two values
# (source date and country), which every layout shares.
data = rows[0]
data[:2]


Out[127]:
['2014-11-12', 'Guinea']

In [128]:
# The full sample row, for reference against the slices around it.
data


Out[128]:
['2014-11-12', 'Guinea', 'Confirmed', '1612', '313', '934']

In [129]:
# Everything after the source date and country in the sample row.
data[2:]


Out[129]:
['Confirmed', '1612', '313', '934']

In [130]:
# Normalise every cleaned row to the 7-column FIELDS layout and render it as
# one comma-separated line.
#
#   5 values: date, country, case definition, cumulative cases, deaths
#             -> blank "Data as of" and blank "In past 21 days" are inserted
#   6 values: date, country, case definition, cumulative, past 21 days, deaths
#             -> blank "Data as of" is inserted
#   7 values: already in the full layout
#
# Any other width is rejected explicitly. With independent `if` statements a
# row of unexpected width would silently re-append the previous row's text
# (or raise NameError if it was the very first row).
#
# NOTE(review): fields are joined with a plain ',' and are not quoted, so a
# value that itself contains a comma would shift columns in the output.
csv_rows = []
for record in rows:
    width = len(record)
    if width == 5:
        columns = record[:2] + [''] + [record[2], record[3], '', record[4]]
    elif width == 6:
        columns = record[:2] + [''] + record[2:]
    elif width == 7:
        columns = record
    else:
        raise ValueError(
            'unexpected row width %d (expected 5, 6 or 7): %r' % (width, record)
        )
    csv_rows.append(','.join(columns))
len(csv_rows) == len(rows)


Out[130]:
True

In [131]:
# Write the consolidated file: the FIELDS header followed by one line per
# normalised row. write() is used for the single header string (writelines()
# on a str would iterate it character by character), and writelines() is
# given a generator of complete lines for the body.
with open('full_data.csv', 'w') as out_file:
    out_file.write('%s\n' % ','.join(FIELDS))
    out_file.writelines('%s\n' % line for line in csv_rows)

In [132]:
# Half the number of header cells that were set aside while parsing.
# NOTE(review): skipped_rows holds individual stripped cells (the two header
# rows of each file are flattened into it), not whole rows, so this value is
# not a count of header rows or of files - confirm what was intended here.
len(skipped_rows) / 2


Out[132]:
1473.0

In [133]:
# Distinct cell values found in the skipped header rows, to check that only
# header text (and blanks) was dropped and no data was lost.
list(set(skipped_rows))


Out[133]:
['',
 'Data as of',
 'Case definition',
 'Country',
 'Number of cases',
 'In past 21 days',
 'Cumulative',
 'Number of deaths']

In [ ]: