Finland data pipeline

This is the sandbox for the Finnish data.


In [1]:
from pandas import DataFrame, read_csv

In [3]:
# First attempt: let pandas parse the CSV directly, decoding it as windows-1252.
# NOTE: on this file the call raises CParserError ("Expected 14 fields in
# line 56, saw 17"), which is why the following cells inspect the raw lines.
finland = read_csv('projektilista.csv', encoding='windows-1252')
finland.info()


---------------------------------------------------------------------------
CParserError                              Traceback (most recent call last)
<ipython-input-3-eac9abaabd36> in <module>()
----> 1 finland = read_csv('projektilista.csv', encoding='windows-1252')
      2 finland.info()

/usr/local/lib/python3.5/dist-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    560                     skip_blank_lines=skip_blank_lines)
    561 
--> 562         return _read(filepath_or_buffer, kwds)
    563 
    564     parser_f.__name__ = name

/usr/local/lib/python3.5/dist-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    323         return parser
    324 
--> 325     return parser.read()
    326 
    327 _parser_defaults = {

/usr/local/lib/python3.5/dist-packages/pandas/io/parsers.py in read(self, nrows)
    813                 raise ValueError('skip_footer not supported for iteration')
    814 
--> 815         ret = self._engine.read(nrows)
    816 
    817         if self.options.get('as_recarray'):

/usr/local/lib/python3.5/dist-packages/pandas/io/parsers.py in read(self, nrows)
   1312     def read(self, nrows=None):
   1313         try:
-> 1314             data = self._reader.read(nrows)
   1315         except StopIteration:
   1316             if self._first_chunk:

pandas/parser.pyx in pandas.parser.TextReader.read (pandas/parser.c:8748)()

pandas/parser.pyx in pandas.parser.TextReader._read_low_memory (pandas/parser.c:9003)()

pandas/parser.pyx in pandas.parser.TextReader._read_rows (pandas/parser.c:9731)()

pandas/parser.pyx in pandas.parser.TextReader._tokenize_rows (pandas/parser.c:9602)()

pandas/parser.pyx in pandas.parser.raise_parser_error (pandas/parser.c:23325)()

CParserError: Error tokenizing data. C error: Expected 14 fields in line 56, saw 17

In [2]:
# Read the file as raw text, one string per line (line endings kept), so the
# comma counts can be inspected without going through the CSV parser.
with open('projektilista.csv', encoding='windows-1252') as csv_file:
    lines = list(csv_file)

In [11]:
def print_nb_fields(rows=None):
    """Print the comma-separated field count of each line up to the first quote.

    For every line this prints its 0-based index, the number of pieces obtained
    by splitting the line on ',' and whether the line contains a double quote.
    Scanning stops right after the first line containing a double quote has
    been printed.

    Parameters
    ----------
    rows : iterable of str, optional
        Lines to scan. Defaults to the notebook-level ``lines`` list, so the
        existing no-argument call keeps working.

    Notes
    -----
    The split is naive: a comma inside a quoted value is counted as a
    separator like any other comma.
    """
    if rows is None:
        # Fall back to the raw lines loaded earlier in the notebook.
        rows = lines
    for index, row in enumerate(rows):
        field_count = len(row.split(','))
        has_quote = '"' in row
        print(index, field_count, has_quote)
        if has_quote:
            break

# Scan the raw lines and stop at the first one that contains a double quote.
print_nb_fields()


0 13 False
1 14 False
2 13 False
3 13 False
4 13 False
5 13 False
6 14 False
7 14 False
8 14 False
9 14 False
10 14 False
11 14 False
12 14 False
13 14 False
14 14 False
15 14 False
16 14 False
17 14 False
18 14 False
19 14 False
20 14 False
21 14 False
22 14 False
23 14 False
24 14 False
25 14 False
26 14 False
27 14 False
28 14 False
29 14 False
30 14 False
31 14 False
32 14 False
33 14 False
34 14 False
35 14 False
36 14 False
37 14 False
38 14 False
39 14 False
40 14 False
41 14 False
42 14 False
43 14 False
44 14 False
45 14 False
46 14 False
47 14 False
48 14 False
49 14 False
50 14 False
51 14 False
52 14 False
53 14 False
54 14 False
55 17 False
56 15 False
57 14 False
58 14 False
59 15 False
60 14 False
61 14 False
62 14 False
63 14 False
64 14 False
65 14 False
66 17 False
67 14 False
68 14 False
69 14 False
70 14 False
71 14 False
72 14 False
73 14 False
74 14 False
75 14 False
76 14 False
77 14 False
78 15 True

In [12]:
# Show the raw text of line 78, the first line flagged as containing a double quote.
lines[78]


Out[12]:
'A70378,EAKR,Hotel Auroran liiketoiminnan ja tuotteiden kehittäminen, "Hotel Auroran maisemahuoneet",1,Pohjois-Pohjanmaan elinkeino-, liikenne- ja ympäristökeskus,Toiminnassa,18.6.2014,22.6.2016,HOTEL AURORA OY,292990,246104,292990,246104\n'

In [18]:
# Distinct comma-separated field counts over all raw lines.
# Computed directly from `lines`: no visible cell defines a module-level
# `nb_fields`, so the previous `set(nb_fields)` only worked with leftover
# kernel state and fails on a fresh "Restart & Run All".
{len(line.split(',')) for line in lines}


Out[18]:
{13, 14, 15, 16, 17, 19}

In [19]:
def get_nb_fields(rows=None):
    """Yield, for each line, the number of pieces obtained by splitting on ','.

    Parameters
    ----------
    rows : iterable of str, optional
        Lines to measure. Defaults to the notebook-level ``lines`` list, so
        the existing no-argument call keeps working.

    Yields
    ------
    int
        Field count of each line, in input order. The split is naive: a comma
        inside a quoted value is counted as a separator.
    """
    if rows is None:
        # Fall back to the raw lines loaded earlier in the notebook.
        rows = lines
    for row in rows:
        yield len(row.split(','))
    
# One row per raw line; the single column (labelled 0) holds that line's
# field count. `DataFrame` is imported in the notebook's first cell.
nb_fields_df = DataFrame(get_nb_fields())

In [24]:
# Number of lines per field count, most frequent count first.
nb_fields_df[0].value_counts()


Out[24]:
14    2009
13     709
15     141
16      18
17       3
19       2
Name: 0, dtype: int64

In [ ]: