In [15]:
import pandas as pd
pd.read_csv(r'../../buschmais-spring-petclinic/git.log', names=["data"]).head()
Out[15]:
data
0
\t\t\t56098dd\t1488121379\tDirk Mahler
1
\t\t\t5dade92\t1487800842\tMarkus Harrer
2
1\t1\tdocs/README.md
3
\t\t\t9f07e8c\t1487800618\tMarkus Harrer
4
76\t0\tdocs/README.md
In [5]:
pd.read_csv("../../linux/git_linux.bz2", sep="#", compression="bz2")
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_tokens (pandas\_libs\parsers.c:14858)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_with_dtype (pandas\_libs\parsers.c:17119)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._string_convert (pandas\_libs\parsers.c:17347)()
pandas\_libs\parsers.pyx in pandas._libs.parsers._string_box_utf8 (pandas\_libs\parsers.c:23041)()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 2: invalid continuation byte
During handling of the above exception, another exception occurred:
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-5-fbd25ade74db> in <module>()
----> 1 pd.read_csv("../../linux/git_linux.bz2", sep="#", compression="bz2")
C:\dev\apps\Anaconda3\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
653 skip_blank_lines=skip_blank_lines)
654
--> 655 return _read(filepath_or_buffer, kwds)
656
657 parser_f.__name__ = name
C:\dev\apps\Anaconda3\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
409
410 try:
--> 411 data = parser.read(nrows)
412 finally:
413 parser.close()
C:\dev\apps\Anaconda3\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
980 raise ValueError('skipfooter not supported for iteration')
981
--> 982 ret = self._engine.read(nrows)
983
984 if self.options.get('as_recarray'):
C:\dev\apps\Anaconda3\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
1717 def read(self, nrows=None):
1718 try:
-> 1719 data = self._reader.read(nrows)
1720 except StopIteration:
1721 if self._first_chunk:
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.read (pandas\_libs\parsers.c:10862)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_low_memory (pandas\_libs\parsers.c:11138)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_rows (pandas\_libs\parsers.c:12175)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_column_data (pandas\_libs\parsers.c:14136)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_tokens (pandas\_libs\parsers.c:14972)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_with_dtype (pandas\_libs\parsers.c:17119)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._string_convert (pandas\_libs\parsers.c:17347)()
pandas\_libs\parsers.pyx in pandas._libs.parsers._string_box_utf8 (pandas\_libs\parsers.c:23041)()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 2: invalid continuation byte
In [35]:
pd.read_csv(r"C:\Temp\git.log", encoding="latin-1", sep="#", header=None)
Out[35]:
0
0
1504030581 Linus Torvalds
1
1504030432 Linus Torvalds
2
1504023066 Linus Torvalds
3
1504021018 Tejun Heo
4
1504010526 Christoph Hellwig
5
1503963940 Linus Torvalds
6
1503957087 Tejun Heo
7
1503957838 Alexey Brodkin
8
1503520620 Helge Deller
9
1503944146 Linus Torvalds
10
1503901688 Christoph Hellwig
11
1503879640 Linus Torvalds
12
1503879034 Linus Torvalds
13
1503878917 Linus Torvalds
14
1503878613 Linus Torvalds
15
1503878514 Linus Torvalds
16
1503876309 Linus Torvalds
17
1503867312 Linus Torvalds
18
1503861145 Linus Torvalds
19
1503776909 Linus Torvalds
20
1503776774 Linus Torvalds
21
1503763588 Linus Torvalds
22
1503763338 Linus Torvalds
23
1503763190 Linus Torvalds
24
1503709347 Linus Torvalds
25
1503708383 Linus Torvalds
26
1503708003 Linus Torvalds
27
1503707555 Linus Torvalds
28
1503707246 Linus Torvalds
29
1503706953 Linus Torvalds
...
...
693293
1113690257 Paul Mackerras
693294
1113690257 Paul Mackerras
693295
1113690256 Paul Mackerras
693296
1113690255 Eugene Surovegin
693297
1113690254 Paul Mackerras
693298
1113690253 James Morris
693299
1113690251 Jean Tourrilhes
693300
1113690250 Herbert Xu
693301
1113690249 Arnaldo Carvalho de Melo
693302
1113690249 David S. Miller
693303
1113690248 Martin Hicks
693304
1113690247 akpm@osdl.org
693305
1113690246 akpm@osdl.org
693306
1113690245 Andrea Arcangeli
693307
1113690245 Jeff Moyer
693308
1113690244 Dave Airlie
693309
1113690243 James Morris
693310
1113690242 akpm@osdl.org
693311
1113690241 David S. Miller
693312
1113690241 Stas Sergeev
693313
1113690240 akpm@osdl.org
693314
1113690239 James Bottomley
693315
1113690238 Artem B. Bityuckiy
693316
1113690237 akpm@osdl.org
693317
1113690237 akpm@osdl.org
693318
1113690236 akpm@osdl.org
693319
1113690235 akpm@osdl.org
693320
1113690234 Neil Brown
693321
1113690233 Christoph Lameter
693322
1113690036 Linus Torvalds
693323 rows × 1 columns
In [31]:
pd.read_csv("../../linux/git.log", encoding="latin-1", sep="#")
Out[31]:
40c6d1b9e2fc
Tue Aug 15 12:49:43 2017 -0700
1502826583
Linus Torvalds
0
fcd07350007b
Mon Aug 14 13:09:59 2017 -0700
1502741399
Linus Torvalds
1
6b9d1c24e051
Mon Aug 14 11:35:56 2017 -0700
1502735756
Linus Torvalds
2
ef954844c7ac
Sun Aug 13 16:01:32 2017 -0700
1502665292
Linus Torvalds
3
b2298fc900f8
Sun Aug 13 15:34:28 2017 -0700
1502663668
Linus Torvalds
4
c9dc281d91ae
Sun Aug 13 12:44:18 2017 -0700
1502653458
Linus Torvalds
5
ce7ba95cf078
Sun Aug 13 12:41:58 2017 -0700
1502653318
Linus Torvalds
6
438630ef5b3c
Sun Aug 13 12:33:35 2017 -0700
1502652815
Linus Torvalds
7
dd95f1860768
Sun Aug 13 12:30:17 2017 -0700
1502652617
Linus Torvalds
8
10cec917d000
Sun Aug 13 12:27:42 2017 -0700
1502652462
Linus Torvalds
9
89a55278dee4
Sat Aug 12 16:19:43 2017 -0700
1502579983
Linus Torvalds
10
9a51544774a5
Wed Aug 2 18:03:05 2017 +0530
1501677185
Abhishek Sahu
11
a99bcdce8395
Sat Aug 12 12:08:59 2017 -0700
1502564939
Linus Torvalds
12
043cd07c555f
Sat Aug 12 09:01:36 2017 -0700
1502553696
Linus Torvalds
13
afc1f55ca44e
Fri Aug 11 20:34:45 2017 -0700
1502508885
Shaohua Li
14
216e4a1def29
Fri Aug 11 13:54:09 2017 -0700
1502484849
Linus Torvalds
15
e0d0e045b862
Fri Aug 11 12:26:49 2017 -0700
1502479609
Linus Torvalds
16
0993133bb8e7
Fri Aug 11 11:56:54 2017 -0700
1502477814
Linus Torvalds
17
7eb97ba611f5
Fri Aug 11 11:44:18 2017 -0700
1502477058
Linus Torvalds
18
2bfc37cdef08
Fri Aug 11 11:20:48 2017 -0700
1502475648
Linus Torvalds
19
7d7a827ba92c
Fri Aug 11 11:15:51 2017 -0700
1502475351
Linus Torvalds
20
8a9d6e964d31
Sat Aug 5 10:59:14 2017 +0200
1501923554
Christoph Hellwig
21
8001a975f955
Fri Aug 11 08:56:01 2017 -0700
1502466961
Linus Torvalds
22
622b2fbe625b
Wed Aug 9 15:59:10 2017 -0600
1502315950
Shuah Khan
23
a7990c647b35
Tue Aug 8 12:26:02 2017 +0200
1502187962
Artem Savkov
24
020db9d3c1dc
Sun Jul 30 00:59:57 2017 +0800
1501347597
Liu Shuo
25
529871bb3c06
Fri Jul 28 16:53:55 2017 +0200
1501253635
Juergen Gross
26
4a8b53be6404
Fri Aug 11 08:07:19 2017 -0600
1502460439
Jens Axboe
27
4ca83dcf4e3b
Fri Jul 28 12:23:14 2017 +0200
1501237394
Juergen Gross
28
10231f69eb03
Fri Jul 28 12:23:13 2017 +0200
1501237393
Juergen Gross
29
c138d81163d8
Fri Jul 28 12:23:12 2017 +0200
1501237392
Juergen Gross
...
...
...
...
...
692854
6c26e03b2db4
Sat Apr 16 15:24:17 2005 -0700
1113690257
Paul Mackerras
692855
e378cc16b0d3
Sat Apr 16 15:24:17 2005 -0700
1113690257
Paul Mackerras
692856
6460b4cceba0
Sat Apr 16 15:24:16 2005 -0700
1113690256
Paul Mackerras
692857
35b535d9cc8d
Sat Apr 16 15:24:15 2005 -0700
1113690255
Eugene Surovegin
692858
16acbc624e2b
Sat Apr 16 15:24:14 2005 -0700
1113690254
Paul Mackerras
692859
0c9b79429c83
Sat Apr 16 15:24:13 2005 -0700
1113690253
James Morris
692860
7e5c6bc0a600
Sat Apr 16 15:24:11 2005 -0700
1113690251
Jean Tourrilhes
692861
6775cab98b89
Sat Apr 16 15:24:10 2005 -0700
1113690250
Herbert Xu
692862
2a27805127ae
Sat Apr 16 15:24:09 2005 -0700
1113690249
Arnaldo Carvalho de Melo
692863
9f3786dc8b1d
Sat Apr 16 15:24:09 2005 -0700
1113690249
David S. Miller
692864
4c4c402d6cab
Sat Apr 16 15:24:08 2005 -0700
1113690248
Martin Hicks
692865
76c3073a888a
Sat Apr 16 15:24:07 2005 -0700
1113690247
akpm@osdl.org
692866
323aca6c0bda
Sat Apr 16 15:24:06 2005 -0700
1113690246
akpm@osdl.org
692867
79befd0c08c4
Sat Apr 16 15:24:05 2005 -0700
1113690245
Andrea Arcangeli
692868
d345734267db
Sat Apr 16 15:24:05 2005 -0700
1113690245
Jeff Moyer
692869
41aac24f8fb5
Sat Apr 16 15:24:04 2005 -0700
1113690244
Dave Airlie
692870
388c69789a2a
Sat Apr 16 15:24:03 2005 -0700
1113690243
James Morris
692871
1db7fc75a410
Sat Apr 16 15:24:02 2005 -0700
1113690242
akpm@osdl.org
692872
51410d3c53d8
Sat Apr 16 15:24:01 2005 -0700
1113690241
David S. Miller
692873
5df240826c90
Sat Apr 16 15:24:01 2005 -0700
1113690241
Stas Sergeev
692874
e493073d8d05
Sat Apr 16 15:24:00 2005 -0700
1113690240
akpm@osdl.org
692875
81ddef77bb77
Sat Apr 16 15:23:59 2005 -0700
1113690239
James Bottomley
692876
9ffb7146f0aa
Sat Apr 16 15:23:58 2005 -0700
1113690238
Artem B. Bityuckiy
692877
d42ce812b8a3
Sat Apr 16 15:23:57 2005 -0700
1113690237
akpm@osdl.org
692878
7a228aaa879c
Sat Apr 16 15:23:57 2005 -0700
1113690237
akpm@osdl.org
692879
7aa52f5128b0
Sat Apr 16 15:23:56 2005 -0700
1113690236
akpm@osdl.org
692880
2d137c24e9f4
Sat Apr 16 15:23:55 2005 -0700
1113690235
akpm@osdl.org
692881
baaa2c512dc1
Sat Apr 16 15:23:54 2005 -0700
1113690234
Neil Brown
692882
8d38eadb7a97
Sat Apr 16 15:23:53 2005 -0700
1113690233
Christoph Lameter
692883
1da177e4c3f4
Sat Apr 16 15:20:36 2005 -0700
1113690036
Linus Torvalds
692884 rows × 4 columns
In [30]:
with open("../../linux/git.log", encoding="latin-1") as log_file:
log_file.read()
for i, line in enumerate(log_file.readline()):
print(i,line)
Content source: feststelltaste/software-analytics
Similar notebooks: