In [1]:
%%time
import pandas as pd

dfs = pd.read_csv(
    "../../linux/git_diff.log",
    skip_blank_lines=False,
    sep="\n",
    encoding="latin-1",
    chunksize=100000,
    names=["raw"])
df = pd.concat(dfs)
df.tail()


Wall time: 2min 33s

In [2]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112519384 entries, 0 to 112519383
Data columns (total 1 columns):
raw    object
dtypes: object(1)
memory usage: 858.5+ MB