In [1]:
import pandas as pd
Create an empty dataframe to accumulate the dataset chunks
In [3]:
# Accumulator frame: the chunks streamed from the TSV below get appended here.
# data=None is the constructor's default, spelled out for clarity.
df = pd.DataFrame(data=None)
df
Out[3]:
Importing a large dataset in chunks of 100 rows (read_csv's chunksize counts rows, not bytes)
In [4]:
# Stream the Eurostat TSV in chunks of 100 rows (read_csv's chunksize counts
# rows, not bytes) and combine everything with ONE concat call.  The original
# re-concatenated df on every iteration, which copies the accumulated frame
# each time and is quadratic in the number of chunks.
reader = pd.read_csv('data/ext_lt_invcur.tsv', sep='\t', chunksize=100)
df = pd.concat([df] + list(reader))
In [5]:
df.head()
Out[5]:
Notice that the first column looks like it can be further transformed during the loading phase.
In [11]:
# Preview of the cleaning step on the first chunk only (note the break).
for chunk in pd.read_csv('data/ext_lt_invcur.tsv', sep='\t', chunksize=100):
    # The first column packs several fields into one comma-separated string
    # (Eurostat-style header such as "unit,currency,geo\time"); split each
    # data row into its component values.  .iloc replaces the long-removed
    # .ix indexer.
    data_rows = chunk.iloc[:, 0].str.split(',').tolist()
    # Matching column names come from splitting the packed header; the last
    # field carries a "\time" suffix that gets dropped.
    data_cols = [col.split('\\')[0] for col in chunk.columns[0].split(',')]
    # Keep the chunk's own index so the axis-1 concat below aligns row-for-row
    # instead of on a fresh RangeIndex.
    clean_df = pd.DataFrame(data_rows, columns=data_cols, index=chunk.index)
    new_df = pd.concat([clean_df, chunk.drop(chunk.columns[0], axis=1)],
                       axis=1)
    # Sanity check on the top 5 rows of the first chunk, then stop.
    # (print as a function, not the Python 2 statement form; the original's
    # trailing "df = pd.concat([df, chunk])" sat after the break and could
    # never run, so it is dropped.)
    print(new_df.head())
    break
In [12]:
# Full pass: clean every chunk, then append all of them to df in one concat.
cleaned_chunks = []
for chunk in pd.read_csv('data/ext_lt_invcur.tsv', sep='\t', chunksize=100):
    # Split the packed first column ("unit,currency,geo\time" style) into
    # separate per-row values.  .iloc replaces the removed .ix indexer.
    data_rows = chunk.iloc[:, 0].str.split(',').tolist()
    # Column names from the packed header, minus the "\time" suffix on the
    # last field.
    data_cols = [col.split('\\')[0] for col in chunk.columns[0].split(',')]
    # index=chunk.index is essential: without it the new frame gets a fresh
    # RangeIndex starting at 0 while chunk.drop(...) keeps the file's running
    # index (100, 101, ... for the second chunk), and the axis-1 concat would
    # misalign every chunk after the first, filling both sides with NaN.
    clean_df = pd.DataFrame(data_rows, columns=data_cols, index=chunk.index)
    new_df = pd.concat([clean_df, chunk.drop(chunk.columns[0], axis=1)],
                       axis=1)
    cleaned_chunks.append(new_df)
# One concat instead of one per iteration (the loop-internal version copies
# the whole accumulated frame every time — quadratic in chunk count).
df = pd.concat([df] + cleaned_chunks)
In [ ]: