In [1]:
import pandas as pd
In [2]:
df = pd.DataFrame()
In [3]:
df
Out[3]:
In [4]:
for chunk in pd.read_csv('data/ext_lt_invcur.tsv', sep='\t', chunksize=100):
df = pd.concat([df, chunk])
df.head(3)
Out[4]:
In [8]:
df = pd.DataFrame()
# let's figure out how to split out the 1st column's values
for chunk in pd.read_csv('data/ext_lt_invcur.tsv', sep='\t', chunksize=100):
data_rows = [row for row in chunk.ix[:,0].str.split(',')]
data_cols = chunk.columns[0].split(',')
print(data_rows[:2], data_cols)
break;
df = pd.concat([df, chunk])
Looks like we are getting the right value split. However, the last column name split looks odd.
In [9]:
df = pd.DataFrame()
for chunk in pd.read_csv('data/ext_lt_invcur.tsv', sep='\t', chunksize=100):
data_rows = [row for row in chunk.ix[:,0].str.split(',')]
# create a column split list generator
data_cols = [col.split('\\')[0] for col in chunk.columns[0].split(',')]
print(data_rows[:2], data_cols)
break;
df = pd.concat([df, chunk])
The last column name is now called "geo" -- simple and elegant.
Now, we need to merge the list of lists to create the cleaned dataframe:
In [10]:
df = pd.DataFrame()
for chunk in pd.read_csv('data/ext_lt_invcur.tsv', sep='\t', chunksize=100):
data_rows = [row for row in chunk.ix[:,0].str.split(',')]
# create a column split list generator
data_cols = [col.split('\\')[0] for col in chunk.columns[0].split(',')]
clean_df = pd.DataFrame(data_rows, columns=data_cols)
# now we can concat by "column" which means axis=1
new_df = pd.concat([clean_df, chunk], axis=1)
print(new_df)
break;
df = pd.concat([df, chunk])
We notice that we still have our own column[0] around -- let's drop it to clean up the DF.
In [12]:
df = pd.DataFrame()
for chunk in pd.read_csv('data/ext_lt_invcur.tsv', sep='\t', chunksize=100):
data_rows = [row for row in chunk.ix[:,0].str.split(',')]
# create a column split list generator
data_cols = [col.split('\\')[0] for col in chunk.columns[0].split(',')]
clean_df = pd.DataFrame(data_rows, columns=data_cols)
# now we can concat by "column" which means axis=1
new_df = pd.concat([clean_df,
chunk.drop(chunk.columns[0], axis=1)], axis=1)
print(new_df)
break;
df = pd.concat([df, chunk])
This looks a lot cleaner!
We are finally ready to run this for all input data:
In [30]:
df = pd.DataFrame()
for chunk in pd.read_csv('data/ext_lt_invcur.tsv', sep='\t', chunksize=100):
data_rows = [row for row in chunk.ix[:,0].str.split(',')]
# create a column split list generator
data_cols = [col.split('\\')[0] for col in chunk.columns[0].split(',')]
clean_df = pd.DataFrame(data_rows, columns=data_cols)
# now we can concat by "column" which means axis=1
new_df = pd.concat([clean_df,
chunk.drop(chunk.columns[0], axis=1)], axis=1)
df = pd.concat([df, new_df])
df.head(3)
Out[30]:
We have concat data both horizontally and vertically, and able to clean up the column name as well as transforming a messy column by splitting down its components.
In [35]:
df.shape
Out[35]:
In [33]:
df.describe(include='all')
Out[33]:
In [38]:
df.loc[df['sitc06']=='TOTAL']
Out[38]:
In [ ]: