Pandas for managing Datasets



In [1]:

    
import pandas as pd

Create an empty dataframe for adding in datasets



In [3]:

    
df = pd.DataFrame()
df









    Out[3]:

Importing a large dataset in chunks of 100 bytes



In [4]:

    
for chunk in pd.read_csv('data/ext_lt_invcur.tsv', sep='\t', chunksize=100):
    df = pd.concat([df, chunk])



In [5]:

    
df.head()









    Out[5]:






  
    
      
      partner,currency,stk_flow,sitc06,geo\time
      2014
      2012
      2010
    
  
  
    
      0
      EXT_EU,EUR,EXP,SITC0-4A,AT
      61.9
      65.6
      67
    
    
      1
      EXT_EU,EUR,EXP,SITC0-4A,BE
      53.8
      85.8
      92.4
    
    
      2
      EXT_EU,EUR,EXP,SITC0-4A,BG
      57.0
      46.2
      54.1
    
    
      3
      EXT_EU,EUR,EXP,SITC0-4A,CY
      79.1
      60.7
      61.4
    
    
      4
      EXT_EU,EUR,EXP,SITC0-4A,CZ
      58.3
      66.7
      59.1

Notice that column 2 looks like can be further transformed during the loading phase.



In [11]:

    
for chunk in pd.read_csv('data/ext_lt_invcur.tsv', sep='\t', chunksize=100):
    # separate out all the data rows for column index 0
    data_rows = [row for row in chunk.ix[:,0].str.split(',')]
    
    # show me column indexed 0 and split on ","
    data_cols = [col.split('\\')[0] for col in chunk.columns[0].split(',')]
    
    # create new data frame with these new rows and columns
    clean_df = pd.DataFrame(data_rows, columns=data_cols)
    new_df = pd.concat([clean_df, chunk.drop(chunk.columns[0], axis=1)], 
                       axis=1)
    
    # sanity check for top 5 rows
    print new_df.head()
    break
    df = pd.concat([df, chunk])









    



  partner currency stk_flow    sitc06 geo  2014   2012   2010 
0  EXT_EU      EUR      EXP  SITC0-4A  AT   61.9  65.6     67 
1  EXT_EU      EUR      EXP  SITC0-4A  BE   53.8  85.8   92.4 
2  EXT_EU      EUR      EXP  SITC0-4A  BG   57.0  46.2   54.1 
3  EXT_EU      EUR      EXP  SITC0-4A  CY   79.1  60.7   61.4 
4  EXT_EU      EUR      EXP  SITC0-4A  CZ   58.3  66.7   59.1



In [12]:

    
for chunk in pd.read_csv('data/ext_lt_invcur.tsv', sep='\t', chunksize=100):
    # separate out all the data rows for column index 0
    data_rows = [row for row in chunk.ix[:,0].str.split(',')]
    
    # show me column indexed 0 and split on ","
    data_cols = [col.split('\\')[0] for col in chunk.columns[0].split(',')]
    
    # create new data frame with these new rows and columns
    clean_df = pd.DataFrame(data_rows, columns=data_cols)
    new_df = pd.concat([clean_df, chunk.drop(chunk.columns[0], axis=1)], 
                       axis=1)
    
    df = pd.concat([df, new_df])



In [ ]:

	partner,currency,stk_flow,sitc06,geo\time	2014	2012	2010
0	EXT_EU,EUR,EXP,SITC0-4A,AT	61.9	65.6	67
1	EXT_EU,EUR,EXP,SITC0-4A,BE	53.8	85.8	92.4
2	EXT_EU,EUR,EXP,SITC0-4A,BG	57.0	46.2	54.1
3	EXT_EU,EUR,EXP,SITC0-4A,CY	79.1	60.7	61.4
4	EXT_EU,EUR,EXP,SITC0-4A,CZ	58.3	66.7	59.1