In [1]:
import os
import sys
import pandas as pd
sys.path.insert(1,'C:/Users/Reid/projects/prospecting/repo/prospecting') # Hack to point to prospecting package
import prospecting as p
In [2]:
# Initialize spreadsheet
# View spreadsheet here: https://docs.google.com/spreadsheets/d/17R9V5tefzFzMXBi2i9SOybhqwzF7PSlse9OO99BfDxQ
ss = p.SheetsApi(spreadsheetid = '17R9V5tefzFzMXBi2i9SOybhqwzF7PSlse9OO99BfDxQ',
scopelist=['https://www.googleapis.com/auth/spreadsheets',
'https://www.googleapis.com/auth/drive.metadata'])
ss.authenticate()
ss.info = ss.get_ss_info()
#ss.sheets = ss.load_sheets(['metadata', 'raw_descr', 'clean_descr', 'dummified_descr']) # loads sheets as pandas DataFrames
In [3]:
# Load raw dataset
innocentive_csv_dir = os.path.join(os.environ['HOME'], 'data', 'innocentive', 'csv')
SOURCE_CSV = os.path.join(innocentive_csv_dir, 'InnoCentive_Challenge_9933493_training_data.csv')
df_raw = pd.read_csv(SOURCE_CSV,
index_col='record_ID',
decimal='.',
thousands=",")
df_raw.__name__ = 'df_raw'
In [4]:
# Create description of df_raw and update `raw_descr` sheet in spreadsheet
raw_descr = p.create_description(df_raw)
raw_descr_sheet_range = "".join(['raw_descr!E:', (chr(ord('E') + raw_descr.shape[1] - 1))])
ss.update(raw_descr, raw_descr_sheet_range)
ss.sheets['raw_descr'] = ss.get('raw_descr')
At this point, I check out the
raw_descr
sheet and (at a bare minimum) set thekeep_flag
to indicate whether or not I want to drop the column when processing the dataset. I may also revisit options passed topd.read_csv()
as needed (ex. indicate which column should be used as the index).Next, I populate the
metadata
sheet by first ensuring that all columns from theraw_descr
sheet are listed undercolumn_name
, and then populate the configuration details for each column.Once I am satisfied with the configuration in the
metadata
sheet, I load the metadata so that I can pass the most up-to-date processing logic to theclean_data
function.
In [5]:
# Get most up-to-date processing logic as configured in the metadata sheet
md = ss.get('metadata', headerrow=1)
In [6]:
# clean df_raw, describe df_clean, create custom sheet range to control where data is written in sheet
df_clean = p.process.clean_data(df_raw, md)
clean_descr = p.create_description(df_clean)
clean_descr_sheet_range = "".join(['clean_descr!A:', (chr(ord('A') + raw_descr.shape[1] - 1))])
# For the sheet range, you can also just pass the sheet name (ex. 'clean_descr')
# if you don't need finite control over where data is written within a sheet
In [7]:
ss.update(clean_descr, clean_descr_sheet_range) # Check `clean_descr` sheet in spreadsheet to see df_clean description
ss.sheets['clean_descr'] = ss.get('clean_descr')
In [8]:
innocentive_dummified = pd.get_dummies(df_clean)
print(innocentive_dummified.shape)
In [9]:
innocentive_dummified.__name__ = 'innocentive_dummified'
innocentive_dummified_descr = p.create_description(innocentive_dummified)
ss.update(innocentive_dummified_descr, 'dummified_descr')
ss.sheets['dummified_descr'] = ss.get('dummified_descr')
In [10]:
# use a 15% subset of training data to use as an example
subset = innocentive_dummified.sample(n=(int(innocentive_dummified.shape[0]*.15)))
subset.shape
Out[10]:
In [11]:
# Pickle dataset for later use
p.utils.pckl(subset, "innocentive_dummified")
In [ ]:
#SOURCE_CSV = os.path.join(os.environ['DATADIR'], 'data.csv.gz')
#SOURCE_CSV = "https://storage.cloud.google.com/prospecting/data/innocentive/InnoCentive_Challenge_9933493_training_data.csv.gz"