In [1]:
import os
import sys
import pandas as pd

sys.path.insert(1,'C:/Users/Reid/projects/prospecting/repo/prospecting')  # Hack to point to prospecting package
import prospecting as p

In [2]:
# Initialize spreadsheet
# View spreadsheet here:  https://docs.google.com/spreadsheets/d/17R9V5tefzFzMXBi2i9SOybhqwzF7PSlse9OO99BfDxQ
ss = p.SheetsApi(spreadsheetid = '17R9V5tefzFzMXBi2i9SOybhqwzF7PSlse9OO99BfDxQ',
                 scopelist=['https://www.googleapis.com/auth/spreadsheets',
                            'https://www.googleapis.com/auth/drive.metadata'])
ss.authenticate()
ss.info = ss.get_ss_info()
#ss.sheets = ss.load_sheets(['metadata', 'raw_descr', 'clean_descr', 'dummified_descr'])  # loads sheets as pandas DataFrames


prospecting.api: INFO     Reading discovery file for sheets:v4
prospecting.api: INFO     Authenticating...sheets, v4
prospecting.api: INFO     Getting credentials...
prospecting.api: INFO     Building service object...
prospecting.api: INFO     Service object built...<googleapiclient.discovery.Resource object at 0x0000016A384944A8>
prospecting.api: INFO     Successfully authenticated...sheets, v4
prospecting.api: INFO     Spreadsheet loaded.
prospecting.api: INFO     Sheets include: ['metadata', 'raw_descr', 'clean_descr', 'dummified_descr']

In [3]:
# Load raw dataset
innocentive_csv_dir = os.path.join(os.environ['HOME'], 'data', 'innocentive', 'csv')
SOURCE_CSV = os.path.join(innocentive_csv_dir, 'InnoCentive_Challenge_9933493_training_data.csv')

df_raw = pd.read_csv(SOURCE_CSV,
                     index_col='record_ID',
                     decimal='.',
                     thousands=",")

df_raw.__name__ = 'df_raw'

In [4]:
# Create description of df_raw and update `raw_descr` sheet in spreadsheet
raw_descr = p.create_description(df_raw)
raw_descr_sheet_range = "".join(['raw_descr!E:', (chr(ord('E') + raw_descr.shape[1] - 1))])
ss.update(raw_descr, raw_descr_sheet_range)
ss.sheets['raw_descr'] = ss.get('raw_descr')


prospecting.report: INFO     Creating description for df_raw
prospecting.report: INFO     Success! "column_name" added to description at column position [0]
prospecting.report: INFO     Success! "dtype" added to description at column position [1]
prospecting.report: INFO     Success! "missing_count" added to description at column position [4]
prospecting.report: INFO     Description created for df_raw
prospecting.api: INFO     Update Successful!

Interlude...

At this point, I check out the raw_descr sheet and (at a bare minimum) set the keep_flag to indicate whether or not I want to drop the column when processing the dataset. I may also revisit options passed to pd.read_csv() as needed (ex. indicate which column should be used as the index).

Next, I populate the metadata sheet by first ensuring that all columns from the raw_descr sheet are listed under column_name, and then populate the configuration details for each column.

Once I am satisfied with the configuration in the metadata sheet, I load the metadata so that I can pass the most up-to-date processing logic to the clean_data function.



In [5]:
# Get most up-to-date processing logic as configured in the metadata sheet
md = ss.get('metadata', headerrow=1)

In [6]:
# clean df_raw, describe df_clean, create custom sheet range to control where data is written in sheet
df_clean = p.process.clean_data(df_raw, md)
clean_descr = p.create_description(df_clean)
clean_descr_sheet_range = "".join(['clean_descr!A:', (chr(ord('A') + raw_descr.shape[1] - 1))])
# For the sheet range, you can also just pass the sheet name (ex. 'clean_descr')
# if you don't need finite control over where data is written within a sheet


prospecting.process: INFO     Cleaning data...
prospecting.process: INFO     Dropping rows with NA for columns []
prospecting.process: INFO     Converting C3 int to object...
prospecting.process: INFO     Filling columns with NA
prospecting.process: INFO     Converting string to int columns
prospecting.process: INFO     Data cleaned...
prospecting.report: INFO     Creating description for df_clean
prospecting.report: INFO     Success! "column_name" added to description at column position [0]
prospecting.report: INFO     Success! "dtype" added to description at column position [1]
prospecting.report: INFO     Success! "missing_count" added to description at column position [4]
prospecting.report: INFO     Description created for df_clean

In [7]:
ss.update(clean_descr, clean_descr_sheet_range)  # Check `clean_descr` sheet in spreadsheet to see df_clean description
ss.sheets['clean_descr'] = ss.get('clean_descr')


prospecting.api: INFO     Update Successful!

In [8]:
innocentive_dummified = pd.get_dummies(df_clean)
print(innocentive_dummified.shape)


(642709, 308)

In [9]:
innocentive_dummified.__name__ = 'innocentive_dummified'
innocentive_dummified_descr = p.create_description(innocentive_dummified)
ss.update(innocentive_dummified_descr, 'dummified_descr')
ss.sheets['dummified_descr'] = ss.get('dummified_descr')


prospecting.report: INFO     Creating description for innocentive_dummified
prospecting.report: INFO     Success! "column_name" added to description at column position [0]
prospecting.report: INFO     Success! "dtype" added to description at column position [1]
prospecting.report: INFO     Success! "missing_count" added to description at column position [4]
prospecting.report: INFO     Description created for innocentive_dummified
prospecting.api: INFO     Update Successful!

In [10]:
# use a 15% subset of training data to use as an example
subset = innocentive_dummified.sample(n=(int(innocentive_dummified.shape[0]*.15)))
subset.shape


Out[10]:
(96406, 308)

In [11]:
# Pickle dataset for later use
p.utils.pckl(subset, "innocentive_dummified")


prospecting.utils: INFO     Pickled object to: C:\Users\Reid\projects\prospecting\repo\prospecting\data\innocentive_dummified_96406_308.p

APPENDIX


In [ ]:
#SOURCE_CSV = os.path.join(os.environ['DATADIR'], 'data.csv.gz')
#SOURCE_CSV = "https://storage.cloud.google.com/prospecting/data/innocentive/InnoCentive_Challenge_9933493_training_data.csv.gz"