This was the best 'serious' submission I made, and it would have placed #29 if I had picked it...


In [1]:
import kagglegym
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import xgboost as xgb
import math

import time

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge

from operator import itemgetter

# train/public split; use for local validation against the public period (timestamp > 905)
config_local = {'localrun': True, 'usepublic': True, 'vmode': False, 'all_features': False}

# validation mode (random non-stratified 80/20 split)
# in *some* versions vmode will auto-stop, others will need xgbcap
config_vmode = {'localrun': True, 'usepublic': False, 'vmode': True, 'all_features': False}

# just before submission run
config_presubmit = {'localrun': True, 'usepublic': True, 'vmode': False, 'all_features': False}

# submission run
config_submit = {'localrun': False, 'usepublic': False, 'vmode': False, 'all_features': False}

config = config_presubmit
xgbcap = 200 # reduce after a train/public run for vmode/LB runs

# I used individual variables earlier, fix later? ;)
localrun = config['localrun']
usepublic = config['usepublic']
vmode = config['vmode']
all_features = config['all_features'] # use with vmode for feature selection; run twice, cutting num_boost_round to the peak round

# This is taken from Frans Slothouber's post on the contest discussion forum.
# https://www.kaggle.com/slothouber/two-sigma-financial-modeling/kagglegym-emulation

def r_score(y_true, y_pred, sample_weight=None, multioutput=None):
    r2 = r2_score(y_true, y_pred, sample_weight=sample_weight,
                  multioutput=multioutput)
    r = (np.sign(r2)*np.sqrt(np.abs(r2)))
    if r <= -1:
        return -1
    else:
        return r
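
# Illustrative sanity check (not part of the pipeline): the metric is the
# signed square root of R^2, floored at -1, so a perfect prediction scores 1.
_toy = np.array([0.01, -0.02, 0.03])
assert abs(r_score(_toy, _toy) - 1.0) < 1e-12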

# From the xgboost script (along with the param settings)
    
# Function XGBOOST ########################################################
def xgb_obj_custom_r(y_pred, dtrain):
    y_true = dtrain.get_label()
    y_mean = np.mean(y_true)
    y_median = np.median(y_true)
    c1 = y_true
    #c1 = y_true - y_mean
    #c1 = y_true - y_median
    grad = 2*(y_pred-y_true)/(c1**2)
    hess = 2/(c1**2)
    return grad, hess
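
# Note: grad/hess above are the first and second derivatives of
# (y_pred - y_true)**2 / y_true**2 w.r.t. y_pred, i.e. squared error weighted
# by 1/y**2. It is defined here (with mean/median-centered variants commented
# out) but never wired into xgb.train below; to use it one would pass
# obj=xgb_obj_custom_r.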

def xgb_eval_custom_r(y_pred, dtrain):
    #y_pred = np.clip(y_pred, -0.075, .075)
#    y_pred[y_pred > .075] = .075
#    y_pred[y_pred < -.075] = -.075
    y_true = dtrain.get_label()
    ybar = np.sum(y_true)/len(y_true)
    ssres = np.sum((y_true - y_pred) ** 2)
    sstot = np.sum((y_true - ybar)**2)
    r2 = 1 - ssres/sstot
    error = np.sign(r2) * np.absolute(r2)**0.5
    return 'error', error
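
# Illustrative check of the eval metric (cheap, runs once): labels are stored
# as float32 inside a DMatrix, so a "perfect" prediction scores ~1.
_toy_d = xgb.DMatrix(np.zeros((3, 1)), label=np.array([0.01, -0.02, 0.03]))
assert xgb_eval_custom_r(_toy_d.get_label(), _toy_d)[1] > 0.999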

env = kagglegym.make()
o = env.reset()

#excl = [env.ID_COL_NAME, env.SAMPLE_COL_NAME, env.TARGET_COL_NAME, env.TIME_COL_NAME]
excl = ['id', 'sample', 'y', 'timestamp']
basecols = [c for c in o.train.columns if c not in excl]


#rcol_orig = ['Dtechnical_20', 'y_prev_pred_avg_diff', 'Dtechnical_21', 'technical_43_prev', 'technical_20', 'y_prev_pred_avgT0', 'y_prev_pred_mavg5', 'y_prev_pred_avgT1', 'fundamental_8_prev', 'Dtechnical_40', 'technical_7_prev', 'technical_7', 'fundamental_5', 'Dtechnical_30', 'technical_32_prev', 'technical_14_prev', 'fundamental_1', 'fundamental_43_prev', 'Dfundamental_22', 'Dtechnical_35', 'Dtechnical_6', 'Dtechnical_17', 'Dtechnical_27', 'Dfundamental_42', 'fundamental_1_prev', 'Dtechnical_0', 'technical_40', 'technical_40_prev', 'fundamental_36', 'Dfundamental_33', 'Dfundamental_48', 'technical_27_prev', 'fundamental_62_prev', 'fundamental_41_prev', 'Dfundamental_50', 'fundamental_48', 'derived_2_prev', 'Dtechnical_18', 'fundamental_35', 'Dfundamental_49', 'fundamental_26_prev', 'technical_28_prev', 'Dfundamental_63', 'fundamental_10_prev', 'fundamental_36_prev', 'fundamental_16', 'Dfundamental_8', 'fundamental_32', 'fundamental_40_prev', 'derived_0', 'Dfundamental_32', 'fundamental_17', 'Dtechnical_7', 'fundamental_25', 'technical_35', 'Dtechnical_19', 'technical_35_prev', 'fundamental_8', 'Dtechnical_32', 'Dfundamental_18', 'Dtechnical_37', 'fundamental_33_prev', 'Dtechnical_28', 'fundamental_46', 'Dfundamental_1', 'Dfundamental_45', 'fundamental_18', 'technical_12', 'technical_44', 'fundamental_22', 'Dtechnical_5', 'technical_17_prev', 'Dfundamental_25']
rcol_orig = ['y_prev_pred', 'y_prev_pred_mstd5', 'Dtechnical_20', 'technical_43_prev', 'technical_7', 'y_prev_pred_avg_diff', 'y_prev_pred_avgT0', 'y_prev_pred_mavg5', 'technical_7_prev', 'technical_20', 'technical_40_prev', 'y_prev_pred_avgT1', 'Dtechnical_40', 'Dtechnical_30', 'technical_40', 'fundamental_36_prev', 'fundamental_5', 'fundamental_8_prev', 'technical_35', 'Dtechnical_21', 'fundamental_36', 'fundamental_43_prev', 'fundamental_46', 'fundamental_18', 'Dtechnical_35', 'Dtechnical_0', 'Dfundamental_45', 'fundamental_48', 'fundamental_1_prev', 'Dtechnical_27', 'Dfundamental_50', 'Dfundamental_18', 'fundamental_16', 'Dfundamental_48', 'Dtechnical_6', 'fundamental_40_prev', 'fundamental_26_prev', 'Dfundamental_8', 'Dtechnical_19', 'fundamental_25', 'fundamental_8', 'fundamental_10_prev', 'technical_35_prev', 'technical_14_prev', 'fundamental_1', 'Dtechnical_37', 'Dfundamental_49', 'Dtechnical_18', 'Dfundamental_42', 'fundamental_41_prev', 'fundamental_62_prev', 'technical_12', 'technical_17_prev', 'technical_27_prev', 'Dtechnical_17', 'derived_0', 'fundamental_33_prev', 'fundamental_32', 'fundamental_17', 'Dtechnical_32', 'technical_32_prev', 'Dfundamental_22', 'fundamental_22', 'fundamental_35', 'Dfundamental_32', 'Dtechnical_7', 'Dfundamental_1', 'technical_28_prev', 'Dtechnical_28', 'Dfundamental_25', 'Dfundamental_63', 'Dtechnical_5', 'technical_44', 'Dfundamental_33', 'derived_2_prev']
rcol = rcol_orig.copy()

if all_features:
    rcol = []
    for c in basecols:
        rcol.append(c)
        rcol.append(c + '_prev')
        rcol.append('D' + c)
        
    rcol += ['y_prev_pred_avg_diff', 'y_prev_pred_avgT0', 'y_prev_pred_mavg5', 'y_prev_pred_avgT1']
    rcol_orig = rcol.copy()
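
# With all_features on, rcol covers every base column plus its previous-day
# value ('_prev') and one-day difference ('D...'); the intent is to train on
# everything once, then paste the importance-pruned list printed at the end
# back into rcol_orig above.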

# superseded by the loop below (which also adds the 'D' diff columns):
#backy_fset = ['technical_13', 'technical_20', 'technical_13_prev', 'technical_20_prev', 'technical_30_prev', 'technical_30']

backy_fset = []
for c in ['technical_13', 'technical_20', 'technical_30']:
    backy_fset.append(c)
    backy_fset.append(c + '_prev')
    backy_fset.append('D' + c)

#backy_fset =  ['technical_20', 'Dtechnical_20', 'technical_20_prev', 'Dtechnical_43', 'Dtechnical_7', 'technical_40', 'Dtechnical_40', 'technical_30', 'Dtechnical_30', 'technical_30_prev', 'Dfundamental_5', 'fundamental_8', 'Dfundamental_8', 'Dtechnical_35', 'technical_21', 'Dtechnical_21', 'technical_21_prev', 'Dfundamental_43', 'technical_0', 'Dtechnical_0', 'Dtechnical_27', 'Dfundamental_16', 'technical_6', 'Dtechnical_6', 'fundamental_40', 'Dfundamental_40', 'fundamental_40_prev', 'Dtechnical_19', 'Dtechnical_14', 'Dtechnical_37', 'technical_37_prev', 'Dfundamental_49', 'Dtechnical_18', 'Dfundamental_62', 'Dtechnical_12', 'technical_17', 'Dtechnical_17', 'technical_32', 'Dtechnical_32', 'technical_32_prev', 'Dfundamental_22', 'Dfundamental_35', 'technical_28', 'Dtechnical_28', 'technical_28_prev', 'fundamental_63', 'Dfundamental_63', 'technical_5', 'Dtechnical_5', 'technical_5_prev', 'technical_44', 'Dtechnical_44', 'technical_44_prev', 'technical_13', 'Dtechnical_13', 'technical_13_prev']
for f in backy_fset:
    if f not in rcol:
        rcol.append(f)

def get_basecols(rcol):
    duse = {}

    for r in rcol:
        if r.startswith('y'):  # skip y and the y_prev_pred_* helper features
            continue

        if r[0] == 'D':  # one-day diff: strip the 'D' prefix
            duse[r[1:]] = True
        elif r.endswith('_prev'):  # previous-day value: strip the suffix
            duse[r[:-5]] = True
        elif r in basecols:
            duse[r] = True

    return list(duse.keys())

basecols_touse = get_basecols(rcol)
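
# For example (ordering follows dict insertion, so it may vary):
#   get_basecols(['Dtechnical_20', 'technical_43_prev', 'fundamental_5'])
#   -> ['technical_20', 'technical_43', 'fundamental_5']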

if vmode:
    train = pd.read_hdf('../input/train.h5')
else:
    train = o.train.copy()

d_mean = o.train[basecols_touse].median(axis=0)  # despite the name, these are medians
for c in basecols_touse:
    d_mean[c + '_prev'] = d_mean[c]
    d_mean['D' + c] = 0

median = {t[0]:t[1] for t in zip(d_mean.index, d_mean.values)}
median['y'] = 0
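
# (the 'median' dict mirrors d_mean as a plain Python dict; the code below
# only ever uses d_mean, so it appears to be a leftover)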

print('processed medians')

class DataPrep:
    
    def __init__(self, yprev_model = None, keepinput = True, cols = rcol, seed = 2017):
        self.previnput = None
        self.prevavg = 0
        self.cols = cols.copy()
        
        self.basecols = get_basecols(self.cols)
        self.keepcols = ['y', 'id', 'timestamp'] + self.basecols
        
        self.allinput = [] if keepinput else None
        
        self.dayavg = []
        
        self.yprev_model = yprev_model
        
        np.random.seed(seed)
        self.random_state = np.random.get_state()
        
    def procday(self, day_in):
        
        if 'y' not in day_in and 'y' in self.keepcols:
            self.keepcols.remove('y')
        
        day = day_in[self.keepcols].copy()
        
        notinnew = []
        
        if self.previnput is not None:
            olen = len(day)
            day = pd.merge(day, self.previnput, on='id', how = 'left', suffixes=['', '_prev'])
            notinnew = self.previnput[~self.previnput.id.isin(day_in.id)].copy()
            #print(day.iloc[0].timestamp, len(notinnew))
        else:
            for c in self.basecols:
                day[c + '_prev'] = np.full_like(day[c], 0, dtype=np.float32)
                #day[c + '_prev'] = np.zeros_like(day[c], dtype=np.float32)
        
        for c in self.cols:
            if c == 'y_prev_pred':
                continue

            if c[0] == 'D':
                day[c] = day[c[1:]] - day[c[1:] + '_prev']
                
        self.previnput = day_in[self.keepcols].copy()
        if len(notinnew) > 0:
            self.previnput = self.previnput.append(notinnew[self.keepcols])
        
        if self.yprev_model:
            day['y_prev_pred'] = self.yprev_model.predict(day[backy_fset].fillna(d_mean).values.reshape(-1,len(backy_fset)))

            avg = day.y_prev_pred.mean()

            self.dayavg.append(avg)
            day['y_prev_pred_mavg5'] = np.ma.average(np.array(self.dayavg[-5:]))#, weights=range(1, len(self.dayavg[-10:]) + 1))
            day['y_prev_pred_min5'] = day.y_prev_pred - day.y_prev_pred_mavg5
            day['y_prev_pred_mavg5d'] = avg - np.ma.average(np.array(self.dayavg[-5:]))
            
            day['y_prev_pred_mstd5'] = np.std(np.array(self.dayavg[-5:]))
            
            day['y_prev_pred_mavg9'] = np.ma.average(np.array(self.dayavg[-9:]))#, weights=range(1, len(self.dayavg[-10:]) + 1))
            day['y_prev_pred_mavg20'] = np.ma.average(np.array(self.dayavg[-20:]))
            day['y_prev_pred_mavg40'] = np.ma.average(np.array(self.dayavg[-40:]))
            
            day['y_prev_pred_avgT1'] = self.prevavg
            day['y_prev_pred_avgT0'] = avg
            day['y_prev_pred_avg_diff'] = avg - self.prevavg

            self.prevavg = avg
            
        np.random.set_state(self.random_state)
        day['random'] = np.random.random(len(day))
        self.random_state = np.random.get_state()
            
        if self.allinput is not None:
            self.allinput.append(day.copy())

        return day
    
    def run(self, df):
        assert self.allinput is not None
        
        for g in df.groupby('timestamp'):
            self.procday(g[1])
            
        return pd.concat(self.allinput)
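
# In short, procday() left-joins each day's rows to the previous day's by id
# (producing the '_prev' columns), derives the one-day diffs ('D...'), and,
# when a yprev_model is supplied, adds its y_prev prediction plus rolling
# day-average features (5/9/20/40-day means, 5-day std, day-over-day diffs).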

yptrain = DataPrep(keepinput=True, cols=backy_fset).run(train)

#yptrain_prep = tmp.run(yptrain)

yptrain.sort_values(['id', 'timestamp'], inplace=True)

ypmodel = LinearRegression(n_jobs=-1)
low_y_cut = -0.0725 # note: slightly tighter than the -0.075 clip used later
high_y_cut = 0.075

mask = np.logical_and(yptrain.y > low_y_cut, yptrain.y < high_y_cut)
#for f in backy_fset:
#    mask = np.logical_and(mask, ~yptrain[f].isnull())
    
yptraina = yptrain[mask]
ypmodel.fit(yptraina[backy_fset].fillna(d_mean).values.reshape(-1,len(backy_fset)), yptraina.y_prev.fillna(0))

print(len(yptraina), ypmodel.coef_, ypmodel.intercept_)

preds = ypmodel.predict(yptrain[backy_fset].fillna(d_mean).values.reshape(-1,len(backy_fset)))
print(r_score(yptrain.y_prev.fillna(0), preds))
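
# (This backs out an estimate of the previous day's y from the
# technical_13/20/30 family -- the lagged-y relationship widely discussed on
# the competition forums -- which then feeds the y_prev_pred features in the
# main DataPrep pass below.)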

d_mean['y'] = 0

start = time.time()

print('beginning train processing')
train = DataPrep(keepinput = True, yprev_model = ypmodel).run(train)

endt = time.time()
print(endt - start)

dcol = [c for c in train.columns if c not in excl]

if usepublic:
    data_all = pd.read_hdf('../input/train.h5')

    #public = data_all[data_all.timestamp > 905]
    allpublic = DataPrep(yprev_model = ypmodel, keepinput=True).run(data_all)
    public = DataPrep(yprev_model = ypmodel, keepinput=True).run(data_all[data_all.timestamp > 905])

print(r_score(train.y_prev.fillna(0), train.y_prev_pred))

if usepublic:
    print(r_score(public.y_prev.fillna(0), public.y_prev_pred))

train.sort_values(['id', 'timestamp'], inplace=True)

print('prepping xgb now')
#xtrain, xvalid = train_test_split(train, test_size = 0.2, random_state = 2017)

if not vmode: # submission-style: keep every timestamp, only trim extreme y
    rthreshold = 1.0
    xtmask = train.random <= rthreshold  # rthreshold = 1.0 keeps all rows
    xtmask = np.logical_and(xtmask, train['y'] > -.0189765)
    xtmask = np.logical_and(xtmask, train['y'] < .0189765)

    xtrain = train[xtmask]
    xvalid = train[~xtmask]
else: # use older split to compare with say the .0202/.0203 subs
    xtrain, xvalid = train_test_split(train, test_size = 0.2, random_state = 2017)

    xtrain = xtrain[np.abs(xtrain['y']) < 0.018976588919758796]
    

cols_to_use = [c for c in rcol if c in xtrain.columns and c in rcol_orig]

# Convert to XGB format
to_drop = ['timestamp', 'y']  # (unused; cols_to_use already excludes these)

train_xgb = xgb.DMatrix(data=xtrain[cols_to_use], label=xtrain['y'])
valid_xgb = xgb.DMatrix(data=xvalid[cols_to_use], label=xvalid['y'])

evallist = [(valid_xgb, 'valid'), (train_xgb, 'train')]

if usepublic:
    public_xgb = xgb.DMatrix(data=public[cols_to_use], label=public['y'])

    evallist = [(train_xgb, 'train'), (valid_xgb, 'xvalid'), (public_xgb, 'public')]

print('xtrain+valid')

params = {
    'objective': 'reg:linear'
    ,'eta': 0.04
    ,'max_depth': 3
    , 'subsample': 0.9
    #, 'colsample_bytree': 1
    ,'min_child_weight': 3072 # 3 * 2**10
    #,'gamma': 100
    , 'seed': 10
    #, 'base_score': xtrain.y.mean()
}
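
# Note: a min_child_weight this large forces every leaf of the depth-3 trees
# to cover thousands of rows -- presumably the main guard against overfitting
# this very noisy target.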

model = []
for seed in [10000]:
    params['seed'] = seed
    model.append(xgb.train(params.items()
                  , dtrain=train_xgb
                  , num_boost_round=xgbcap # 240 was best_ntree_limit from a train/public split run
                  , evals=evallist
                  , early_stopping_rounds=50
                  , maximize=True
                  , verbose_eval=10
                  , feval=xgb_eval_custom_r
                  ))

if not localrun:
    del train_xgb
    del valid_xgb
    if usepublic:
        del public_xgb

print('xgb done, linear now')

lin_features = ['Dtechnical_20', 'technical_20', 'Dtechnical_21']

def prep_linear(df, c = lin_features):
    df_tmp = df.fillna(d_mean)
    m2mat = np.zeros((len(df), len(c)))
    for i in range(len(c)):
        m2mat[:,i] = df_tmp[c[i]].values
    
    return m2mat
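
# An equivalent, more compact form (sketch; output dtype may differ from the
# float64 m2mat):
#   def prep_linear(df, c=lin_features):
#       return df.fillna(d_mean)[list(c)].values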

# Observed with histograms:
#https://www.kaggle.com/bguberfain/two-sigma-financial-modeling/univariate-model-with-clip/run/482189
low_y_cut = -0.075
high_y_cut = 0.075
traincut = train[np.logical_and(train.y > low_y_cut, train.y < high_y_cut)][['y'] + lin_features].copy().fillna(d_mean)

model2 = LinearRegression(n_jobs=-1)
model2.fit(prep_linear(traincut), traincut.y)

print('linear done')

def update_model(m, params_in, cols_to_use):

    params = params_in.copy()
    
    params.update({'process_type': 'update',
                       'updater'     : 'refresh',
                       'refresh_leaf': False})

    m_train = xgb.train(params, train_xgb, m.best_ntree_limit, xgb_model=m)
    m_test = xgb.train(params, public_xgb, m.best_ntree_limit, xgb_model=m)

    imp = pd.DataFrame(index=cols_to_use)
    imp['train'] = pd.Series(m_train.get_score(importance_type='gain'), index=cols_to_use)
#    imp['valid'] = pd.Series(m_valid.get_score(importance_type='gain'), index=cols_to_use)
    imp['test'] = pd.Series(m_test.get_score(importance_type='gain'), index=cols_to_use)

    imp = imp.fillna(0)
    
    return m_train, m_test, imp
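
# The 'refresh' updater re-walks the already-built trees over a new DMatrix,
# recomputing per-feature gain statistics without changing tree structure
# (refresh_leaf=False leaves the leaf values alone too). Comparing train vs.
# public gains is what drives the feature pruning in the all_features branch.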

if vmode:
    preds_xgb = model[0].predict(valid_xgb, ntree_limit=model[0].best_ntree_limit)
    preds_linear = model2.predict(prep_linear(xvalid))
    
    preds = (preds_xgb * 0.7) + (preds_linear * 0.3)
    #preds = preds_xgb
    
    rs = kagglegym.r_score(xvalid.y, preds)
    
    ID = 'expv-{0}.pkl'.format(int(rs * 10000000))
    print(rs, ID)
    
    #ID = 'subv-203172.pkl' # if actual submission
    
    output = xvalid[['id', 'timestamp', 'y']].copy()
    output['y_hat'] = preds
    output['y_hat_xgb'] = preds_xgb
    output['y_hat_linear'] = preds_linear
    
    output.to_pickle(ID)

if all_features:
    m = model[0]

    fs = m.get_fscore()
    fsl = [(f,fs[f]) for f in fs.keys()]
    fsl = sorted(fsl, key=itemgetter(1), reverse=True)

    print(len(fsl))

    #print('rcol =', [f[0] for f in fsl])
    
    m_train, m_test, imp = update_model(model[0], params, cols_to_use)

    imp['train_test'] = imp.train * imp.test

    imp = imp.sort_values('train_test', ascending=False)
    imp.train = imp.train / imp.train.max()
    imp.test = imp.test / imp.test.max()
    
    impr = imp[imp.train_test > 0]
    
    print('rcol_orig =', list(impr.index))


#len(impr), len(imp)

start = time.time()        

dprep = DataPrep(yprev_model = ypmodel, keepinput=localrun)
        

if localrun:
    env = kagglegym.make()
    o = env.reset()

while True:
    test_preproc = o.features.copy()
    
    #if c in basecols:
        #test_preproc.fillna(d_mean, inplace=True)
    
    test = dprep.procday(test_preproc)
    
    #test.fillna(0, inplace=True)
    
    test_xgb = xgb.DMatrix(data=test.drop(['id', 'timestamp'], axis=1)[cols_to_use])

    xgbpreds = np.zeros(len(test), dtype=np.float64)
    
    for m in model:
        xgbpreds += m.predict(test_xgb, ntree_limit=m.best_ntree_limit)

    xgbpreds /= len(model)
    
    # blend: 70% XGB ensemble average + 30% linear model; only the linear
    # component is clipped to the observed y range
    test_y = (xgbpreds * .7) + (model2.predict(prep_linear(test)).clip(low_y_cut, high_y_cut) * 0.3)
    
    o.target['y'] = test_y
    target = o.target
    
    timestamp = o.features["timestamp"][0]
    if timestamp % 100 == 0:
        print("Timestamp #{0} {1}".format(timestamp, time.time() - start))
        start = time.time()

    # We perform a "step" by making our prediction and getting back an updated "observation":
    o, reward, done, info = env.step(target)
    if done:
        print("Public score: {}".format(info["public_score"]))
        break


processed medians

[Traceback trimmed: this published run was interrupted (KeyboardInterrupt)
while DataPrep.run() was still deriving the 'D...' columns over the training
data. The nested pandas KeyError on 'Dtechnical_30' is just DataFrame column
insertion ("item wasn't present, insert at end"), not a bug.]

In [ ]: