Contest entry by Wouter Kimman

Strategy:

Stacking, combined with selective, optimized use of the training data.
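
For orientation, a minimal, self-contained sketch of the stacking idea (toy data and generic names only, not the contest pipeline; the actual workflow below additionally splits the training wells into "regimes" and adds facies-specific binary models):


In [ ]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# toy data standing in for the well logs (purely illustrative)
rng = np.random.RandomState(0)
X_A, y_A = rng.rand(100, 5), rng.randint(1, 10, 100)
X_B, y_B = rng.rand(100, 5), rng.randint(1, 10, 100)
X_blind = rng.rand(50, 5)

# stage 1: base models trained on part A predict part B and the blind well
base_models = [KNeighborsClassifier(n_neighbors=5), RandomForestClassifier(n_estimators=50)]
X_B_stacked, X_blind_stacked = X_B.copy(), X_blind.copy()
for m in base_models:
    m.fit(X_A, y_A)
    X_B_stacked = np.column_stack([X_B_stacked, m.predict(X_B)])
    X_blind_stacked = np.column_stack([X_blind_stacked, m.predict(X_blind)])

# stage 2: a meta-model trained on part B, using the stage-1 predictions as extra features
meta = RandomForestClassifier(n_estimators=200)
meta.fit(X_B_stacked, y_B)
final_prediction = meta.predict(X_blind_stacked)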


In [81]:
from numpy.fft import rfft
from scipy import signal
import numpy as np
import matplotlib.pyplot as plt
import plotly.plotly as py
import pandas as pd
import timeit
from sqlalchemy.sql import text
from sklearn import tree
#from sklearn.model_selection import LeavePGroupsOut
from sklearn import metrics
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
#import sherlock.filesystem as sfs
#import sherlock.database as sdb
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
def permute_facies_nr(predicted_super, predicted0, faciesnr):
    # Overwrite the prediction with `faciesnr` wherever the binary "specialist"
    # prediction flags a sample (predicted_super == 1).
    predicted=predicted0.copy()
    N=len(predicted)
    for ii in range(N):
        if predicted_super[ii]==1:
            predicted[ii]=faciesnr  
    return predicted
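
A tiny illustration with made-up arrays:


In [ ]:
# positions flagged 1 in the binary prediction are overwritten with the given facies number
print permute_facies_nr(np.array([0, 1, 0, 1]), np.array([2, 2, 3, 3]), 9)
# -> [2 9 3 9]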

In [3]:
def binarify(dataset0, facies_nr):
    # Turn the facies vector into a binary target: 1 for facies_nr, 0 for everything else.
    dataset=dataset0.copy()
    mask=dataset != facies_nr
    dataset[mask]=0
    mask=dataset == facies_nr
    dataset[mask]=1    
    return dataset

In [4]:
def make_balanced_binary(df_in, faciesnr, factor):
    # Build a roughly balanced binary dataset for one facies: keep every sample of
    # facies `faciesnr` (label 1) plus a random subset of the remaining samples
    # (label 0), roughly `factor` negatives per positive.
    df=df_in.copy()
    y=df['Facies'].values
    y0=binarify(y, faciesnr)
    df['Facies']=y0

    df1=df[df['Facies']==1]
    X_part1=df1.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
    y_part1=df1['Facies'].values
    N1=len(df1)

    df2=df[df['Facies']==0]
    X_part0=df2.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
    y_part0=df2['Facies'].values
    N2=len(df2)
    print "ratio now:"
    print float(N2)/float(N1)
    ratio_to_keep=factor*float(N1)/float(N2)
    print "ratio after:"
    print float(N2)/(factor*float(N1))
    dum1, X_part2, dum2, y_part2 = train_test_split(X_part0, y_part0, test_size=ratio_to_keep, random_state=42)

    tmp=[X_part1, X_part2]  
    X = pd.concat(tmp, axis=0)
    y = np.concatenate((y_part1, y_part2))
    return X, y

In [5]:
# Older versions of the Phase I helpers, kept for reference: these also reported
# an f1 score against known labels when the "blind" well was a labelled training well.
#def phaseI_model(correct_facies_labels, features_blind,regime_train, correctA, go_B, clf, pred_array, pred_blind):   
#    clf.fit(regime_train,correctA)     
#    predicted_B = clf.predict(go_B)
#    pred_array = np.vstack((predicted_B, pred_array))    
#    predicted_blind1 = clf.predict(features_blind)
#    out_f1=metrics.f1_score(correct_facies_labels, predicted_blind1, average = 'micro')
#    print "f1 score on the prediction of blind"
#    print out_f1
#    pred_blind = np.vstack((predicted_blind1, pred_blind))
#    return pred_array, pred_blind
#
#def phaseI_model_scaled(correct_facies_labels, features_blind,regime_train, correctA, go_B, clf, pred_array, pred_blind):   
#    regime_train=StandardScaler().fit_transform(regime_train)
#    go_B=StandardScaler().fit_transform(go_B)
#    features_blind=StandardScaler().fit_transform(features_blind)
#    clf.fit(regime_train,correctA)     
#    predicted_B = clf.predict(go_B)
#    pred_array = np.vstack((predicted_B, pred_array))    
#    predicted_blind1 = clf.predict(features_blind)
#    out_f1=metrics.f1_score(correct_facies_labels, predicted_blind1, average = 'micro')
#    print "f1 score on the prediction of blind"
#    print out_f1
#    pred_blind = np.vstack((predicted_blind1, pred_blind))
#    return pred_array, pred_blind

In [50]:
def phaseI_model(regime_train, correctA, go_B, clf, pred_array, pred_blind, features_blind):      
    # Fit clf on (a selection of) part A, then stack its predictions for part B and
    # for the blind well on top of the running prediction arrays (Phase II features).
    clf.fit(regime_train,correctA)     
    predicted_B = clf.predict(go_B)
    pred_array = np.vstack((predicted_B, pred_array))   
    predicted_blind1 = clf.predict(features_blind)
    pred_blind = np.vstack((predicted_blind1, pred_blind))    
    return pred_array, pred_blind

def phaseI_model_scaled(regime_train, correctA, go_B, clf, pred_array, pred_blind, features_blind):   
    # Same as phaseI_model, but each dataset is standardised first
    # (note: each set is scaled with its own statistics).
    regime_train=StandardScaler().fit_transform(regime_train)
    go_B=StandardScaler().fit_transform(go_B)
    features_blind=StandardScaler().fit_transform(features_blind)
    clf.fit(regime_train,correctA)     
    predicted_B = clf.predict(go_B)
    pred_array = np.vstack((predicted_B, pred_array))
    predicted_blind1 = clf.predict(features_blind)
    pred_blind = np.vstack((predicted_blind1, pred_blind))
    return pred_array, pred_blind

In [6]:
def create_structure_for_regimes(df):
    # Summarise each well by descriptive statistics (percentiles, mean, std, min, max)
    # of every log; one row per well, later used to cluster the wells into regimes.
    allfeats=['GR','ILD_log10','DeltaPHI','PHIND','PE','NM_M','RELPOS']
    data_all = []
    for feat in allfeats:
        dff=df.groupby('Well Name').describe(percentiles=[0.1, 0.25, .5, 0.75, 0.9]).reset_index().pivot(index='Well Name', values=feat, columns='level_1')
        dff = dff.drop(['count'], axis=1)
        cols=dff.columns
        cols_new=[]
        for ii in cols:
            strin=feat + "_" + str(ii)
            cols_new.append(strin)
        dff.columns=cols_new 
        dff1=dff.reset_index()
        if feat=='GR':
            data_all.append(dff1)
        else:
            data_all.append(dff1.iloc[:,1:])
    data_all = pd.concat(data_all,axis=1)
    return data_all

This is the only feature engineering used:


In [7]:
def magic(df):
    # Feature engineering: for GR (and, more sparingly, PHIND) add the median-filtered
    # log, shifted copies of it (by -1, +1, +3 and +4 samples), a high-pass filtered
    # version of the raw log and the gradient of the de-meaned log.
    df1=df.copy()
    b, a = signal.butter(2, 0.2, btype='high', analog=False)
    feats0=['GR','ILD_log10','DeltaPHI','PHIND','PE','NM_M','RELPOS']
    #feats01=['GR','ILD_log10','DeltaPHI','PHIND']
    #feats01=['DeltaPHI']
    #feats01=['GR','DeltaPHI','PHIND']
    feats01=['GR',]
    feats02=['PHIND']
    #feats02=[]
    for ii in feats0:
        df1[ii]=df[ii]
        name1=ii + '_1'
        name2=ii + '_2'
        name3=ii + '_3'
        name4=ii + '_4'
        name5=ii + '_5'
        name6=ii + '_6'
        name7=ii + '_7'
        name8=ii + '_8'
        name9=ii + '_9'
        xx1 = list(df[ii])
        xx_mf= signal.medfilt(xx1,9)
        x_min1=np.roll(xx_mf, 1)
        x_min2=np.roll(xx_mf, -1)
        x_min3=np.roll(xx_mf, 3)
        x_min4=np.roll(xx_mf, 4)
        xx1a=xx1-np.mean(xx1)
        xx_fil = signal.filtfilt(b, a, xx1)        
        xx_grad=np.gradient(xx1a) 
        x_min5=np.roll(xx_grad, 3)
        #df1[name4]=xx_mf
        if ii in feats01: 
            df1[name1]=x_min3
            df1[name2]=xx_fil
            df1[name3]=xx_grad
            df1[name4]=xx_mf 
            df1[name5]=x_min1
            df1[name6]=x_min2
            df1[name7]=x_min4
            #df1[name8]=x_min5
            #df1[name9]=x_min2
        if ii in feats02:
            df1[name1]=x_min3
            df1[name2]=xx_fil
            df1[name3]=xx_grad
            #df1[name4]=xx_mf 
            df1[name5]=x_min1
            #df1[name6]=x_min2 
            #df1[name7]=x_min4
    return df1
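
A quick way to see what magic() adds, as a sketch on a tiny synthetic frame whose column names follow the contest logs (values are random and purely for illustration):


In [ ]:
# Sketch: list the feature columns magic() adds
demo = pd.DataFrame({feat: np.random.rand(20)
                     for feat in ['GR','ILD_log10','DeltaPHI','PHIND','PE','NM_M','RELPOS']})
print sorted(set(magic(demo).columns) - set(demo.columns))
# expected: GR_1..GR_7 plus PHIND_1, PHIND_2, PHIND_3 and PHIND_5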

In [8]:
#filename = 'training_data.csv'
filename = 'facies_vectors.csv'
training_data0 = pd.read_csv(filename)
filename = 'validation_data_nofacies.csv'
test_data = pd.read_csv(filename)

In [10]:
all_wells=training_data0['Well Name'].unique()
print all_wells


['SHRIMPLIN' 'ALEXANDER D' 'SHANKLE' 'LUKE G U' 'KIMZEY A' 'CROSS H CATTLE'
 'NOLAN' 'Recruit F9' 'NEWBY' 'CHURCHMAN BIBLE']

In [11]:
# What to do with the NaNs: fill missing PE values with the well's own median
# where possible, otherwise with the median over all wells.
training_data1=training_data0.copy()
me_tot=training_data1['PE'].median()
print me_tot
for well in all_wells:
    df=training_data0[training_data0['Well Name'] == well] 
    print well
    print len(df)
    df0=df.dropna()
    #print len(df0)
    if len(df0) > 0:
        print "using median of local"
        me=df['PE'].median()
        df=df.fillna(value=me)
    else:
        print "using median of total"
        df=df.fillna(value=me_tot)
    training_data1[training_data0['Well Name'] == well] =df
    

print len(training_data1)
df0=training_data1.dropna()
print len(df0)


3.5515
SHRIMPLIN
471
using median of local
ALEXANDER D
466
using median of total
SHANKLE
449
using median of local
LUKE G U
461
using median of local
KIMZEY A
439
using median of total
CROSS H CATTLE
501
using median of local
NOLAN
415
using median of local
Recruit F9
80
using median of local
NEWBY
463
using median of local
CHURCHMAN BIBLE
404
using median of local
4149
4149
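
The same per-well fill can be written more compactly with a groupby/transform; a sketch of a roughly equivalent alternative (not the code that produced the output above):


In [ ]:
# Sketch: per-well PE median fill with a global-median fallback
# (only PE contains NaNs, so this mirrors the loop above)
pe_local = training_data0.groupby('Well Name')['PE'].transform(lambda s: s.fillna(s.median()))
training_data1_alt = training_data0.copy()
training_data1_alt['PE'] = pe_local.fillna(training_data0['PE'].median())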

In [12]:
# Remove outliers: drop samples more than 8 standard deviations from the mean in any log
df=training_data1.copy()
print len(df)
df0=df.dropna()
print len(df0)
df1 = df0.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
print len(df1)
df2=df0[(np.abs(stats.zscore(df1))<8).all(axis=1)]
print len(df2)


4149
4149
4149
4143

In [13]:
df2a=df2[df2['Well Name'] != 'Recruit F9']

In [14]:
data_all=create_structure_for_regimes(df2a)
data_test=create_structure_for_regimes(test_data)

In [180]:
data_test


Out[180]:
Well Name GR_25% GR_50% GR_75% GR_max GR_mean GR_min GR_std ILD_log10_25% ILD_log10_50% ... NM_M_mean NM_M_min NM_M_std RELPOS_25% RELPOS_50% RELPOS_75% RELPOS_max RELPOS_mean RELPOS_min RELPOS_std
0 CRAWFORD 41.1465 58.413 73.55225 167.803 58.666020 16.197 26.033591 0.46725 0.669 ... 1.741573 1.0 0.438386 0.2775 0.537 0.7765 1.0 0.52623 0.017 0.287468
1 STUART 34.5620 57.851 72.50275 220.413 56.819901 12.036 28.600303 0.57525 0.677 ... 1.630802 1.0 0.483098 0.3180 0.550 0.7795 1.0 0.54300 0.013 0.279796

2 rows × 50 columns

K-means clustering to find natural clusters of wells:


In [624]:
frames = [data_all, data_test]
X = pd.concat(frames)
X


Out[624]:
Well Name GR_10% GR_25% GR_50% GR_75% GR_90% GR_max GR_mean GR_min GR_std ... NM_M_std RELPOS_10% RELPOS_25% RELPOS_50% RELPOS_75% RELPOS_90% RELPOS_max RELPOS_mean RELPOS_min RELPOS_std
0 ALEXANDER D 25.9700 44.8650 72.285 88.92750 100.7850 168.430 68.280236 13.340 28.635640 ... 0.497648 0.1125 0.2670 0.518 0.76125 0.9105 1.0 0.515584 0.000 0.289040
1 CHURCHMAN BIBLE 33.6532 41.3755 59.203 74.31250 95.6691 222.500 63.683453 14.469 33.333992 ... 0.448185 0.1265 0.2860 0.567 0.80000 0.9290 1.0 0.544317 0.011 0.293647
2 CROSS H CATTLE 50.4050 60.1980 73.595 84.81900 94.6482 184.021 72.423513 13.893 20.851035 ... 0.458631 0.1066 0.2580 0.500 0.75000 0.9000 1.0 0.503288 0.013 0.288401
3 KIMZEY A 23.5474 34.8140 56.582 66.58800 73.8642 245.360 53.334339 10.149 24.761446 ... 0.487142 0.1328 0.3040 0.548 0.78500 0.9202 1.0 0.539453 0.010 0.284105
4 LUKE G U 29.6800 48.4500 66.190 78.49000 87.3500 195.890 64.777223 13.530 27.427174 ... 0.499410 0.1190 0.2630 0.522 0.76100 0.9120 1.0 0.514095 0.010 0.286902
5 NEWBY 28.4880 38.1100 60.330 71.44500 85.7180 305.870 60.733045 13.920 33.620632 ... 0.486991 0.1234 0.2930 0.535 0.77150 0.9198 1.0 0.528138 0.010 0.285096
6 NOLAN 23.7500 43.3905 72.375 88.50000 102.5250 247.500 68.693940 13.250 32.730216 ... 0.499834 0.1336 0.3070 0.577 0.79350 0.9206 1.0 0.549007 0.017 0.285563
7 SHANKLE 35.0760 54.9600 66.600 75.15000 83.1820 242.750 65.431180 18.400 25.696418 ... 0.474493 0.1188 0.2580 0.500 0.74400 0.8952 1.0 0.503118 0.010 0.282082
8 SHRIMPLIN 28.6580 46.8300 69.300 85.85000 96.3580 274.000 68.169936 13.280 32.170245 ... 0.500342 0.1110 0.2590 0.511 0.76500 0.9154 1.0 0.514399 0.010 0.290373
0 CRAWFORD 24.9075 41.1465 58.413 73.55225 84.7215 167.803 58.666020 16.197 26.033591 ... 0.438386 0.1240 0.2775 0.537 0.77650 0.9145 1.0 0.526230 0.017 0.287468
1 STUART 24.1612 34.5620 57.851 72.50275 82.7020 220.413 56.819901 12.036 28.600303 ... 0.483098 0.1436 0.3180 0.550 0.77950 0.9257 1.0 0.543000 0.013 0.279796

11 rows × 64 columns


In [626]:
X1 = X.drop(['Well Name'], axis=1)

In [630]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2).fit(X1)
kmeans.labels_


Out[630]:
array([1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0], dtype=int32)

In [1793]:
kmeans = KMeans(n_clusters=4).fit(X1)
kmeans.labels_


Out[1793]:
array([0, 2, 0, 3, 0, 1, 1, 2, 1, 0, 3], dtype=int32)

Through experimenting with the cluster size I decided on 4 clusters. This largely corresponds to the similarity in facies distribution between the wells.

CRAWFORD is most similar to ALEXANDER D and LUKE G U. This will be called cluster 1 (the only ones with facies 1).

STUART is most similar to KIMZEY A and NOLAN. This will be called cluster 2.
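
To tie the cluster labels back to the wells (so the groupings above can be read off directly), a small sketch reusing X and kmeans from the cells above:


In [ ]:
# Sketch: show which well falls into which k-means cluster
pd.DataFrame({'Well Name': X['Well Name'].values, 'cluster': kmeans.labels_})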

Collating the Data:

Based on the regimes we determined above.


In [15]:
# based on kmeans clustering
data=[]
df = training_data0[training_data0['Well Name'] == 'ALEXANDER D'] 
data.append(df)
df = training_data0[training_data0['Well Name'] == 'LUKE G U']  
data.append(df)
Regime_1 = pd.concat(data, axis=0)
print len(Regime_1)

data=[]
df = training_data0[training_data0['Well Name'] == 'KIMZEY A']  
data.append(df)
df = training_data0[training_data0['Well Name'] == 'NOLAN']  
data.append(df)
Regime_2 = pd.concat(data, axis=0)
print len(Regime_2)

data=[]
df = training_data0[training_data0['Well Name'] == 'CHURCHMAN BIBLE']  
data.append(df)
df = training_data0[training_data0['Well Name'] == 'SHRIMPLIN']  
data.append(df)
df = training_data0[training_data0['Well Name'] == 'NEWBY']  
data.append(df)
df = training_data0[training_data0['Well Name'] == 'Recruit F9']  
data.append(df)
Regime_3 = pd.concat(data, axis=0)
print len(Regime_3)

data=[]
df = training_data0[training_data0['Well Name'] == 'SHANKLE'] 
data.append(df)
df = training_data0[training_data0['Well Name'] == 'CROSS H CATTLE']  
data.append(df)
Regime_4 = pd.concat(data, axis=0)
print len(Regime_4)


927
854
1418
950

Split the data into two parts:

From part A we will make the initial (Phase I) predictions.

From part B we will make the final prediction(s).


In [64]:
df0=Regime_1.dropna()
df1 = df0.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
df1a=df0[(np.abs(stats.zscore(df1))<8).all(axis=1)]
df2a=magic(df1a)


y=df2a['Facies'].values
X=df2a.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
# Note the naming: regime1A_train / regime1B_train hold the features of parts A and B,
# while regime1A_test / regime1B_test hold the corresponding facies labels.
regime1A_train, regime1B_train, regime1A_test, regime1B_test = train_test_split(X, y, test_size=0.5, random_state=42)


df0=Regime_2.dropna()
df1 = df0.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
df1a=df0[(np.abs(stats.zscore(df1))<8).all(axis=1)]
df2b=magic(df1a)
y= df2b['Facies'].values
X= df2b.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
regime2A_train, regime2B_train, regime2A_test, regime2B_test= train_test_split(X, y, test_size=0.5, random_state=42)

df0=Regime_3.dropna()
df1 = df0.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
df1a=df0[(np.abs(stats.zscore(df1))<8).all(axis=1)]
df2c=magic(df1a)
y=df2c['Facies'].values
X= df2c.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
regime3A_train, regime3B_train, regime3A_test, regime3B_test = train_test_split(X, y, test_size=0.5, random_state=42)

df0=Regime_4.dropna()
df1 = df0.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
df1a=df0[(np.abs(stats.zscore(df1))<8).all(axis=1)]
df2d=magic(df1a)
y=df2d['Facies'].values
X= df2d.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
regime4A_train, regime4B_train, regime4A_test, regime4B_test = train_test_split(X, y, test_size=0.5, random_state=42)

Phase Ia:

- Create predictions specifically for the most difficult facies.

- At this stage we focus on true positives and false positives only (see the sketch below).
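
Here "TP and FP only" means that these binary picks will later overwrite the stacked prediction (via permute_facies_nr), so what matters is that the flagged samples are true positives rather than false positives; missed positives are less costly. A minimal sketch of that check, with made-up 0/1 vectors:


In [ ]:
# Sketch: count true vs false positives of a binary facies pick (made-up vectors)
y_true_binary = np.array([0, 0, 1, 1, 1, 0, 0, 1, 0, 0])
y_pred_binary = np.array([0, 0, 1, 1, 0, 0, 1, 1, 0, 0])
tn, fp, fn, tp = confusion_matrix(y_true_binary, y_pred_binary).ravel()
print "true positives:", tp, "false positives:", fp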


training for facies 9 specifically



In [22]:
df0 = test_data[test_data['Well Name'] == 'CRAWFORD']  
df1 = df0.drop(['Formation', 'Well Name', 'Depth'], axis=1)
df1a=df0[(np.abs(stats.zscore(df1))<8).all(axis=1)]
blind=magic(df1a)

In [23]:
df1a.head()


Out[23]:
Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
474 A1 LM CRAWFORD 2972.5 49.675 0.845 3.905 11.175 3.265 2 1.000
475 A1 LM CRAWFORD 2973.0 34.435 0.879 3.085 8.175 3.831 2 0.991
476 A1 LM CRAWFORD 2973.5 26.178 0.920 2.615 4.945 4.306 2 0.981
477 A1 LM CRAWFORD 2974.0 19.463 0.967 0.820 3.820 4.578 2 0.972
478 A1 LM CRAWFORD 2974.5 19.260 0.995 0.320 3.630 4.643 2 0.962

In [62]:
features_blind = blind.drop(['Formation', 'Well Name', 'Depth'], axis=1)

#============================================================
df0=training_data0.dropna()
df1 = df0.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
df1a=df0[(np.abs(stats.zscore(df1))<8).all(axis=1)]
all1=magic(df1a)
#X, y = make_balanced_binary(all1, 9,6)
X, y = make_balanced_binary(all1, 9,9)
#============================================================
correct_train=y

clf = RandomForestClassifier(max_depth = 6, n_estimators=600)
clf.fit(X,correct_train)

predicted_blind1 = clf.predict(features_blind)

predicted_regime9=predicted_blind1.copy()


ratio now:
19.0434782609
ratio after:
2.11594202899

training for facies 1 specifically



In [28]:
features_blind = blind.drop(['Formation', 'Well Name', 'Depth'], axis=1)

#============================================================
df0=training_data0.dropna()
df1 = df0.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
df1a=df0[(np.abs(stats.zscore(df1))<8).all(axis=1)]
all1=magic(df1a)
X, y = make_balanced_binary(all1, 1,3)
#============================================================



#=============================================
go_A=StandardScaler().fit_transform(X)
go_blind=StandardScaler().fit_transform(features_blind)
correct_train_A=binarify(y, 1)
                                        

clf = linear_model.LogisticRegression()
clf.fit(go_A,correct_train_A)
predicted_blind1 = clf.predict(go_blind)

clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(go_A,correct_train_A)                                                  
predicted_blind2 = clf.predict(go_blind)

clf = svm.SVC(decision_function_shape='ovo')
clf.fit(go_A,correct_train_A)   
predicted_blind3 = clf.predict(go_blind)

clf = svm.LinearSVC()
clf.fit(go_A,correct_train_A)   
predicted_blind4 = clf.predict(go_blind)



#####################################
# Majority vote: keep a facies-1 pick only where all four classifiers agree
predicted_blind=predicted_blind1+predicted_blind2+predicted_blind3+predicted_blind4
for ii in range(len(predicted_blind)):
    if predicted_blind[ii] > 3:
        predicted_blind[ii]=1
    else:
        predicted_blind[ii]=0 

# Remove isolated picks: a 1 with 0s on both sides within two samples is discarded
# (this relies on the final samples not being picks, otherwise ii+1 / ii+2 would
# index past the end of the array)
for ii in range(len(predicted_blind)):
    if predicted_blind[ii] == 1 and predicted_blind[ii-1] == 0 and predicted_blind[ii+1] == 0:
        predicted_blind[ii]=0
    if predicted_blind[ii] == 1 and predicted_blind[ii-1] == 0 and predicted_blind[ii+2] == 0:
        predicted_blind[ii]=0
    if predicted_blind[ii] == 1 and predicted_blind[ii-2] == 0 and predicted_blind[ii+1] == 0:
        predicted_blind[ii]=0
#####################################    

print "-------"
predicted_regime1=predicted_blind.copy()


ratio now:
11.4594594595
ratio after:
3.81981981982
-------

training for facies 5 specifically


In [30]:
features_blind = blind.drop(['Formation', 'Well Name', 'Depth'], axis=1)

#============================================================
df0=training_data0.dropna()
df1 = df0.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
df1a=df0[(np.abs(stats.zscore(df1))<8).all(axis=1)]
all1=magic(df1a)
X, y = make_balanced_binary(all1, 5,13)
#============================================================

go_A=StandardScaler().fit_transform(X)
go_blind=StandardScaler().fit_transform(features_blind)
correct_train_A=binarify(y, 1)
#=============================================                                        

clf = KNeighborsClassifier(n_neighbors=4,algorithm='brute')
clf.fit(go_A,correct_train_A)
predicted_blind1 = clf.predict(go_blind)

clf = KNeighborsClassifier(n_neighbors=5,leaf_size=10)
clf.fit(go_A,correct_train_A)                                                  
predicted_blind2 = clf.predict(go_blind)

clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(go_A,correct_train_A)   
predicted_blind3 = clf.predict(go_blind)

clf = tree.DecisionTreeClassifier()
clf.fit(go_A,correct_train_A)   
predicted_blind4 = clf.predict(go_blind)

clf = tree.DecisionTreeClassifier()
clf.fit(go_A,correct_train_A)   
predicted_blind5 = clf.predict(go_blind)

clf = tree.DecisionTreeClassifier()
clf.fit(go_A,correct_train_A)    
predicted_blind6 = clf.predict(go_blind)


#####################################
# Vote: keep a facies-5 pick only where at least 5 of the 6 classifiers agree
predicted_blind=predicted_blind1+predicted_blind2+predicted_blind3+predicted_blind4+predicted_blind5+predicted_blind6
for ii in range(len(predicted_blind)):
    if predicted_blind[ii] > 4:
        predicted_blind[ii]=1
    else:
        predicted_blind[ii]=0 
#####################################   
print "-------"


#####################################    

print "-------"
predicted_regime5=predicted_blind.copy()


ratio now:
13.8709677419
ratio after:
1.06699751861
-------
-------

training for facies 7 specifically


In [111]:
features_blind = blind.drop(['Formation', 'Well Name', 'Depth'], axis=1)

#============================================================
df0=training_data0.dropna()
df1 = df0.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
df1a=df0[(np.abs(stats.zscore(df1))<8).all(axis=1)]
all1=magic(df1a)
X, y = make_balanced_binary(all1, 7,13)
#============================================================

go_A=StandardScaler().fit_transform(X)
go_blind=StandardScaler().fit_transform(features_blind)
correct_train_A=binarify(y, 1)
#=============================================                                        

clf = KNeighborsClassifier(n_neighbors=4,algorithm='brute')
clf.fit(go_A,correct_train_A)
predicted_blind1 = clf.predict(go_blind)


clf = KNeighborsClassifier(n_neighbors=5,leaf_size=10)
clf.fit(go_A,correct_train_A)                                                  
predicted_blind2 = clf.predict(go_blind)

clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(go_A,correct_train_A)   
predicted_blind3 = clf.predict(go_blind)

clf = tree.DecisionTreeClassifier()
clf.fit(go_A,correct_train_A)   
predicted_blind4 = clf.predict(go_blind)

clf = tree.DecisionTreeClassifier()
clf.fit(go_A,correct_train_A)   
predicted_blind5 = clf.predict(go_blind)

clf = tree.DecisionTreeClassifier()
clf.fit(go_A,correct_train_A)    
predicted_blind6 = clf.predict(go_blind)


#####################################
# Vote: keep a facies-7 pick only where all 6 classifiers agree
predicted_blind=predicted_blind1+predicted_blind2+predicted_blind3+predicted_blind4+predicted_blind5+predicted_blind6
for ii in range(len(predicted_blind)):
    if predicted_blind[ii] > 5:
        predicted_blind[ii]=1
    else:
        predicted_blind[ii]=0 


#####################################    

print "-------"
predicted_regime7=predicted_blind.copy()
print sum(predicted_regime7)


ratio now:
31.9285714286
ratio after:
2.45604395604
-------
7

Phase Ib:

Making several predictions using dataset A.

Prepare the blind data for serial modelling:


In [1856]:
# For validating against a labelled well (e.g. CHURCHMAN BIBLE) this block can be
# used instead of the test data:
#blindwell='CHURCHMAN BIBLE'

#df0 = training_data0[training_data0['Well Name'] == blindwell]  
#df1 = df0.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
#df1a=df0[(np.abs(stats.zscore(df1))<8).all(axis=1)]
#blind=magic(df1a)
#correct_facies_labels = blind['Facies'].values
#features_blind = blind.drop(['Formation', 'Well Name', 'Depth','Facies'], axis=1)
#pred_blind=0*correct_facies_labels

In [65]:
df0 = test_data[test_data['Well Name'] == 'CRAWFORD']  
df1 = df0.drop(['Formation', 'Well Name', 'Depth'], axis=1)
df1a=df0[(np.abs(stats.zscore(df1))<8).all(axis=1)]
blind=magic(df1a)
pred_blind=0*predicted_regime7

Prepare the training data for serial modelling:

This could be done smarter, but is done manually at this point: the training sets are biased towards the regime the blind data has been classified as.

For CHURCHMAN BIBLE this is regime 3.

For CRAWFORD this is regime 1.

For STUART this is regime 2.


In [66]:
# Bias towards regime 1 (CRAWFORD's regime): it serves both as the main regime and
# as "other1", so its samples are effectively weighted double in the combinations below.
main_regime=regime1A_train
other1=regime1A_train
other2=regime2A_train
other3=regime3A_train

main_test=regime1A_test
other1_test=regime1A_test
other2_test=regime2A_test
other3_test=regime3A_test

tmp2=[regime1B_train, regime2B_train, regime3B_train, regime4B_train] 
go_B= pd.concat(tmp2, axis=0)
correctB=np.concatenate((regime1B_test, regime2B_test, regime3B_test, regime4B_test))

In [76]:
#===================================================
tmp1=[main_regime, other1, other2, other3] 
regime_train1= pd.concat(tmp1, axis=0)
correctA1=np.concatenate((main_test, other1_test, other2_test, other3_test))
#=================================================== 
tmp1=[main_regime, other2, other3] 
regime_train2= pd.concat(tmp1, axis=0)
correctA2=np.concatenate((main_test, other2_test, other3_test))
#===================================================
tmp1=[main_regime, other1, other3] 
regime_train3= pd.concat(tmp1, axis=0)
correctA3=np.concatenate((main_test, other1_test, other3_test))
#=================================================== 
tmp1=[main_regime, other1, other2] 
regime_train4= pd.concat(tmp1, axis=0)
correctA4=np.concatenate((main_test, other1_test, other2_test))
#===================================================
tmp1=[main_regime, other1] 
regime_train5= pd.concat(tmp1, axis=0)
correctA5=np.concatenate((main_test, other1_test))
#===================================================
tmp1=[main_regime, other2] 
regime_train6= pd.concat(tmp1, axis=0)
correctA6=np.concatenate((main_test, other2_test))
#===================================================
tmp1=[main_regime, other3] 
regime_train7= pd.concat(tmp1, axis=0)
correctA7=np.concatenate((main_test, other3_test))
#===================================================
tmp1=[main_regime] 
regime_train8= pd.concat(tmp1, axis=0)
correctA8=main_test

Create several predictions, varying the dataset and the technique


In [84]:
# Initialise the prediction stacks with a row of zeros (dropped again in Phase II)
pred_array=0*correctB
pred_blind=np.zeros(len(features_blind))

print "logistic regression"
clf = linear_model.LogisticRegression(class_weight='balanced')
pred_array, pred_blind=phaseI_model_scaled(regime_train1, correctA1, go_B, clf, pred_array, pred_blind, features_blind)
print "svm"
clf = svm.SVC(decision_function_shape='ovo',class_weight='balanced')
pred_array, pred_blind=phaseI_model_scaled(regime_train2, correctA2, go_B, clf, pred_array, pred_blind, features_blind)
print "svm lin"
clf = svm.LinearSVC(class_weight='balanced')
pred_array, pred_blind=phaseI_model_scaled(regime_train3, correctA3, go_B, clf, pred_array, pred_blind, features_blind)
print "svm3"
clf = svm.SVC(C = 100, cache_size=2400, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=True, random_state=49, shrinking=True,
  tol=0.001, verbose=False)
pred_array, pred_blind=phaseI_model_scaled(regime_train4, correctA4, go_B, clf, pred_array, pred_blind, features_blind)
print "knn0"
clf = KNeighborsClassifier(n_neighbors=8)
pred_array, pred_blind=phaseI_model_scaled(regime_train4, correctA4, go_B, clf, pred_array, pred_blind, features_blind)
print "knn1"
clf = KNeighborsClassifier(n_neighbors=4,algorithm='brute')
pred_array, pred_blind=phaseI_model_scaled(regime_train5, correctA5, go_B, clf, pred_array, pred_blind, features_blind)
print "knn2"
clf = KNeighborsClassifier(n_neighbors=5,leaf_size=10,p=1)
pred_array, pred_blind=phaseI_model_scaled(regime_train6, correctA6, go_B, clf, pred_array, pred_blind, features_blind)
print "knn3"
clf = KNeighborsClassifier(n_neighbors=7,p=1)
pred_array, pred_blind=phaseI_model_scaled(regime_train7, correctA7, go_B, clf, pred_array, pred_blind, features_blind)
print "knn3"
clf = KNeighborsClassifier(n_neighbors=6,p=1)
pred_array, pred_blind=phaseI_model_scaled(regime_train1, correctA1, go_B, clf, pred_array, pred_blind, features_blind)
print "knn4"
clf = KNeighborsClassifier(n_neighbors=8,p=1)
regime_train1a=regime_train1.drop(['PE','GR_2','GR_3','GR_4','GR_5','GR_6','GR_7','PHIND_2','PHIND_3','PHIND_5'], axis=1)
go_Ba=go_B.drop(['PE','GR_2','GR_3','GR_4','GR_5','GR_6','GR_7','PHIND_2','PHIND_3','PHIND_5'], axis=1)
features_blinda=features_blind.drop(['PE','GR_2','GR_3','GR_4','GR_5','GR_6','GR_7','PHIND_2','PHIND_3','PHIND_5'], axis=1)
pred_array, pred_blind=phaseI_model_scaled(regime_train1a, correctA1, go_Ba, clf, pred_array, pred_blind, features_blinda)
print "rf1"
clf = RandomForestClassifier(max_depth = 5, n_estimators=600)
pred_array, pred_blind=phaseI_model(regime_train2, correctA2, go_B, clf, pred_array, pred_blind, features_blind)
print "rf2"
clf = RandomForestClassifier(max_depth = 8, n_estimators=700)
pred_array, pred_blind=phaseI_model(regime_train3, correctA3, go_B, clf, pred_array, pred_blind, features_blind)
print "rf3"
clf = RandomForestClassifier(max_depth = 15, n_estimators=2000)
pred_array, pred_blind=phaseI_model(regime_train4, correctA4, go_B, clf, pred_array, pred_blind, features_blind)
print "rf4"
clf = RandomForestClassifier(max_depth = 7, n_estimators=800)
pred_array, pred_blind=phaseI_model(regime_train5, correctA5, go_B, clf, pred_array, pred_blind, features_blind)
print "rf5"
clf = RandomForestClassifier(max_depth = 10, n_estimators=1200,min_samples_leaf=12)
pred_array, pred_blind=phaseI_model(regime_train6, correctA6, go_B, clf, pred_array, pred_blind, features_blind)
print "rf6"
clf = RandomForestClassifier(max_depth = 12, n_estimators=800,min_samples_leaf=14)
pred_array, pred_blind=phaseI_model(regime_train7, correctA7, go_B, clf, pred_array, pred_blind, features_blind)
print "rf7"
clf= RandomForestClassifier(max_depth = 15, n_estimators=1600,min_samples_leaf=15)
pred_array, pred_blind=phaseI_model(regime_train1, correctA1, go_B, clf, pred_array, pred_blind, features_blind)
print "rf8"
clf= RandomForestClassifier(max_depth = 15, n_estimators=1600,min_samples_leaf=15)
pred_array, pred_blind=phaseI_model(regime_train8, correctA8, go_B, clf, pred_array, pred_blind, features_blind)
print "mlp"
clf= MLPClassifier(activation='logistic', alpha=0.01, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=49, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)
pred_array, pred_blind=phaseI_model_scaled(regime_train1, correctA1, go_B, clf, pred_array, pred_blind, features_blind)
print "gradient boost"
clf = GradientBoostingClassifier()
pred_array, pred_blind=phaseI_model(regime_train1, correctA1, go_B, clf, pred_array, pred_blind, features_blind)


logistic regression
svm
svm lin
svm3
knn0
knn1
knn2
knn3
knn3
knn4
rf1
rf2
rf3
rf4
rf5
rf6
rf7
rf8
mlp
/home/sherlock/anaconda/envs/Python2/lib/python2.7/site-packages/sklearn/neural_network/multilayer_perceptron.py:563: ConvergenceWarning:

Stochastic Optimizer: Maximum iterations reached and the optimization hasn't converged yet.

gradient boost

Phase II:

Stacking the predictions from Phase Ib; new predictions are now made from dataset B.


First prediction of B data without Phase I input:


In [85]:
print len(correctB)


1616

In [86]:
clf = RandomForestClassifier(max_depth = 15, n_estimators=1600,min_samples_leaf=15)
clf.fit(go_B,correctB)
predicted_blind_PHASE_I = clf.predict(features_blind)
#out_f1=metrics.f1_score(correct_facies_labels, predicted_blind_PHASE_I, average = 'micro')
#print "f1 score on the prediction of blind"
#print out_f1

In [89]:
predicted_blind_PHASE_I


Out[89]:
array([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 4, 4, 6, 8,
       8, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 3, 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8,
       8, 8, 8, 8, 6, 6, 6, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7,
       7, 7, 7, 7, 8, 8, 8, 8, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 3, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
       3, 3, 2, 8, 8, 8, 8, 8, 8, 8, 8, 6, 7, 7, 7, 9, 7, 7, 7, 1, 1, 1, 1,
       1, 1, 8, 1, 1, 1, 1, 7, 8, 8, 8, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       4, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 8, 8, 2, 2, 3, 3, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3])

Add the initial predictions as features:


In [87]:
go_B_PHASE_II=go_B.copy()
features_blind_PHASE_II=features_blind.copy()
# Append each Phase-I prediction as an extra feature column; the last row of
# pred_array / pred_blind is the all-zero initialiser and is skipped.
for kk in range(len(pred_array)-1):
    name='prediction_' + str(kk) 
    featB=pred_array[kk,:]
    go_B_PHASE_II[name]=featB
    feat=pred_blind[kk,:]
    features_blind_PHASE_II[name]=feat

Make a new prediction, with the best model on the full dataset B:


In [88]:
clf.fit(go_B_PHASE_II,correctB)
predicted_blind_PHASE_II = clf.predict(features_blind_PHASE_II)
#out_f1=metrics.f1_score(correct_facies_labels, predicted_blind_PHASE_II, average = 'micro')
#print "f1 score on the prediction of blind"
#print out_f1

To do: repeat this with a couple of models and average the predictions.
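
A sketch of what that averaging could look like; the model list and the name predicted_blind_PHASE_II_avg are assumptions, reusing go_B_PHASE_II, correctB and features_blind_PHASE_II from the cells above:


In [ ]:
# Sketch of the to-do: fit a few Phase-II models and take a per-sample majority vote
from scipy.stats import mode

phase_II_models = [
    RandomForestClassifier(max_depth=15, n_estimators=1600, min_samples_leaf=15),
    RandomForestClassifier(max_depth=10, n_estimators=1200, min_samples_leaf=12),
    GradientBoostingClassifier(),
]
preds = []
for model in phase_II_models:
    model.fit(go_B_PHASE_II, correctB)
    preds.append(model.predict(features_blind_PHASE_II))
predicted_blind_PHASE_II_avg = mode(np.vstack(preds), axis=0)[0].ravel()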


Permute facies based on earlier prediction:


In [112]:
print(sum(predicted_regime5))
predicted_blind_PHASE_IIa=permute_facies_nr(predicted_regime5, predicted_blind_PHASE_II, 5)
print(sum(predicted_regime7))
predicted_blind_PHASE_IIb=permute_facies_nr(predicted_regime7, predicted_blind_PHASE_IIa, 7)
print(sum(predicted_regime1))
predicted_blind_PHASE_IIc=permute_facies_nr(predicted_regime1, predicted_blind_PHASE_IIb, 1)
print(sum(predicted_regime9))
predicted_blind_PHASE_IId=permute_facies_nr(predicted_regime9, predicted_blind_PHASE_IIc, 9)


4
7
7
0

In [113]:
predicted_CRAWFORD=predicted_blind_PHASE_IId
predicted_CRAWFORD


Out[113]:
array([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 5, 5, 7, 7, 7, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8,
       8, 8, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 3, 3, 6, 8, 8, 6, 6, 6, 6, 6, 6, 6, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       8, 8, 8, 8, 8, 8, 6, 6, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
       1, 2, 2, 2, 2, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7,
       7, 7, 7, 7, 8, 8, 8, 8, 4, 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 3, 3, 2, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 2,
       2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 8, 8, 8, 8,
       8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 3, 3, 2, 2, 2, 2, 2, 2, 3, 3,
       2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, 5, 7, 7, 7, 7, 7, 7, 7, 3, 3, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 7, 7, 8, 6, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       3, 3, 3, 3, 4, 8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 4, 3, 3, 3, 3, 3, 2, 2,
       2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
