Facies classification using Machine Learning

Bird Team: PG+AC


In [1]:
%matplotlib inline
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable

from pandas import set_option
set_option("display.max_rows", 10)
pd.options.mode.chained_assignment = None

In [34]:
filename = './../training_data.csv'
training_data = pd.read_csv(filename)
training_data.head(10)


Out[34]:
Facies Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
0 3 A1 SH SHRIMPLIN 2793.0 77.45 0.664 9.9 11.915 4.6 1 1.000
1 3 A1 SH SHRIMPLIN 2793.5 78.26 0.661 14.2 12.565 4.1 1 0.979
2 3 A1 SH SHRIMPLIN 2794.0 79.05 0.658 14.8 13.050 3.6 1 0.957
3 3 A1 SH SHRIMPLIN 2794.5 86.10 0.655 13.9 13.115 3.5 1 0.936
4 3 A1 SH SHRIMPLIN 2795.0 74.58 0.647 13.5 13.300 3.4 1 0.915
5 3 A1 SH SHRIMPLIN 2795.5 73.97 0.636 14.0 13.385 3.6 1 0.894
6 3 A1 SH SHRIMPLIN 2796.0 73.72 0.630 15.6 13.930 3.7 1 0.872
7 3 A1 SH SHRIMPLIN 2796.5 75.65 0.625 16.5 13.920 3.5 1 0.830
8 3 A1 SH SHRIMPLIN 2797.0 73.79 0.624 16.2 13.980 3.4 1 0.809
9 3 A1 SH SHRIMPLIN 2797.5 76.89 0.615 16.9 14.220 3.5 1 0.787

In [3]:
set(training_data["Well Name"])


Out[3]:
{'CHURCHMAN BIBLE',
 'CROSS H CATTLE',
 'LUKE G U',
 'NEWBY',
 'NOLAN',
 'Recruit F9',
 'SHANKLE',
 'SHRIMPLIN'}

In [35]:
well_data = pd.read_csv('./../validation_data_nofacies.csv')
well_data.head(10)


Out[35]:
Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS
0 A1 SH STUART 2808.0 66.276 0.630 3.3 10.65 3.591 1 1.000
1 A1 SH STUART 2808.5 77.252 0.585 6.5 11.95 3.341 1 0.978
2 A1 SH STUART 2809.0 82.899 0.566 9.4 13.60 3.064 1 0.956
3 A1 SH STUART 2809.5 80.671 0.593 9.5 13.25 2.977 1 0.933
4 A1 SH STUART 2810.0 75.971 0.638 8.7 12.35 3.020 1 0.911
5 A1 SH STUART 2810.5 73.955 0.667 6.9 12.25 3.086 1 0.889
6 A1 SH STUART 2811.0 77.962 0.674 6.5 12.45 3.092 1 0.867
7 A1 SH STUART 2811.5 83.894 0.667 6.3 12.65 3.123 1 0.844
8 A1 SH STUART 2812.0 84.424 0.653 6.7 13.05 3.121 1 0.822
9 A1 SH STUART 2812.5 83.160 0.642 7.3 12.95 3.127 1 0.800

In [5]:
set(well_data["Well Name"])


Out[5]:
{'CRAWFORD', 'STUART'}

In [36]:
# concat train and test for processing 
well_data["origin"] = 'test'
training_data["origin"] = 'train'
df = pd.concat([well_data,training_data],axis=0,ignore_index=True)[list(training_data.columns)]
df['Well Name'] = df['Well Name'].astype('category')
df.head(10)


Out[36]:
Facies Formation Well Name Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS origin
0 NaN A1 SH STUART 2808.0 66.276 0.630 3.3 10.65 3.591 1 1.000 test
1 NaN A1 SH STUART 2808.5 77.252 0.585 6.5 11.95 3.341 1 0.978 test
2 NaN A1 SH STUART 2809.0 82.899 0.566 9.4 13.60 3.064 1 0.956 test
3 NaN A1 SH STUART 2809.5 80.671 0.593 9.5 13.25 2.977 1 0.933 test
4 NaN A1 SH STUART 2810.0 75.971 0.638 8.7 12.35 3.020 1 0.911 test
5 NaN A1 SH STUART 2810.5 73.955 0.667 6.9 12.25 3.086 1 0.889 test
6 NaN A1 SH STUART 2811.0 77.962 0.674 6.5 12.45 3.092 1 0.867 test
7 NaN A1 SH STUART 2811.5 83.894 0.667 6.3 12.65 3.123 1 0.844 test
8 NaN A1 SH STUART 2812.0 84.424 0.653 6.7 13.05 3.121 1 0.822 test
9 NaN A1 SH STUART 2812.5 83.160 0.642 7.3 12.95 3.127 1 0.800 test

In [37]:
# add some features based on the well data. 

# nb points : can be correlated with how soft soil is ? 
print("session")
sessionsize = df.groupby(["Well Name",'Formation']).size().reset_index()
sessionsize.columns =  ["Well Name",'Formation','formation_size']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

# depth : 
print("depth")
sessionsize = df.groupby(["Well Name",'Formation'])["Depth"].min().reset_index()
sessionsize.columns =  ["Well Name",'Formation','minimum_depth']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

sessionsize = df.groupby(["Well Name",'Formation'])["Depth"].max().reset_index()
sessionsize.columns =  ["Well Name",'Formation','maximum_depth']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

df['formation_depth'] = df["maximum_depth"] - df["minimum_depth"]

df["soft_indic"] = df['formation_depth'] / df["formation_size"]

# add avgs of feat
print("add avgs of feat")
list_to_avg = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_avg : 
    df[val + "_min"] = df.groupby(["Well Name",'Formation'])[val].transform(np.min)
    df[val + "_max"] = df.groupby(["Well Name",'Formation'])[val].transform(np.max)
    df[val + "_mean"] = df.groupby(["Well Name",'Formation'])[val].transform(np.mean)
    df[val + "_var"] = df.groupby(["Well Name",'Formation'])[val].transform(np.var) 

# add distances feat. = an attempt at regulariation.
print("add distances feat.")
for val in list_to_avg : 
    df[val + "_min_dist"] = df[val] -df[val + "_min"]
    df[val + "_max_dist"] =  df[val] -df[val + "_max"]
    df[val + "_mean_dist"] =  df[val] -df[val + "_mean"]


session
depth
add avgs of feat
add distances feat.

In [38]:
# add lag and lead !
print("lag lead")
list_to_lag = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_lag:
    for lag in range(1,11):
        df[val+'_lag_'+str(lag)]=df[val]-df.groupby("Well Name")[val].shift(periods=lag)
        df[val+'_lead_'+str(lag)]=df[val]-df.groupby("Well Name")[val].shift(periods=-lag)

# adding some Formation lag and lead. 
for lag in range(1,3):
    df['Formation'+'_lag_'+str(lag)]=df.groupby("Well Name")['Formation'].shift(periods=lag)
    df['Formation'+'_lead_'+str(lag)]=df.groupby("Well Name")['Formation'].shift(periods=-lag)
    df['Formation'+'_lag_'+str(lag) + 'equal'] = (df['Formation'+'_lag_'+str(lag)] == df["Formation"]).astype(int)
    df['Formation'+'_lead_'+str(lag) + 'equal'] = (df['Formation'+'_lead_'+str(lag)] == df["Formation"]).astype(int)


lag lead

In [32]:
type(df)


Out[32]:
NoneType

In [39]:
#df = df.fillna(-9999)
df = df.fillna(method="bfill")
df = df.fillna(method="ffill")

In [40]:
df.shape


Out[40]:
(4062, 241)

In [41]:
[c for c in df.columns if "Formation" in c]


Out[41]:
['Formation',
 'Formation_lag_1',
 'Formation_lead_1',
 'Formation_lag_1equal',
 'Formation_lead_1equal',
 'Formation_lag_2',
 'Formation_lead_2',
 'Formation_lag_2equal',
 'Formation_lead_2equal']

In [59]:
tokeep =['Facies','origin','Formation','Well Name','Depth','GR','ILD_log10','DeltaPHI','PHIND','PE','NM_M','RELPOS']
nums = ['Depth','GR','ILD_log10','DeltaPHI','PHIND','PE','NM_M','RELPOS']
tokeep = tokeep + [x+'_lag_1' for x in nums] +[x+'_lead_1' for x in nums]
df = df[tokeep]

In [60]:
# count vectorizer formation
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

counts = cv.fit_transform(df['Formation'].values)

In [61]:
cols = cv.get_feature_names()

In [62]:
counts = pd.DataFrame(counts.toarray(),columns = cols)
df = df.drop('Formation',axis = 1)
df = pd.concat([df,counts],axis=1)
df.shape


Out[62]:
(4062, 35)

In [44]:
df['Formation']


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-44-edfc602003cf> in <module>()
----> 1 df['Formation']

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   2069         result = self._constructor(self._data.get(key))
   2070         if result.columns.is_unique:
-> 2071             result = result[key]
   2072 
   2073         return result

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1384         res = cache.get(item)
   1385         if res is None:
-> 1386             values = self._data.get(item)
   1387             res = self._box_item_values(item, values)
   1388             cache[item] = res

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   3539 
   3540             if not isnull(item):
-> 3541                 loc = self.items.get_loc(item)
   3542             else:
   3543                 indexer = np.arange(len(self.items))[isnull(self.items)]

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/indexes/base.pyc in get_loc(self, key, method, tolerance)
   2134                 return self._engine.get_loc(key)
   2135             except KeyError:
-> 2136                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2137 
   2138         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4443)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4289)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13733)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13687)()

KeyError: 'Formation'

In [43]:
from sklearn.feature_extraction.text import CountVectorizer
list_formation = ['Formation','Formation_lag_1','Formation_lead_1','Formation_lag_2','Formation_lead_2']
for l in list_formation:
    cv = CountVectorizer()
    counts = cv.fit_transform(df[l].values)
    cols = cv.get_feature_names()
    counts = pd.DataFrame(counts.toarray(),columns = cols)
    df = df.drop(l,axis = 1)
    df = pd.concat([df,counts],axis=1)


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-43-bb83c4d92138> in <module>()
      7 for l in list_formation:
      8     cv = CountVectorizer()
----> 9     counts = cv.fit_transform(df[l].values)
     10     cols = cv.get_feature_names()
     11     counts = pd.DataFrame(counts.toarray(),columns = cols)

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   2069         result = self._constructor(self._data.get(key))
   2070         if result.columns.is_unique:
-> 2071             result = result[key]
   2072 
   2073         return result

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1384         res = cache.get(item)
   1385         if res is None:
-> 1386             values = self._data.get(item)
   1387             res = self._box_item_values(item, values)
   1388             cache[item] = res

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   3539 
   3540             if not isnull(item):
-> 3541                 loc = self.items.get_loc(item)
   3542             else:
   3543                 indexer = np.arange(len(self.items))[isnull(self.items)]

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/indexes/base.pyc in get_loc(self, key, method, tolerance)
   2134                 return self._engine.get_loc(key)
   2135             except KeyError:
-> 2136                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2137 
   2138         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4443)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4289)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13733)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13687)()

KeyError: 'Formation'

CV performance


In [63]:
# params
max_depth = 8 
n_estimators = 2000
clf = RandomForestClassifier(max_depth = max_depth,n_estimators=n_estimators)

In [65]:
ytrain = df[(df['origin']=='train')&(df['Well Name']<>'CHURCHMAN BIBLE')]['Facies']
yvalid = df[(df['origin']=='train')&(df['Well Name']=='CHURCHMAN BIBLE')]['Facies']
xtrain = df[(df['origin']=='train')&(df['Well Name']<>'CHURCHMAN BIBLE')].drop(['Well Name','origin','Facies'],axis=1)
xvalid = df[(df['origin']=='train')&(df['Well Name']=='CHURCHMAN BIBLE')].drop(['Well Name','origin','Facies'],axis=1)

In [66]:
clf.fit(xtrain,ytrain)


Out[66]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [68]:
preds = clf.predict(xvalid)

In [69]:
from sklearn.metrics import classification_report
print classification_report(yvalid, preds)


             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00         8
        2.0       0.66      0.62      0.64        56
        3.0       0.61      0.71      0.65        51
        4.0       0.33      0.69      0.45        13
        5.0       0.54      0.67      0.60        30
        6.0       0.58      0.68      0.63        87
        7.0       0.00      0.00      0.00        34
        8.0       0.55      0.48      0.51        75
        9.0       0.81      1.00      0.89        50

avg / total       0.55      0.61      0.57       404

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

Apply to test


In [70]:
# this time let's use all the training set 
ytrain = df[(df['origin']=='train')]['Facies']
yvalid = df[(df['origin']=='test')]['Facies']
xtrain = df[(df['origin']=='train')].drop(['Well Name','origin','Facies'],axis=1)
xvalid = df[(df['origin']=='test')].drop(['Well Name','origin','Facies'],axis=1)

In [71]:
clf.fit(xtrain,ytrain)


Out[71]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [73]:
preds = clf.predict(xvalid.values)

In [173]:
# preds

In [74]:
xvalid['Facies']=preds
xvalid.to_csv('XmasPreds.csv')