Facies classification using Machine Learning

Bird Team: PG+AC



In [1]:

    
%matplotlib inline
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable

from pandas import set_option
set_option("display.max_rows", 10)
pd.options.mode.chained_assignment = None



In [34]:

    
filename = './../training_data.csv'
training_data = pd.read_csv(filename)
training_data.head(10)



In [3]:

    
set(training_data["Well Name"])









    Out[3]:





{'CHURCHMAN BIBLE',
 'CROSS H CATTLE',
 'LUKE G U',
 'NEWBY',
 'NOLAN',
 'Recruit F9',
 'SHANKLE',
 'SHRIMPLIN'}



In [35]:

    
well_data = pd.read_csv('./../validation_data_nofacies.csv')
well_data.head(10)



In [5]:

    
set(well_data["Well Name"])









    Out[5]:





{'CRAWFORD', 'STUART'}



In [36]:

    
# concat train and test for processing 
well_data["origin"] = 'test'
training_data["origin"] = 'train'
df = pd.concat([well_data,training_data],axis=0,ignore_index=True)[list(training_data.columns)]
df['Well Name'] = df['Well Name'].astype('category')
df.head(10)



In [37]:

    
# add some features based on the well data. 

# nb points : can be correlated with how soft soil is ? 
print("session")
sessionsize = df.groupby(["Well Name",'Formation']).size().reset_index()
sessionsize.columns =  ["Well Name",'Formation','formation_size']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

# depth : 
print("depth")
sessionsize = df.groupby(["Well Name",'Formation'])["Depth"].min().reset_index()
sessionsize.columns =  ["Well Name",'Formation','minimum_depth']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

sessionsize = df.groupby(["Well Name",'Formation'])["Depth"].max().reset_index()
sessionsize.columns =  ["Well Name",'Formation','maximum_depth']
df = pd.merge(df,sessionsize,how='left',on = ["Well Name",'Formation'])

df['formation_depth'] = df["maximum_depth"] - df["minimum_depth"]

df["soft_indic"] = df['formation_depth'] / df["formation_size"]

# add avgs of feat
print("add avgs of feat")
list_to_avg = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_avg : 
    df[val + "_min"] = df.groupby(["Well Name",'Formation'])[val].transform(np.min)
    df[val + "_max"] = df.groupby(["Well Name",'Formation'])[val].transform(np.max)
    df[val + "_mean"] = df.groupby(["Well Name",'Formation'])[val].transform(np.mean)
    df[val + "_var"] = df.groupby(["Well Name",'Formation'])[val].transform(np.var) 

# add distances feat. = an attempt at regulariation.
print("add distances feat.")
for val in list_to_avg : 
    df[val + "_min_dist"] = df[val] -df[val + "_min"]
    df[val + "_max_dist"] =  df[val] -df[val + "_max"]
    df[val + "_mean_dist"] =  df[val] -df[val + "_mean"]









    



session
depth
add avgs of feat
add distances feat.



In [38]:

    
# add lag and lead !
print("lag lead")
list_to_lag = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for val in list_to_lag:
    for lag in range(1,11):
        df[val+'_lag_'+str(lag)]=df[val]-df.groupby("Well Name")[val].shift(periods=lag)
        df[val+'_lead_'+str(lag)]=df[val]-df.groupby("Well Name")[val].shift(periods=-lag)

# adding some Formation lag and lead. 
for lag in range(1,3):
    df['Formation'+'_lag_'+str(lag)]=df.groupby("Well Name")['Formation'].shift(periods=lag)
    df['Formation'+'_lead_'+str(lag)]=df.groupby("Well Name")['Formation'].shift(periods=-lag)
    df['Formation'+'_lag_'+str(lag) + 'equal'] = (df['Formation'+'_lag_'+str(lag)] == df["Formation"]).astype(int)
    df['Formation'+'_lead_'+str(lag) + 'equal'] = (df['Formation'+'_lead_'+str(lag)] == df["Formation"]).astype(int)









    



lag lead



In [32]:

    
type(df)









    Out[32]:





NoneType



In [39]:

    
#df = df.fillna(-9999)
df = df.fillna(method="bfill")
df = df.fillna(method="ffill")



In [40]:

    
df.shape









    Out[40]:





(4062, 241)



In [41]:

    
[c for c in df.columns if "Formation" in c]









    Out[41]:





['Formation',
 'Formation_lag_1',
 'Formation_lead_1',
 'Formation_lag_1equal',
 'Formation_lead_1equal',
 'Formation_lag_2',
 'Formation_lead_2',
 'Formation_lag_2equal',
 'Formation_lead_2equal']



In [59]:

    
tokeep =['Facies','origin','Formation','Well Name','Depth','GR','ILD_log10','DeltaPHI','PHIND','PE','NM_M','RELPOS']
nums = ['Depth','GR','ILD_log10','DeltaPHI','PHIND','PE','NM_M','RELPOS']
tokeep = tokeep + [x+'_lag_1' for x in nums] +[x+'_lead_1' for x in nums]
df = df[tokeep]



In [60]:

    
# count vectorizer formation
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

counts = cv.fit_transform(df['Formation'].values)



In [61]:

    
cols = cv.get_feature_names()



In [62]:

    
counts = pd.DataFrame(counts.toarray(),columns = cols)
df = df.drop('Formation',axis = 1)
df = pd.concat([df,counts],axis=1)
df.shape









    Out[62]:





(4062, 35)



In [44]:

    
df['Formation']









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-44-edfc602003cf> in <module>()
----> 1 df['Formation']

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   2069         result = self._constructor(self._data.get(key))
   2070         if result.columns.is_unique:
-> 2071             result = result[key]
   2072 
   2073         return result

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1384         res = cache.get(item)
   1385         if res is None:
-> 1386             values = self._data.get(item)
   1387             res = self._box_item_values(item, values)
   1388             cache[item] = res

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   3539 
   3540             if not isnull(item):
-> 3541                 loc = self.items.get_loc(item)
   3542             else:
   3543                 indexer = np.arange(len(self.items))[isnull(self.items)]

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/indexes/base.pyc in get_loc(self, key, method, tolerance)
   2134                 return self._engine.get_loc(key)
   2135             except KeyError:
-> 2136                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2137 
   2138         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4443)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4289)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13733)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13687)()

KeyError: 'Formation'



In [43]:

    
from sklearn.feature_extraction.text import CountVectorizer
list_formation = ['Formation','Formation_lag_1','Formation_lead_1','Formation_lag_2','Formation_lead_2']
for l in list_formation:
    cv = CountVectorizer()
    counts = cv.fit_transform(df[l].values)
    cols = cv.get_feature_names()
    counts = pd.DataFrame(counts.toarray(),columns = cols)
    df = df.drop(l,axis = 1)
    df = pd.concat([df,counts],axis=1)









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-43-bb83c4d92138> in <module>()
      7 for l in list_formation:
      8     cv = CountVectorizer()
----> 9     counts = cv.fit_transform(df[l].values)
     10     cols = cv.get_feature_names()
     11     counts = pd.DataFrame(counts.toarray(),columns = cols)

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   2069         result = self._constructor(self._data.get(key))
   2070         if result.columns.is_unique:
-> 2071             result = result[key]
   2072 
   2073         return result

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1384         res = cache.get(item)
   1385         if res is None:
-> 1386             values = self._data.get(item)
   1387             res = self._box_item_values(item, values)
   1388             cache[item] = res

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   3539 
   3540             if not isnull(item):
-> 3541                 loc = self.items.get_loc(item)
   3542             else:
   3543                 indexer = np.arange(len(self.items))[isnull(self.items)]

/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/pandas/indexes/base.pyc in get_loc(self, key, method, tolerance)
   2134                 return self._engine.get_loc(key)
   2135             except KeyError:
-> 2136                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2137 
   2138         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4443)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4289)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13733)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13687)()

KeyError: 'Formation'

CV performance



In [63]:

    
# params
max_depth = 8 
n_estimators = 2000
clf = RandomForestClassifier(max_depth = max_depth,n_estimators=n_estimators)



In [65]:

    
ytrain = df[(df['origin']=='train')&(df['Well Name']<>'CHURCHMAN BIBLE')]['Facies']
yvalid = df[(df['origin']=='train')&(df['Well Name']=='CHURCHMAN BIBLE')]['Facies']
xtrain = df[(df['origin']=='train')&(df['Well Name']<>'CHURCHMAN BIBLE')].drop(['Well Name','origin','Facies'],axis=1)
xvalid = df[(df['origin']=='train')&(df['Well Name']=='CHURCHMAN BIBLE')].drop(['Well Name','origin','Facies'],axis=1)



In [66]:

    
clf.fit(xtrain,ytrain)









    Out[66]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)



In [68]:

    
preds = clf.predict(xvalid)



In [69]:

    
from sklearn.metrics import classification_report
print classification_report(yvalid, preds)









    



             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00         8
        2.0       0.66      0.62      0.64        56
        3.0       0.61      0.71      0.65        51
        4.0       0.33      0.69      0.45        13
        5.0       0.54      0.67      0.60        30
        6.0       0.58      0.68      0.63        87
        7.0       0.00      0.00      0.00        34
        8.0       0.55      0.48      0.51        75
        9.0       0.81      1.00      0.89        50

avg / total       0.55      0.61      0.57       404







    



/Users/alexandrecombessie/anaconda2/lib/python2.7/site-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

Apply to test



In [70]:

    
# this time let's use all the training set 
ytrain = df[(df['origin']=='train')]['Facies']
yvalid = df[(df['origin']=='test')]['Facies']
xtrain = df[(df['origin']=='train')].drop(['Well Name','origin','Facies'],axis=1)
xvalid = df[(df['origin']=='test')].drop(['Well Name','origin','Facies'],axis=1)



In [71]:

    
clf.fit(xtrain,ytrain)









    Out[71]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)



In [73]:

    
preds = clf.predict(xvalid.values)



In [173]:

    
# preds



In [74]:

    
xvalid['Facies']=preds
xvalid.to_csv('XmasPreds.csv')

	Facies	Formation	Well Name	Depth	GR	ILD_log10	DeltaPHI	PHIND	PE	NM_M	RELPOS
0	3	A1 SH	SHRIMPLIN	2793.0	77.45	0.664	9.9	11.915	4.6	1	1.000
1	3	A1 SH	SHRIMPLIN	2793.5	78.26	0.661	14.2	12.565	4.1	1	0.979
2	3	A1 SH	SHRIMPLIN	2794.0	79.05	0.658	14.8	13.050	3.6	1	0.957
3	3	A1 SH	SHRIMPLIN	2794.5	86.10	0.655	13.9	13.115	3.5	1	0.936
4	3	A1 SH	SHRIMPLIN	2795.0	74.58	0.647	13.5	13.300	3.4	1	0.915
5	3	A1 SH	SHRIMPLIN	2795.5	73.97	0.636	14.0	13.385	3.6	1	0.894
6	3	A1 SH	SHRIMPLIN	2796.0	73.72	0.630	15.6	13.930	3.7	1	0.872
7	3	A1 SH	SHRIMPLIN	2796.5	75.65	0.625	16.5	13.920	3.5	1	0.830
8	3	A1 SH	SHRIMPLIN	2797.0	73.79	0.624	16.2	13.980	3.4	1	0.809
9	3	A1 SH	SHRIMPLIN	2797.5	76.89	0.615	16.9	14.220	3.5	1	0.787

	Formation	Well Name	Depth	GR	ILD_log10	DeltaPHI	PHIND	PE	NM_M	RELPOS
0	A1 SH	STUART	2808.0	66.276	0.630	3.3	10.65	3.591	1	1.000
1	A1 SH	STUART	2808.5	77.252	0.585	6.5	11.95	3.341	1	0.978
2	A1 SH	STUART	2809.0	82.899	0.566	9.4	13.60	3.064	1	0.956
3	A1 SH	STUART	2809.5	80.671	0.593	9.5	13.25	2.977	1	0.933
4	A1 SH	STUART	2810.0	75.971	0.638	8.7	12.35	3.020	1	0.911
5	A1 SH	STUART	2810.5	73.955	0.667	6.9	12.25	3.086	1	0.889
6	A1 SH	STUART	2811.0	77.962	0.674	6.5	12.45	3.092	1	0.867
7	A1 SH	STUART	2811.5	83.894	0.667	6.3	12.65	3.123	1	0.844
8	A1 SH	STUART	2812.0	84.424	0.653	6.7	13.05	3.121	1	0.822
9	A1 SH	STUART	2812.5	83.160	0.642	7.3	12.95	3.127	1	0.800

	Facies	Formation	Well Name	Depth	GR	ILD_log10	DeltaPHI	PHIND	PE	NM_M	RELPOS	origin
0	NaN	A1 SH	STUART	2808.0	66.276	0.630	3.3	10.65	3.591	1	1.000	test
1	NaN	A1 SH	STUART	2808.5	77.252	0.585	6.5	11.95	3.341	1	0.978	test
2	NaN	A1 SH	STUART	2809.0	82.899	0.566	9.4	13.60	3.064	1	0.956	test
3	NaN	A1 SH	STUART	2809.5	80.671	0.593	9.5	13.25	2.977	1	0.933	test
4	NaN	A1 SH	STUART	2810.0	75.971	0.638	8.7	12.35	3.020	1	0.911	test
5	NaN	A1 SH	STUART	2810.5	73.955	0.667	6.9	12.25	3.086	1	0.889	test
6	NaN	A1 SH	STUART	2811.0	77.962	0.674	6.5	12.45	3.092	1	0.867	test
7	NaN	A1 SH	STUART	2811.5	83.894	0.667	6.3	12.65	3.123	1	0.844	test
8	NaN	A1 SH	STUART	2812.0	84.424	0.653	6.7	13.05	3.121	1	0.822	test
9	NaN	A1 SH	STUART	2812.5	83.160	0.642	7.3	12.95	3.127	1	0.800	test