In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
filenames = [
"7_Well239.xlsx","8_Well425.xlsx","9_Well445.xlsx","10_Well451.xlsx","11_Well496.xlsx",
"12_Well510.xlsx"
];
train_files = filenames[:5]

In [3]:
COLUMNS = ["DEPTH","RHOB","NPHI","GR","DT"]
FEATURES = ["RHOB","NPHI","GR"]

In [4]:
df = pd.DataFrame()

In [5]:
for item in train_files:
    cur_df = pd.read_excel('MTP/'+item, 'Sheet1', index_col=None)
    df = df.append(cur_df[COLUMNS],ignore_index=True)

In [6]:
print df.shape
df = df[(df.DT!=-999.25) & (df.RHOB!=-999.25) & (df.NPHI!=-999.25) & (df.GR!=-999.25)]
print df.shape


(14143, 5)
(8809, 5)

In [7]:
df.head(5)


Out[7]:
DEPTH RHOB NPHI GR DT
3 2775.5088 2.435110 0.346101 40.670404 98.957324
4 2775.6612 2.445212 0.347706 41.349963 99.668471
5 2775.8136 2.452603 0.352515 42.251214 100.067070
6 2775.9660 2.453797 0.361714 41.875382 100.234240
7 2776.1184 2.447196 0.371803 41.119991 100.398700

In [8]:
df.head(5)


Out[8]:
DEPTH RHOB NPHI GR DT
3 2775.5088 2.435110 0.346101 40.670404 98.957324
4 2775.6612 2.445212 0.347706 41.349963 99.668471
5 2775.8136 2.452603 0.352515 42.251214 100.067070
6 2775.9660 2.453797 0.361714 41.875382 100.234240
7 2776.1184 2.447196 0.371803 41.119991 100.398700

In [9]:
df = df.sort_values(by="DEPTH")

In [10]:
df.head(10)


Out[10]:
DEPTH RHOB NPHI GR DT
3535 2673.7056 2.360000 0.430734 64.326297 105.15000
3536 2673.8580 2.359511 0.437368 60.221711 105.15000
3537 2674.0104 2.323572 0.458727 62.227375 105.15000
3538 2674.1628 2.306771 0.489539 62.542708 105.15000
3539 2674.3152 2.304319 0.482252 61.706956 105.15000
3540 2674.4676 2.308664 0.459852 59.877785 105.84152
3541 2674.6200 2.264486 0.437299 58.674358 106.09804
3542 2674.7724 2.262016 0.423855 58.576253 103.32205
3543 2674.9248 2.273700 0.419237 55.232853 103.59746
3544 2675.0772 2.267772 0.424059 53.800148 103.55041

In [11]:
base = df.iloc[0,0]
end = float(df.tail(1)["DEPTH"])

In [12]:
depth_coeff = []

In [13]:
from sklearn import linear_model

start = base
last_end = 0.0

while start < end:
    
    cur_df = df[(df.DEPTH<=(start+10)) & (df.DEPTH>(last_end))]
    
    x_train = cur_df[FEATURES]
    y_train = cur_df["DT"]
    
    clf = linear_model.LinearRegression(normalize=True)
    clf.fit(x_train,y_train)
    depth_coeff.append(clf)
    
    
    last_end = start
    start = start+10

In [14]:
def get_index(depth):
    index=-1
    start=base
    while depth>start:
        index = index+1
        start += 10
    
    return index

Testing


In [15]:
test_file = filenames[5]

In [16]:
test_file


Out[16]:
'12_Well510.xlsx'

In [17]:
test_df = pd.read_excel('MTP/'+test_file, 'Sheet1', index_col=None)
test_df = test_df[COLUMNS]
test_df = test_df[(test_df.DT!=-999.25) & (test_df.RHOB!=-999.25) & (test_df.NPHI!=-999.25) & (test_df.GR!=-999.25)]

In [18]:
y_true = []
y_pred = []
for index,row in test_df.iterrows():
    
    cur_depth = row["DEPTH"]
    
    x_test = row[FEATURES]
    y_actual = row["DT"]
    
    y_true.append(y_actual)
    y_pred.append(clf.predict(x_test)[0])
    
    index = get_index(cur_depth)
    clf = depth_coeff[index]
    
#     print clf.predict(x_test),y_actual

R-squared metric


In [19]:
from sklearn.metrics import r2_score
r2_score(y_true, y_pred)


Out[19]:
0.051253988808328654

In [20]:
rows = []
for clf in depth_coeff:
    row = []
    for item in clf.coef_:
        row.append(item)
    row.append(clf.intercept_)
    rows.append(row)

In [21]:
df = pd.DataFrame()
df = df.append(rows)
df.to_csv('coefficients.csv')

In [ ]: