In [136]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist, pdist
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

Data Collection


In [125]:
sample_data = pd.read_csv("data/sample_data.csv")
costs_data = pd.read_csv("data/costs_data.csv")
auction_parcels = pd.read_csv("data/auction_parcels.csv")
elevation_data = pd.read_csv("data/elevation_data.csv")

In [126]:
sample_data[["gold_available"]].head()


Out[126]:
gold_available
0 3901.517878
1 17523.530328
2 4537.823944
3 21601.105930
4 21186.862192

Data Preprocessing and exploration

We first enriched the sample data with the elevation feature, then we mapped the continuous values to one of the 2 x 3 elevation categories for fixed and variable costs:

Elevation for Fixed Costs

  • Low Elevation: Elevation < 0m
  • Medium Elevation: 0m < Elevation <= 500m
  • High Elevation: Elevation > 500m

Elevation for Variable Costs

  • Low Elevation: Elevation < 0m
  • Medium Elevation: 0m < Elevation <= 700m
  • High Elevation: Elevation > 700m

In [127]:
elevation = elevation_data.drop(["Easting","Northing"],axis = 1)
sample = sample_data.merge(elevation, on="parcel_id")

In [128]:
def map_to_fixed_elevation(row, threshold): 
    if row["elevation"]  < 0: 
        return 0 
    elif row["elevation"] > 0 and row["elevation"]  <= threshold: 
        return 1 
    elif row["elevation"] > threshold: 
        return 2

In [129]:
sample["fixed_cost"] = sample.apply(lambda row: map_to_fixed_elevation(row, 500), axis = 1)
sample["variable_cost"] = sample.apply(lambda row: map_to_fixed_elevation(row, 700), axis = 1)
sample.describe()


Out[129]:
parcel_id Easting Northing Pyerite Mexallon Tritanium Megacyte Nocxium Isogen Veldspar ... Spudumain Gneiss Arkonor Mercoxit Bistot Crokite gold_available elevation fixed_cost variable_cost
count 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 ... 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000
mean 11261.941400 74.891400 75.077000 114.690044 108.480667 107.407900 111.709613 112.205520 114.327617 105.865743 ... 146.831750 113.603848 106.709769 111.338126 112.268864 108.443991 12374.911196 409.016811 1.215000 1.075200
std 6425.913782 43.521685 42.837907 22.387161 6.932708 9.674488 18.352131 9.765217 17.217109 5.258676 ... 21.673469 11.787317 7.383663 7.666121 11.790360 5.671461 6067.684876 503.698083 0.771034 0.703736
min 4.000000 0.500000 0.500000 19.856209 84.861319 66.911702 29.903756 75.707857 22.157819 91.864452 ... 51.141385 73.248565 76.066022 86.084987 62.569692 87.749107 2357.148816 -1359.020531 0.000000 0.000000
25% 5748.750000 37.500000 38.500000 100.682505 103.575101 101.260272 100.226674 105.278671 103.227686 102.217591 ... 131.925339 105.378566 101.736307 105.736392 104.255459 104.359193 7199.592697 62.409200 1.000000 1.000000
50% 11200.500000 74.500000 74.500000 112.257269 107.522212 106.145377 109.758230 110.890648 112.481474 105.049074 ... 145.573532 111.751614 105.857703 110.418789 110.704721 107.633955 11902.142387 411.031675 1.000000 1.000000
75% 16754.000000 112.500000 111.500000 127.363743 112.453893 112.600010 121.856395 117.995928 123.523854 108.912550 ... 160.569686 120.323985 110.831962 115.892401 118.899349 111.620794 17119.390322 760.320730 2.000000 2.000000
max 22498.000000 149.500000 149.500000 233.739627 140.378133 156.583507 199.787150 167.262641 201.763854 137.119403 ... 234.105254 172.571132 146.902636 149.420274 178.824065 136.771507 31142.774584 2078.685616 2.000000 2.000000

8 rows × 22 columns

We visualised the frequency of parcels w.r.t. the amount of gold available and the elevation levels in order to get better domain knowledge of the characteristics of gold mine.


In [130]:
# gplot = sample.plot(y="gold_available",kind='hist',title="Gold Distribution")
# gplot1 = sample.plot(y="elevation",kind='hist',title="Elevation of gold mines")
# gplot.set_xlabel("Amount of gold")
# gplot1.set_xlabel("Elevation")


data1 = [
    go.Histogram(
        x=sample['gold_available'],
        autobinx=False,
        xbins=dict(
        start=0,
        end=30000,
        size=3000
    )
    )
]

layout1 = go.Layout(
    title='Gold Distribution',
    xaxis=dict(
        title='gold_available'
    ),
    yaxis=dict(
        title='Frequency'
    ),
    #barmode='overlay',
    bargap=0.05,
    #bargroupgap=0.3
)

fig1 = go.Figure(data=data1, layout=layout1)
py.iplot(fig1)


Out[130]:

In [44]:
data2 = [
    go.Histogram(
        x=sample['elevation'],
         autobinx=False,
        xbins=dict(
        start=-1600,
        end=1600,
        size=300
    )   
    )
]

layout2 = go.Layout(
    title='Elevation of gold mines',
    xaxis=dict(
        title='Elevation'
    ),
    yaxis=dict(
        title='Frequency'
    ),
    #barmode='overlay',
    bargap=0.05,
    #bargroupgap=0.3
)

fig2 = go.Figure(data=data2, layout=layout2)
py.iplot(fig2)


Out[44]:

Clustering

In order to decide which parcels we wanted to place a bid on, we used clustering to first continue the previous data exploration. The elbow method enabled us to retrieve a value of K = 3 to run the K-means algorithm. We decided to build clusters along 2 characteristics features: gold_available and elevation.


In [131]:
# choosing K 
sample_databis = sample.loc[:,["gold_available","elevation"] ]#, "Northing","gold_available","fixed_cost","variable_cost"]] ,"Northing","gold_available","fixed_cost","variable_cost"
hpc = sample_databis.values
k_range = range(1, 14)
k_means_var = [KMeans(n_clusters=k).fit(hpc) for k in k_range] # fit kmeans model for each n_clusters
centroids = [X.cluster_centers_ for X in k_means_var] # pull out centroids of cluster
k_euclid = [cdist(hpc, cent, 'euclidean') for cent in centroids] # euclidian distance from each point to each centroid
dist = [np.min(ke, axis=1) for ke in k_euclid]

wcss = [sum(d**2) for d in dist] # within-cluster sum of squares
tss = sum(pdist(hpc)**2)/hpc.shape[0] # total sum of squares

bss = tss - wcss # between-cluster sum of squares

# Create a trace
trace = go.Scatter(
    x = k_range,
    y = bss
)


# Edit the layout
layout3 = dict(title = 'Elbow method',
              xaxis = dict(title = 'K'),
              yaxis = dict(title = 'Beetween-cluster sum of squares'),
              )

# Plot and embed in ipython notebook!
fig3 = dict(data=[trace], layout=layout3)
py.iplot(fig3, filename='Elbow method')


Out[131]:

In [132]:
K = 3
kmeans = KMeans(n_clusters=K).fit_predict(sample_databis)
# sample_data["cluster_id"] = kmeans
sample["cluster_id"] = kmeans

In [133]:
# statistics summary of the clustered data 
sample.loc[:,["cluster_id","elevation","gold_available"]].groupby("cluster_id").describe()


Out[133]:
elevation gold_available
cluster_id
0 count 1330.000000 1330.000000
mean 404.379175 20444.422659
std 505.972661 2509.127777
min -920.923932 16758.978816
25% 38.539848 18330.207012
50% 396.151943 20182.501302
75% 755.828554 22208.352331
max 1883.578941 31142.774584
1 count 1931.000000 1931.000000
mean 413.914922 6197.714968
std 497.542097 1973.817489
min -1359.020531 2357.148816
25% 77.618960 4571.619880
50% 410.441422 6106.238442
75% 752.867611 7914.823022
max 1919.009149 9625.599539
2 count 1739.000000 1739.000000
mean 407.124805 13062.499277
std 508.965050 2014.127530
min -1176.944995 9643.477605
25% 61.109307 11358.963481
50% 421.039296 12993.520356
75% 774.653226 14763.022516
max 2078.685616 16742.645404

In [134]:
# retrieving the cluster centers coordinates 
centers = KMeans(n_clusters=K).fit(sample_databis).cluster_centers_
centers


Out[134]:
array([[  6235.12179977,    415.35985293],
       [ 20497.41668316,    405.28883626],
       [ 13144.20550214,    404.70233489]])

Graphical method

Given the provided data and the previous cluster analysis, we first plotted the auctions of the 3 clusters. Then, we foccused on the 3rd cluster category, i.e. with medium elevation values for reduced exploitation costs and high amount of gold (gold_amount > 20000 oz). We plotted the cluster 3 along with the available parcels for auction. That way, we were able to determine the parcels which were the most profitable to invest in.

In fact, our bidding strategy was to invest all our money into the 5 most profitable parcels according to our estimations.


In [139]:
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5, forward=True)
plt.axis([0.0,150.0, 0.0,150.0])
for i, col in zip(range(0,K),['green','orange','blue','red','black']):
      #plt.scatter(sample_data[sample_data["cluster_id"]==i].loc[:,"Easting"],sample_data[sample_data["cluster_id"]==i].loc[:,"Northing"],c=col,marker='o')
    plt.scatter(sample[sample["cluster_id"]==i].loc[:,"Easting"],sample[sample["cluster_id"]==i].loc[:,"Northing"],c=col,marker='o')

for i in range(0,K): 
     plt.scatter(centers[i][0],centers[i][1],s=300,c='black',marker='*')



In [447]:
# aument costs _data 
regr = linear_model.LinearRegression()
X_train  = costs_data.loc[:10, ["gold_amount"]]
X_test =  costs_data.loc[11:, ["gold_amount"]]
for elev in ["low", "med", "high"]:
    y_train  = costs_data.loc[:10,elev]
    regr.fit(X_train,y_train)
    pred = regr.predict(X_test)
    costs_data.loc[11:,elev] = pred

A good cluster has elevation medium because lowest fixed costs highest gold amount.


In [477]:
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(18.5, 10.5, forward=True)
plt.axis([0.0,150.0, 0.0,150.0])
plt.scatter(sample[sample["cluster_id"]==2].loc[:,"Easting"],sample[sample["cluster_id"]==2].loc[:,"Northing"],c=col,marker='o')
for index, parcel in auction_parcels.iterrows():
    plt.scatter(auction_parcels['Easting'], auction_parcels['Northing'],color='red',label=str(auction_parcels['parcel_id']))



In [507]:
l= []
trace = go.Scatter(
        x= auction_parcels['Easting'],
        y= auction_parcels['Northing'],
        mode= 'markers+text',
 marker= dict(size= 5,
                    line= dict(width=1),
                    opacity= 0.3,
              color = 'red'
                   ),
    #,name= y[i],
        text= auction_parcels['parcel_id'])
trace2 = go.Scatter(
    x = sample[sample["cluster_id"]==2].loc[:,"Easting"], 
    y = sample[sample["cluster_id"]==2].loc[:,"Northing"],
     mode= 'markers',
 marker= dict(size= 3,
                    line= dict(width=1),
                    opacity= 0.5,
                  color = 'blue'
                   )
)
l.append(trace)
l.append(trace2)
fig= go.Figure(data=l)
py.iplot(fig)


Out[507]:

This graphical analysis resulted in the following list of most interesting parcels to invest in:

firstfilter = [10226, 7837, 19114, 20194, 11489,10905,1790,13249,14154,12810,1614,12221]

Let's predict whether a parcel contain a high amount of gold - WIP maybe


In [51]:
from sklearn.preprocessing import normalize
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

One strategy would have to construct the minerals and ores as they are not present in the auction data, by using clustering on those features for example, assigning each parcel to a cluster and completing the auction parcel dataframe with the average value for each feature of each cluster.

For simplicity we chose to only use the rocks categories features. Therefore we trained our classification models by extracting the previous features and normized those continuous values.


In [178]:
features = auction_parcels.columns.tolist()
features.append("gold_available")
features.pop(1)
features.pop(1)
features.pop(0)
sample2 = sample_data.merge(elevation, on="parcel_id")
sample_set = sample2.loc[:,features]
sample_set.loc[:,["elevation","Arkonor","Mercoxit","Bistot","Crokite"]] = normalize(sample_set.loc[:,["elevation","Arkonor","Mercoxit","Bistot","Crokite"]], axis =0)

In [98]:
def label_gold_amount(row, threshold): 
    if row["gold_available"]  < threshold: 
        return 0 
    elif row["gold_available"]  >= threshold: 
        return 1 
sample_set["gold_available"] = sample_set.apply(lambda row: label_gold_amount(row, 10000), axis = 1)

In [104]:
def label_gold_amount(row): 
    if row["gold_available"]> 0 and row["gold_available"]  <= 10000: 
        return 0 
    elif row["gold_available"]> 10000 and row["gold_available"]  <= 20000: 
        return 1 
    else:
        return 2 
sample_set["gold_available"] = sample_set.apply(lambda row: label_gold_amount(row), axis = 1)

In [106]:
#sample_set
X_train, X_test, y_train, y_test = train_test_split(sample_set.iloc[:,0:5],sample_set.iloc[:,5],test_size=0.05)

In [107]:
logreg = LogisticRegression(multi_class='ovr') # 
param_range = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
gs = GridSearchCV(estimator=logreg,param_grid= {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l1','l2'] },scoring='accuracy',cv=5,n_jobs=-1)

In [108]:
gs.fit(X_train,y_train)


Out[108]:
GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [109]:
gs.best_score_


Out[109]:
0.52989473684210531

In [110]:
gs.best_params_


Out[110]:
{'C': 100, 'penalty': 'l1'}

In [111]:
print('Test accuracy: %.2f %%' % (gs.score(X_test, y_test)*100))


Test accuracy: 53.20 %

In [124]:
sum(y_train == 2)


Out[124]:
664

In [113]:
# X_test
auction_parcels_processedtmp = auction_parcels
auction_parcels_processedtmp.loc[:,["elevation","Arkonor","Mercoxit","Bistot","Crokite"]] = normalize(auction_parcels.loc[:,["elevation","Arkonor","Mercoxit","Bistot","Crokite"]], axis =0)
auction_parcels_processed = auction_parcels_processedtmp.loc[:,["parcel_id","elevation","Arkonor","Mercoxit","Bistot","Crokite"]]
auction_parcels_processed.head()


Out[113]:
parcel_id elevation Arkonor Mercoxit Bistot Crokite
0 515 0.004863 0.141748 0.130815 0.134799 0.133601
1 914 0.164621 0.135439 0.134627 0.138784 0.142540
2 1538 0.263969 0.136843 0.142994 0.134596 0.127829
3 1790 0.099440 0.134950 0.141071 0.134163 0.136110
4 2416 0.332363 0.151359 0.150069 0.144281 0.141832

In [116]:
gs.predict(auction_parcels_processed.iloc[:,1:])


Out[116]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0])

In [120]:
svc = SVC(random_state=1)
param_range2 = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid2 = [{'C': param_range2,'kernel': ['linear']},
              {'C': param_range2,'kernel': ['rbf']}]
gs2 = GridSearchCV(estimator=svc,param_grid=param_grid2,scoring='accuracy',cv=3,n_jobs=-1)

In [121]:
gs2.fit(X_train,y_train)


Out[121]:
GridSearchCV(cv=3, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=1, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'kernel': ['linear'], 'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]}, {'kernel': ['rbf'], 'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [122]:
gs2.predict(auction_parcels_processed.iloc[:,1:])


Out[122]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0])

Clustering 2


In [225]:
sample2 = sample_data.merge(elevation, on="parcel_id")
features2 = features
sample_databis2 = sample2.loc[:,features2]
kmeans2 = KMeans(n_clusters=5).fit_predict(sample_databis2)
sample2["cluster_id"] = kmeans2

In [226]:
# statistics summary of the clustered data 
listo = sample2.columns.tolist()
listo.remove('Northing')
all_features = set(listo)
sample3 = sample2
reference = sample3.loc[:,all_features - set(features2)].groupby('cluster_id').mean()

In [1]:
# filtre = ['Arkonor','Mercoxit','Bistot','Crokite', 'elevation']
# filtros = ['Arkonor','Mercoxit','Bistot','Crokite', 'elevation', 'cluster_id']
# refera = sample3.loc[:, filtros].groupby('cluster_id').describe()

# def assign_cluster(row): 
#     for i in xrange(5):
#         for val in filtre: 
#             print(i + " " + val )
#             # if(row[val] > refera.loc[i, :].min()[val] and row[val] < refera.loc[i, :].max()[val]):
#                 # row['cluster_id'] = i

# auction_pracels2 = auction_parcels 
# auction_pracels2.apply(lambda row: assign_cluster(row), axis = 1)
# # refera.loc[0, :].min()

In [ ]:
missing_list = reference.columns.tolist()
def enrich_auction(row): 
    for i in xrange(5):
        if row['cluster_id'] == i:
            row[missing_list] = reference.loc[i, :]

In [236]:
#acution_pracels2 = auction_parcels 
#acution_pracels2.apply(lambda row: enrich_auction(row), axis = 1)

In [259]:
# sample2.loc[:,features.append['cluster_id']]

In [264]:
refera.min()


Out[264]:
Arkonor         7.092485
Bistot         10.804418
Crokite         5.419970
Mercoxit        7.515567
elevation   -1359.020531
dtype: float64

In [302]:
for i in xrange(5):
      for val in filtre: 
            print(str(i) + " " + val )


0 Arkonor
0 Mercoxit
0 Bistot
0 Crokite
0 elevation
1 Arkonor
1 Mercoxit
1 Bistot
1 Crokite
1 elevation
2 Arkonor
2 Mercoxit
2 Bistot
2 Crokite
2 elevation
3 Arkonor
3 Mercoxit
3 Bistot
3 Crokite
3 elevation
4 Arkonor
4 Mercoxit
4 Bistot
4 Crokite
4 elevation

In [ ]: