In [136]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist, pdist
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
In [125]:
sample_data = pd.read_csv("data/sample_data.csv")
costs_data = pd.read_csv("data/costs_data.csv")
auction_parcels = pd.read_csv("data/auction_parcels.csv")
elevation_data = pd.read_csv("data/elevation_data.csv")
In [126]:
sample_data[["gold_available"]].head()
Out[126]:
We first enriched the sample data with the elevation feature, then mapped the continuous elevation values into one of three categories for each of the fixed and variable costs (2 x 3 categories in total):
In [127]:
elevation = elevation_data.drop(["Easting", "Northing"], axis=1)  # keep only parcel_id and elevation for the merge
sample = sample_data.merge(elevation, on="parcel_id")
In [128]:
def map_to_fixed_elevation(row, threshold):
    # map a continuous elevation to one of three cost categories
    if row["elevation"] < 0:
        return 0
    elif row["elevation"] <= threshold:
        return 1
    else:
        return 2
In [129]:
sample["fixed_cost"] = sample.apply(lambda row: map_to_fixed_elevation(row, 500), axis = 1)
sample["variable_cost"] = sample.apply(lambda row: map_to_fixed_elevation(row, 700), axis = 1)
sample.describe()
Out[129]:
We visualised the frequency of parcels with respect to the amount of gold available and the elevation levels, to build better domain knowledge of the characteristics of the gold mines.
In [130]:
# gplot = sample.plot(y="gold_available",kind='hist',title="Gold Distribution")
# gplot1 = sample.plot(y="elevation",kind='hist',title="Elevation of gold mines")
# gplot.set_xlabel("Amount of gold")
# gplot1.set_xlabel("Elevation")
data1 = [
    go.Histogram(
        x=sample['gold_available'],
        autobinx=False,
        xbins=dict(
            start=0,
            end=30000,
            size=3000
        )
    )
]
layout1 = go.Layout(
    title='Gold Distribution',
    xaxis=dict(title='gold_available'),
    yaxis=dict(title='Frequency'),
    bargap=0.05
)
fig1 = go.Figure(data=data1, layout=layout1)
py.iplot(fig1)
Out[130]:
In [44]:
data2 = [
    go.Histogram(
        x=sample['elevation'],
        autobinx=False,
        xbins=dict(
            start=-1600,
            end=1600,
            size=300
        )
    )
]
layout2 = go.Layout(
    title='Elevation of gold mines',
    xaxis=dict(title='Elevation'),
    yaxis=dict(title='Frequency'),
    bargap=0.05
)
fig2 = go.Figure(data=data2, layout=layout2)
py.iplot(fig2)
Out[44]:
To decide which parcels to bid on, we continued the data exploration with clustering. We built clusters along two characteristic features, gold_available and elevation, and used the elbow method to choose K = 3 for the K-means algorithm.
In [131]:
# choosing K with the elbow method
sample_databis = sample.loc[:, ["gold_available", "elevation"]]
hpc = sample_databis.values
k_range = range(1, 14)
k_means_var = [KMeans(n_clusters=k).fit(hpc) for k in k_range]    # fit a k-means model for each n_clusters
centroids = [X.cluster_centers_ for X in k_means_var]             # pull out the centroids of each fit
k_euclid = [cdist(hpc, cent, 'euclidean') for cent in centroids]  # Euclidean distance from each point to each centroid
dist = [np.min(ke, axis=1) for ke in k_euclid]                    # distance to the closest centroid
wcss = [sum(d**2) for d in dist]                                  # within-cluster sum of squares
tss = sum(pdist(hpc)**2) / hpc.shape[0]                           # total sum of squares
bss = tss - np.array(wcss)                                        # between-cluster sum of squares
# Create a trace
trace = go.Scatter(
    x=list(k_range),
    y=bss
)
# Edit the layout
layout3 = dict(title='Elbow method',
               xaxis=dict(title='K'),
               yaxis=dict(title='Between-cluster sum of squares'))
# Plot and embed in the notebook
fig3 = dict(data=[trace], layout=layout3)
py.iplot(fig3, filename='Elbow method')
Out[131]:
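As an aside, scikit-learn exposes the within-cluster sum of squares directly as the inertia_ attribute of a fitted KMeans model, so the same elbow curve can be sketched more compactly (a minimal alternative to the manual cdist computation above):
# sketch: elbow curve via KMeans.inertia_, which is exactly the within-cluster sum of squares
wcss_direct = [KMeans(n_clusters=k).fit(hpc).inertia_ for k in k_range]
plt.plot(list(k_range), wcss_direct, marker='o')
plt.xlabel('K')
plt.ylabel('Within-cluster sum of squares')
plt.title('Elbow method (WCSS)')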
In [132]:
K = 3
kmeans_model = KMeans(n_clusters=K).fit(sample_databis)  # fit once so labels and centers come from the same model
sample["cluster_id"] = kmeans_model.labels_
In [133]:
# statistics summary of the clustered data
sample.loc[:,["cluster_id","elevation","gold_available"]].groupby("cluster_id").describe()
Out[133]:
In [134]:
# retrieving the cluster center coordinates from the same fit
centers = kmeans_model.cluster_centers_
centers
Out[134]:
Given the provided data and the previous cluster analysis, we first plotted the parcels of the 3 clusters. Then we focused on the third cluster, i.e. the one with medium elevation values (hence reduced exploitation costs) and a high amount of gold (gold_available > 20000 oz). We plotted this cluster along with the parcels available for auction, which let us determine the parcels that were the most profitable to invest in.
Our bidding strategy was then to invest all our money into the 5 most profitable parcels according to our estimations.
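To make "most profitable" concrete, a ranking along the following lines could be used. This is only a sketch: it assumes that costs_data tabulates the total exploitation cost per gold_amount for each elevation band, that our variable_cost category (0/1/2) maps onto the low/med/high columns, and that gold value and costs share a unit.
# hedged sketch: rank cluster-3 sample parcels by a rough profit estimate
def estimated_cost(gold, band):
    # look up the tabulated cost row whose gold_amount is closest to this parcel's gold
    idx = (costs_data["gold_amount"] - gold).abs().idxmin()
    return costs_data.loc[idx, band]

bands = ["low", "med", "high"]  # assumed to correspond to categories 0/1/2
cluster3 = sample[sample["cluster_id"] == 2].copy()
cluster3["profit"] = cluster3.apply(
    lambda r: r["gold_available"] - estimated_cost(r["gold_available"], bands[int(r["variable_cost"])]),
    axis=1)
cluster3.nlargest(5, "profit")[["parcel_id", "profit"]]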
In [139]:
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5, forward=True)
plt.axis([0.0, 150.0, 0.0, 150.0])
for i, col in zip(range(0, K), ['green', 'orange', 'blue', 'red', 'black']):
    plt.scatter(sample[sample["cluster_id"] == i].loc[:, "Easting"],
                sample[sample["cluster_id"] == i].loc[:, "Northing"],
                c=col, marker='o')
# note: the cluster centers live in (gold_available, elevation) space, not in map
# coordinates, so drawing them on this Easting/Northing map would be misleading:
# for i in range(0, K):
#     plt.scatter(centers[i][0], centers[i][1], s=300, c='black', marker='*')
In [447]:
# augment costs_data: extrapolate each cost column linearly beyond the tabulated rows
regr = linear_model.LinearRegression()
X_train = costs_data.loc[:10, ["gold_amount"]]
X_test = costs_data.loc[11:, ["gold_amount"]]
for elev in ["low", "med", "high"]:
    y_train = costs_data.loc[:10, elev]
    regr.fit(X_train, y_train)
    pred = regr.predict(X_test)
    costs_data.loc[11:, elev] = pred
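A quick sanity check on this augmentation is to plot the full table: the extrapolated tail of each cost curve should continue the tabulated points in a straight line (a small sketch):
# sketch: visualise the tabulated points and their linear extrapolation together
for elev in ["low", "med", "high"]:
    plt.plot(costs_data["gold_amount"], costs_data[elev], marker='.', label=elev)
plt.xlabel("gold_amount")
plt.ylabel("cost")
plt.legend()
plt.title("Cost curves after linear extrapolation")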
A good cluster is one with medium elevation, since it combines the lowest fixed costs with the highest gold amounts.
In [477]:
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5, forward=True)
plt.axis([0.0, 150.0, 0.0, 150.0])
plt.scatter(sample[sample["cluster_id"] == 2].loc[:, "Easting"],
            sample[sample["cluster_id"] == 2].loc[:, "Northing"],
            c='blue', marker='o')
for index, parcel in auction_parcels.iterrows():
    plt.scatter(parcel['Easting'], parcel['Northing'], color='red', label=str(parcel['parcel_id']))
In [507]:
traces = []
trace = go.Scatter(
    x=auction_parcels['Easting'],
    y=auction_parcels['Northing'],
    mode='markers+text',
    marker=dict(size=5,
                line=dict(width=1),
                opacity=0.3,
                color='red'),
    text=auction_parcels['parcel_id']
)
trace2 = go.Scatter(
    x=sample[sample["cluster_id"] == 2].loc[:, "Easting"],
    y=sample[sample["cluster_id"] == 2].loc[:, "Northing"],
    mode='markers',
    marker=dict(size=3,
                line=dict(width=1),
                opacity=0.5,
                color='blue')
)
traces.append(trace)
traces.append(trace2)
fig = go.Figure(data=traces)
py.iplot(fig)
Out[507]:
This graphical analysis resulted in the following shortlist of the most interesting parcels to invest in:
firstfilter = [10226, 7837, 19114, 20194, 11489, 10905, 1790, 13249, 14154, 12810, 1614, 12221]
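The same shortlist could also be derived programmatically rather than by eye, e.g. by keeping the auction parcels that lie close to some cluster-3 sample parcel. The sketch below does this with cdist; the 5.0 proximity threshold is an arbitrary assumption.
# sketch: auction parcels within an assumed distance of any cluster-3 sample parcel
c3_coords = sample[sample["cluster_id"] == 2][["Easting", "Northing"]].values
auction_coords = auction_parcels[["Easting", "Northing"]].values
near = cdist(auction_coords, c3_coords).min(axis=1) < 5.0  # 5.0 is an assumed threshold
shortlist = auction_parcels.loc[near, "parcel_id"].tolist()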
In [51]:
from sklearn.preprocessing import normalize
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
One strategy would have been to reconstruct the minerals and ores, since they are not present in the auction data: for example, by clustering the sample parcels on those features, assigning each auction parcel to a cluster, and completing the auction parcel dataframe with each cluster's average value for each missing feature.
For simplicity we chose to only use the rock category features. We therefore trained our classification models by extracting those features and normalizing the continuous values.
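Although we did not implement that imputation, it could look roughly like the sketch below, where missing_cols stands in for the mineral and ore columns absent from the auction data (the names are hypothetical placeholders):
# hedged sketch: impute sample-only columns on auction parcels via cluster averages
shared_cols = ["elevation", "Arkonor", "Mercoxit", "Bistot", "Crokite"]  # present in both frames
missing_cols = ["some_mineral", "some_ore"]  # hypothetical placeholders for the sample-only features
km = KMeans(n_clusters=5).fit(sample[shared_cols])
means = sample.assign(cluster_id=km.labels_).groupby("cluster_id")[missing_cols].mean()
assigned = km.predict(auction_parcels[shared_cols])
auction_aug = auction_parcels.copy()
for c in missing_cols:
    auction_aug[c] = means.loc[assigned, c].values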
In [178]:
features = auction_parcels.columns.tolist()
features.append("gold_available")
# drop Easting, Northing and parcel_id, keeping elevation, the rock features and gold_available
features.pop(1)
features.pop(1)
features.pop(0)
sample2 = sample_data.merge(elevation, on="parcel_id")
sample_set = sample2.loc[:, features]
sample_set.loc[:, ["elevation", "Arkonor", "Mercoxit", "Bistot", "Crokite"]] = normalize(
    sample_set.loc[:, ["elevation", "Arkonor", "Mercoxit", "Bistot", "Crokite"]], axis=0)
In [98]:
def label_gold_amount(row, threshold):
    # binary label: 1 if the parcel holds at least `threshold` gold, else 0
    if row["gold_available"] < threshold:
        return 0
    else:
        return 1

sample_set["gold_available"] = sample_set.apply(lambda row: label_gold_amount(row, 10000), axis=1)
In [104]:
def label_gold_amount(row):
    # three-class label; parcels with no gold also belong in class 0
    if row["gold_available"] <= 10000:
        return 0
    elif row["gold_available"] <= 20000:
        return 1
    else:
        return 2

sample_set["gold_available"] = sample_set.apply(lambda row: label_gold_amount(row), axis=1)
In [106]:
#sample_set
X_train, X_test, y_train, y_test = train_test_split(sample_set.iloc[:,0:5],sample_set.iloc[:,5],test_size=0.05)
In [107]:
logreg = LogisticRegression(multi_class='ovr')
param_range = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
gs = GridSearchCV(estimator=logreg, param_grid={'C': param_range, 'penalty': ['l1', 'l2']},
                  scoring='accuracy', cv=5, n_jobs=-1)
In [108]:
gs.fit(X_train,y_train)
Out[108]:
In [109]:
gs.best_score_
Out[109]:
In [110]:
gs.best_params_
Out[110]:
In [111]:
print('Test accuracy: %.2f %%' % (gs.score(X_test, y_test)*100))
In [124]:
sum(y_train == 2)  # how many training examples fall in the highest gold class
Out[124]:
In [113]:
auction_parcels_processedtmp = auction_parcels.copy()  # copy so normalize() does not mutate auction_parcels
auction_parcels_processedtmp.loc[:, ["elevation", "Arkonor", "Mercoxit", "Bistot", "Crokite"]] = normalize(
    auction_parcels.loc[:, ["elevation", "Arkonor", "Mercoxit", "Bistot", "Crokite"]], axis=0)
auction_parcels_processed = auction_parcels_processedtmp.loc[:, ["parcel_id", "elevation", "Arkonor", "Mercoxit", "Bistot", "Crokite"]]
auction_parcels_processed.head()
Out[113]:
In [116]:
gs.predict(auction_parcels_processed.iloc[:,1:])
Out[116]:
In [120]:
svc = SVC(random_state=1)
param_range2 = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid2 = [{'C': param_range2,'kernel': ['linear']},
{'C': param_range2,'kernel': ['rbf']}]
gs2 = GridSearchCV(estimator=svc,param_grid=param_grid2,scoring='accuracy',cv=3,n_jobs=-1)
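Since half of this grid uses a linear kernel, a faster alternative worth noting is LinearSVC (already imported above); a minimal sketch:
# sketch: grid search over C with the dedicated linear SVM implementation
gs_lin = GridSearchCV(estimator=LinearSVC(random_state=1),
                      param_grid={'C': param_range2},
                      scoring='accuracy', cv=3, n_jobs=-1)
gs_lin.fit(X_train, y_train)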
In [121]:
gs2.fit(X_train,y_train)
Out[121]:
In [122]:
gs2.predict(auction_parcels_processed.iloc[:,1:])
Out[122]:
In [225]:
sample2 = sample_data.merge(elevation, on="parcel_id")
features2 = features
sample_databis2 = sample2.loc[:,features2]
kmeans2 = KMeans(n_clusters=5).fit_predict(sample_databis2)
sample2["cluster_id"] = kmeans2
In [226]:
# per-cluster averages of the features that are missing from the auction data
listo = sample2.columns.tolist()
for col in ['Northing', 'Easting', 'parcel_id']:  # coordinates and ids should not be averaged
    if col in listo:
        listo.remove(col)
all_features = set(listo)
sample3 = sample2
reference = sample3.loc[:, list(all_features - set(features2))].groupby('cluster_id').mean()
In [1]:
# filtre = ['Arkonor', 'Mercoxit', 'Bistot', 'Crokite', 'elevation']
# filtros = ['Arkonor', 'Mercoxit', 'Bistot', 'Crokite', 'elevation', 'cluster_id']
# refera = sample3.loc[:, filtros].groupby('cluster_id').describe()
# def assign_cluster(row):
#     for i in range(5):
#         for val in filtre:
#             print(str(i) + " " + val)
#             # if row[val] > refera.loc[i, :].min()[val] and row[val] < refera.loc[i, :].max()[val]:
#             #     row['cluster_id'] = i
# auction_parcels2 = auction_parcels
# auction_parcels2.apply(lambda row: assign_cluster(row), axis=1)
# # refera.loc[0, :].min()
In [ ]:
missing_list = reference.columns.tolist()
def enrich_auction(row):
    # fill the missing features with the averages of the row's cluster
    # (assumes the row already carries a 'cluster_id')
    row[missing_list] = reference.loc[row['cluster_id'], :]
    return row
In [236]:
#auction_parcels2 = auction_parcels.copy()
#auction_parcels2 = auction_parcels2.apply(lambda row: enrich_auction(row), axis=1)
In [259]:
# sample2.loc[:,features.append['cluster_id']]
In [264]:
refera.min()  # refera is defined in the commented-out cell above
Out[264]:
In [302]:
for i in range(5):
    for val in filtre:  # filtre is defined in the commented-out cell above
        print(str(i) + " " + val)
In [ ]: