In [1]:
# importing the libraries we will use
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import warnings
from scipy import stats
from scipy.stats import norm, skew, pearsonr
from sklearn import metrics
from sklearn.utils import shuffle
# cross_validation and grid_search were removed from scikit-learn;
# their replacements live in model_selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import ElasticNet as ElasticNetImpl
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
warnings.filterwarnings('ignore')
In [2]:
fifa = pd.read_csv('CompleteDataset.csv')
In [3]:
def extrai(value):
    # converts strings like '€110.5M' or '€565K' into a float in euros
    out = value.replace('€', '')
    if 'M' in out:
        out = float(out.replace('M', '')) * 1000000
    elif 'K' in out:
        out = float(out.replace('K', '')) * 1000
    return float(out)
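In [ ]:
# Quick sanity check of extrai on the three string shapes it handles
# (illustrative values, not taken from the dataset):
assert extrai('€110.5M') == 110500000.0
assert extrai('€565K') == 565000.0
assert extrai('€0') == 0.0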
In [4]:
fifa['Value'] = fifa['Value'].apply(extrai)
fifa['Wage'] = fifa['Wage'].apply(extrai)
In [24]:
fifa = shuffle(fifa, random_state=7)  # fixed seed so the split below is reproducible
In [25]:
train = fifa.iloc[:15000]
test = fifa.iloc[15000:]
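In [ ]:
# The same split can be done with the already-imported train_test_split;
# shuffle=False keeps the row order produced by the shuffle above (a sketch,
# the manual iloc split remains the one used below):
tr_alt, te_alt = train_test_split(fifa, train_size=15000, shuffle=False)
assert tr_alt.equals(train) and te_alt.equals(test)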
In [26]:
x = ['Potential', 'Overall', 'Wage', 'Age', 'Special']  # features used to predict Value
y = ['Value']  # target
In [27]:
var = 'Value'
data = pd.concat([train['Overall'], train[var]], axis=1)
data.plot.scatter(x=var, y='Overall', ylim=(60,100), xlim=(0,150000000));
In [28]:
var = 'Wage'
data = pd.concat([train['Overall'], train[var]], axis=1)
data.plot.scatter(x=var, y='Overall', ylim=(40,100), xlim=(0,600000));
In [29]:
corr = fifa.drop('ID', axis=1).select_dtypes(include=[np.number]).corr()
fig = plt.figure(figsize=(20, 16))
ax = sns.heatmap(corr,
                 xticklabels=corr.columns.values,
                 yticklabels=corr.columns.values,
                 linewidths=0.25,
                 vmax=1.0,
                 square=True,
                 cmap='PuBu',
                 linecolor='black',
                 annot=False
                 )
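In [ ]:
# The heatmap is easier to read as a ranked list; the strongest numeric
# correlates of Value (a quick inspection, not used by the models below):
corr['Value'].sort_values(ascending=False).head(10)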
In [30]:
train.drop(["Photo", "Flag","Club Logo","Name"],1,inplace=True)
In [31]:
train.drop("ID",1,inplace=True)
In [32]:
l_encode = LabelEncoder()
obj_feat = ["Club", "Nationality", "Preferred Positions"]
for var in obj_feat:
    train[var] = l_encode.fit_transform(train[var].astype(str))
train.shape
Out[32]:
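In [ ]:
# l_encode was refit on each column, so classes_ reflects only the last one
# ("Preferred Positions"); a hedged peek at the label -> code mapping:
dict(zip(l_encode.classes_[:5], l_encode.transform(l_encode.classes_[:5])))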
In [33]:
def clean_values(x):
    # attribute columns mix ints with strings like '84+2';
    # keep only the base two-digit rating
    try:
        if len(x) > 2:
            return x[:2]
        return x
    except TypeError:  # already numeric, pass through unchanged
        return x
columns_to_clean = [col for col in train.columns if col not in ["Age", "Nationality",
                                                                "Overall", "Potential",
                                                                "Club", "Value", "Wage",
                                                                "Special"]]
for col in columns_to_clean:
    train[col] = train[col].apply(clean_values)
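In [ ]:
# clean_values in isolation (illustrative inputs):
print(clean_values('84+2'))  # -> '84'
print(clean_values('62'))    # -> '62'
print(clean_values(75))      # -> 75 (ints hit the TypeError branch)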
In [34]:
train = train.dropna(axis=1, how="any")
In [35]:
def modelfit(alg, dtrain, features, performCV=True, printFeatureImportance=True, cv_folds=10):
    alg.fit(dtrain[features], dtrain["Value"])
    dtrain_predictions = alg.predict(dtrain[features])
    print("\nModel Report")
    print("RMSE : %.4g" % np.sqrt(metrics.mean_squared_error(dtrain["Value"], dtrain_predictions)))
    if performCV:
        cv_score = cross_val_score(alg, dtrain[features], dtrain["Value"], cv=cv_folds,
                                   scoring='neg_mean_squared_error')
        cv_score = np.sqrt(np.abs(cv_score))  # turn negative MSE scores into RMSE
        print("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (
            np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score)))
    if printFeatureImportance:
        feat_imp = pd.Series(alg.feature_importances_, features).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')
In [36]:
# how strongly the other variables relate to Value: a baseline gradient
# boosting model plus its feature importances
features = [i for i in train.columns if i != "Value"]
target = "Value"
gbm0 = GradientBoostingRegressor(random_state=7)
modelfit(gbm0, train, features)
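In [ ]:
# GridSearchCV is imported above but never used; a hedged sketch of how the
# baseline GBM could be tuned (small illustrative grid, cv=3 to keep it cheap):
param_grid = {'n_estimators': [100, 300], 'max_depth': [3, 5]}
gsearch = GridSearchCV(GradientBoostingRegressor(random_state=7), param_grid,
                       scoring='neg_mean_squared_error', cv=3)
gsearch.fit(train[features], train['Value'])
gsearch.best_params_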
In [37]:
# Players with a wage of 0.8K were read as 0. To correct this, we replace the
# resulting zeros in Value and Wage with 1 (rounding up), which also keeps
# np.log in the next cell well-defined.
train.loc[train.Value == 0, 'Value'] = 1
train.loc[train.Wage == 0, 'Wage'] = 1
In [38]:
sns.distplot(np.log(train['Value']), fit=norm);
fig = plt.figure()
res = stats.probplot(np.log(train['Value']), plot=plt)
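In [ ]:
# skew is imported above; a quick numeric check that the log transform removes
# most of the right skew visible in the distribution plot:
print('skew(Value):     %.2f' % skew(train['Value']))
print('skew(log Value): %.2f' % skew(np.log(train['Value'])))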
In [43]:
def ridge_regression(fit_df, predict_df, x, alpha):
    # fit on fit_df and predict on predict_df, so the test-set evaluation below
    # does not refit on the test data; features are standardized explicitly
    # because Ridge's old normalize=True flag was removed from scikit-learn
    scaler = StandardScaler().fit(fit_df[x])
    ridgereg = Ridge(alpha=alpha)
    ridgereg.fit(scaler.transform(fit_df[x]), fit_df['Value'])
    return ridgereg.predict(scaler.transform(predict_df[x]))
In [45]:
ridge = ridge_regression(train, train, x, 1e-20)
plt.plot(train['Value'], ridge, '.', color="blue")
plt.axis([0, 130000000, 0, 130000000])
plt.xlabel("Actual value")
plt.ylabel("Predicted value")
plt.show()
In [41]:
r_R = ridge_regression(train, test, x, 1e-20)  # fit on train, evaluate on test
print((mean_squared_error(test['Value'], r_R)) ** (1/2))  # test RMSE
In [51]:
ridgetest = ridge_regression(train, test, x, 1e-20)
plt.plot(test['Value'], ridgetest, '.', color="red")
plt.axis([0, 130000000, 0, 130000000])
plt.xlabel("Actual value")
plt.ylabel("Predicted value")
plt.show()
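In [ ]:
# Lasso and ElasticNet are imported above but never used; a hedged sketch
# comparing their test RMSE with the ridge result, on the same features
# (alpha=1.0 is an illustrative default, not a tuned value):
for name, model in [('Lasso', Lasso(alpha=1.0)),
                    ('ElasticNet', ElasticNetImpl(alpha=1.0))]:
    model.fit(train[x], train['Value'])
    pred = model.predict(test[x])
    print('%s RMSE: %.4g' % (name, mean_squared_error(test['Value'], pred) ** 0.5))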
In [ ]: