In [106]:
%matplotlib inline
import pandas as pd
import numpy as np
import random as rnd
import seaborn as sns
import matplotlib.pyplot as plt
In [109]:
# Column labels applied to train.csv, which is read without a header row.
feature_names = [
    'usuario', 'palabra', 'palabraLeida', 'tiempoCaracter',
    'hayErrPalabra', 'tiempoErrPalabra', 'numPalabra', 'tiempoPalabra',
    'tamPalabra', 'caracter', 'falloCaracter', 'palabraCorrecta',
]
# The file is semicolon-separated.
data = pd.read_csv('train.csv', sep=";", header=None)
data.columns = feature_names
In [110]:
# Load predict.csv: semicolon-separated, no header row, labelled with the
# twelve column names listed below.
predict = pd.read_csv('predict.csv', sep=";", header=None)
feature_names = [
    'usuario', 'palabra', 'palabraLeida', 'tiempoCaracter',
    'hayErrPalabra', 'tiempoErrPalabra', 'numPalabra', 'tiempoPalabra',
    'tamPalabra', 'caracter', 'falloCaracter', 'palabraCorrecta',
]
predict.columns = feature_names
In [111]:
# Show the rows whose `caracter` value equals 'Z'.
data.loc[data['caracter'] == 'Z']
Out[111]:
In [112]:
# Convert the boolean flag columns to integers: 1 for True, 0 for False.
bool_to_int = {False: 0, True: 1}
for flag_col in ("hayErrPalabra", "falloCaracter", "palabraCorrecta"):
    data[flag_col] = data[flag_col].map(bool_to_int)
In [113]:
# Convert the boolean flag columns of `predict` to integers: 1 for True, 0 for False.
bool_to_int = {False: 0, True: 1}
for flag_col in ("hayErrPalabra", "falloCaracter", "palabraCorrecta"):
    predict[flag_col] = predict[flag_col].map(bool_to_int)
In [114]:
# Strip leading and trailing whitespace from the user names.
usuario_sin_espacios = data["usuario"].str.strip()
data["usuario"] = usuario_sin_espacios
In [115]:
# Strip leading and trailing whitespace from the user names in `predict`.
usuario_pred_sin_espacios = predict["usuario"].str.strip()
predict["usuario"] = usuario_pred_sin_espacios
In [116]:
# Numeric id for each user name; names missing from the mapping become NaN.
user_ids = {"Cristhian": 0, "Jesus": 1}
data["usuarioID"] = data["usuario"].map(user_ids)
In [117]:
# Numeric id for each user name in `predict`; names missing from the mapping become NaN.
user_ids = {"Cristhian": 0, "Jesus": 1}
predict["usuarioID"] = predict["usuario"].map(user_ids)
In [118]:
# Keep only `caracter` values that compare between 'A' and 'Z' (both bounds
# included); every other value in the column becomes NaN.
# `between` includes both bounds by default, so the boolean `inclusive=True`
# argument (deprecated in pandas 1.3 and rejected by pandas 2.0) is omitted.
# `.where` replaces the original filter-then-realign assignment with the same result.
es_letra = data['caracter'].between('A', 'Z')
data['caracter'] = data['caracter'].where(es_letra)
In [119]:
# Keep only `caracter` values that compare between 'A' and 'Z' (both bounds
# included); every other value in the column becomes NaN.
# `between` includes both bounds by default, so the boolean `inclusive=True`
# argument (deprecated in pandas 1.3 and rejected by pandas 2.0) is omitted.
# `.where` replaces the original filter-then-realign assignment with the same result.
es_letra_pred = predict['caracter'].between('A', 'Z')
predict['caracter'] = predict['caracter'].where(es_letra_pred)
In [334]:
# Replace every word in `data['palabra']` with an integer code.
# Codes follow the sorted order of the distinct words, so they are the same on
# every run; enumerating a `set` of strings (the previous approach) depends on
# per-process hash randomisation. Missing values, if any, are coded as -1.
codes, valores_unicos = pd.factorize(data['palabra'], sort=True)
# `d` is kept as the word -> code lookup table.
d = {valor: codigo for codigo, valor in enumerate(valores_unicos)}
data['palabra'] = codes
In [ ]:
# Replace every word in `predict['palabra']` with an integer code.
# Codes follow the sorted order of the distinct words, so they are the same on
# every run; enumerating a `set` of strings (the previous approach) depends on
# per-process hash randomisation. Missing values, if any, are coded as -1.
# NOTE(review): the codes are derived from `predict` alone, so they coincide
# with the codes of another frame only if both hold the same set of words.
codes, valores_unicos = pd.factorize(predict['palabra'], sort=True)
# `d` is kept as the word -> code lookup table.
d = {valor: codigo for codigo, valor in enumerate(valores_unicos)}
predict['palabra'] = codes
In [335]:
# Replace every value in `data['caracter']` with an integer code.
# Codes follow the sorted order of the distinct values, so they are the same on
# every run; enumerating a `set` of strings (the previous approach) depends on
# per-process hash randomisation. Missing values are coded as -1.
# NOTE(review): as with the previous set-based mapping, NaN receives a code, so
# the column holds no NaN after this cell; `data['caracter'].isnull()` filters
# that run later select nothing. Confirm the intended cell order.
codes, valores_unicos = pd.factorize(data['caracter'], sort=True)
# `d` is kept as the value -> code lookup table (it has no entry for NaN).
d = {valor: codigo for codigo, valor in enumerate(valores_unicos)}
data['caracter'] = codes
In [ ]:
# Replace every value in `predict['caracter']` with an integer code.
# Codes follow the sorted order of the distinct values, so they are the same on
# every run; enumerating a `set` of strings (the previous approach) depends on
# per-process hash randomisation. Missing values are coded as -1.
# NOTE(review): as with the previous set-based mapping, NaN receives a code, so
# the column holds no NaN after this cell; `predict['caracter'].isnull()`
# filters that run later select nothing. Confirm the intended cell order.
# The codes are derived from `predict` alone, so they coincide with the codes
# of another frame only if both hold the same set of values.
codes, valores_unicos = pd.factorize(predict['caracter'], sort=True)
# `d` is kept as the value -> code lookup table (it has no entry for NaN).
d = {valor: codigo for codigo, valor in enumerate(valores_unicos)}
predict['caracter'] = codes
In [121]:
# Per-(usuario, caracter) means of `tiempoCaracter` and `falloCaracter`, over
# the rows of `data` whose `caracter` is not null.
# All needed columns, including the numeric user id (renamed to `user`), are
# taken in a single `.loc` selection. Adding the column to a filtered slice
# afterwards, as before, triggers pandas' SettingWithCopyWarning.
con_caracter = data['caracter'].notnull()
caracter = (
    data.loc[con_caracter, ['usuario', 'caracter', 'tiempoCaracter', 'falloCaracter', 'usuarioID']]
    .rename(columns={'usuarioID': 'user'})
    .groupby(['usuario', 'caracter'])
    .mean()
)
# The group mean of the user id is the label of each row; `pop` removes it
# from the frame so only the two feature columns remain.
targerCaracter = caracter.pop('user')
caracter
Out[121]:
In [122]:
# Per-(usuario, caracter) means of `tiempoCaracter` and `falloCaracter`, over
# the rows of `predict` whose `caracter` is not null.
# All needed columns, including the numeric user id (renamed to `user`), are
# taken in a single `.loc` selection. Adding the column to a filtered slice
# afterwards, as before, triggers pandas' SettingWithCopyWarning.
con_caracter_pred = predict['caracter'].notnull()
caracterPred = (
    predict.loc[con_caracter_pred, ['usuario', 'caracter', 'tiempoCaracter', 'falloCaracter', 'usuarioID']]
    .rename(columns={'usuarioID': 'user'})
    .groupby(['usuario', 'caracter'])
    .mean()
)
# The group mean of the user id is the label of each row; `pop` removes it
# from the frame so only the two feature columns remain.
targerCaracterPred = caracterPred.pop('user')
caracterPred
Out[122]:
In [342]:
# Mean `tiempoCaracter` per user over the rows whose `caracter` is null,
# exposed under the column name `tiempoEnter`.
sin_caracter = data['caracter'].isnull()
Enter = (
    data.loc[sin_caracter, ['usuario', 'tiempoCaracter']]
    .rename(columns={'tiempoCaracter': 'tiempoEnter'})
    .groupby('usuario')
    .mean()
)
Enter
Out[342]:
In [343]:
# User, word and word-level timing/size columns for the rows of `data` whose
# `caracter` is null.
columnas_palabra = ['usuario', 'palabra', 'tiempoPalabra', 'tiempoErrPalabra', 'tamPalabra']
usPalTiempo = data.loc[data['caracter'].isnull(), columnas_palabra]
usPalTiempo
Out[343]:
In [ ]:
# User, word and word-level timing/size columns for the rows of `predict`
# whose `caracter` is null.
columnas_palabra = ['usuario', 'palabra', 'tiempoPalabra', 'tiempoErrPalabra', 'tamPalabra']
usPalTiempoPred = predict.loc[predict['caracter'].isnull(), columnas_palabra]
usPalTiempoPred
In [370]:
# Sum of the `falloCaracter` flag for every (usuario, palabra) pair in `data`.
por_usuario_y_palabra = data.groupby(['usuario', 'palabra'])
falloCaracterPorPalabra = por_usuario_y_palabra['falloCaracter'].sum()
falloCaracterPorPalabra
Out[370]:
In [ ]:
# Sum of the `falloCaracter` flag for every (usuario, palabra) pair in `predict`.
por_usuario_y_palabra_pred = predict.groupby(['usuario', 'palabra'])
falloCaracterPorPalabraPred = por_usuario_y_palabra_pred['falloCaracter'].sum()
falloCaracterPorPalabraPred
In [366]:
# Summed `tiempoCaracter` per (usuario, palabra), restricted to the rows where
# `falloCaracter` is greater than zero.
con_fallo = data.loc[data['falloCaracter'] > 0]
tiempoCoreccionCaracter = con_fallo.groupby(['usuario', 'palabra'])['tiempoCaracter'].sum()
tiempoCoreccionCaracter
Out[366]:
In [372]:
# Rows with a positive `tiempoErrPalabra`, then narrowed to one specific word.
# NOTE(review): earlier in the file `palabra` is replaced by integer codes, so
# this string comparison only matches when the cell runs on the un-encoded
# column. Confirm the intended cell order.
dataFallo = data.loc[data['tiempoErrPalabra'] > 0]
dataFallo.loc[dataFallo['palabra'] == "PZKOFTLILILILI"]
Out[372]:
In [377]:
In [345]:
# Per-(usuario, palabra) means of the word timings, with the numeric user id
# carried along as `user` so that it can serve as the label.
tiempoMedioPalabra = (
    usPalTiempo.drop('tamPalabra', axis='columns')
    .assign(user=data['usuarioID'])
    .groupby(['usuario', 'palabra'])
    .mean()
)
# Add the per-word sum of character errors (aligned on the group index).
tiempoMedioPalabra['falloCaracterPorPalabra'] = falloCaracterPorPalabra
# Split the label off; the remaining columns are the features.
targetTM = tiempoMedioPalabra.pop('user')
tiempoMedioPalabra
Out[345]:
In [ ]:
# Per-(usuario, palabra) means of the word timings for `predict`, with the
# numeric user id carried along as `user` so that it can serve as the label.
tiempoMedioPalabraPred = usPalTiempoPred.drop(['tamPalabra'], axis=1)
tiempoMedioPalabraPred['user'] = predict['usuarioID']
tiempoMedioPalabraPred = tiempoMedioPalabraPred.groupby(['usuario','palabra']).mean()
# Add the per-word sum of character errors (aligned on the group index).
tiempoMedioPalabraPred['falloCaracterPorPalabra'] = falloCaracterPorPalabraPred
# The labels get their own name. Storing them in `targetTM`, as before,
# overwrote the training labels that the cross-validation on
# `tiempoMedioPalabra` reads.
targetTMPred = tiempoMedioPalabraPred['user']
tiempoMedioPalabraPred = tiempoMedioPalabraPred.drop(['user'], axis=1)
tiempoMedioPalabraPred
In [226]:
# Per-user means of the word-level columns, after discarding `palabra`.
sin_palabra = usPalTiempo.drop('palabra', axis='columns')
# Row-level user names, taken before the aggregation collapses them.
targetUS = sin_palabra['usuario']
usPalTiempo3 = sin_palabra.groupby('usuario').mean()
usPalTiempo3
Out[226]:
In [67]:
# Attach the per-user `tiempoEnter` mean as an extra column (aligned on the
# user index).
usPalTiempo3['tiempoEnter'] = Enter['tiempoEnter']
usPalTiempo3
Out[67]:
In [227]:
# Display the current contents of `data`.
data
Out[227]:
In [87]:
# Row-level labels: the numeric user id column of `data`.
target = data.loc[:, 'usuarioID']
target
Out[87]:
In [ ]:
# Row-level labels: the numeric user id column of `predict`.
targetPred = predict.loc[:, 'usuarioID']
targetPred
In [303]:
# Remove the user columns and three word-related columns from `data`.
columnas_descartadas = ['usuario', 'palabraLeida', 'numPalabra', 'tamPalabra', 'usuarioID']
data = data.drop(columnas_descartadas, axis='columns')
In [ ]:
# Remove the user columns and three word-related columns from `predict`.
columnas_descartadas = ['usuario', 'palabraLeida', 'numPalabra', 'tamPalabra', 'usuarioID']
predict = predict.drop(columnas_descartadas, axis='columns')
In [304]:
# 'palabra', (take a look at these:) 'falloCaracter' 'palabraCorrecta', 'hayErrPalabra'
# Display `data` after the column drop.
data
Out[304]:
In [211]:
# Word code, word timings and correctness flag for the rows with a positive
# `tiempoErrPalabra`.
columnas_tiempo = ['palabra', 'tiempoPalabra', 'tiempoErrPalabra', 'palabraCorrecta']
tiempoPorPalabra = data.loc[data['tiempoErrPalabra'] > 0, columnas_tiempo]
tiempoPorPalabra
Out[211]:
In [161]:
# Work on an independent (deep) copy so that `data` itself stays untouched.
data2 = data.copy(deep=True)
In [163]:
# Remove the two word-timing columns from the copy.
columnas_tiempo_palabra = ['tiempoPalabra', 'tiempoErrPalabra']
data2 = data2.drop(columnas_tiempo_palabra, axis='columns')
In [144]:
#data2["tiempoPalabra"] = data2["palabra"].map(tiempoPorPalabra)
In [165]:
# Display `data2`, the copy of `data` without the word-timing columns.
data2
Out[165]:
In [164]:
# Display `data` for comparison with `data2`.
data
Out[164]:
In [305]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# 5-fold cross-validation of a 101-tree random forest on the row-level frame.
# `random_state` is fixed so that the forest, and therefore the printed scores,
# are the same on every run.
random_forest = RandomForestClassifier(n_estimators=101, random_state=42)
scores = cross_val_score(random_forest, data, target, cv=5)
# Per-fold scores, then their average.
print(scores)
print(scores.mean())
In [306]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# 5-fold cross-validation of a linear-kernel support vector classifier (C=1)
# on the row-level frame. The estimator is stored under the name `svm`.
svm = SVC(kernel='linear', C=1)
scores = cross_val_score(estimator=svm, X=data, y=target, cv=5)
# Per-fold scores on the first line, their average on the second.
print(scores, scores.mean(), sep="\n")
datos originales
In [123]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a 100-estimator AdaBoost classifier on the
# row-level frame. `random_state` is fixed so that repeated runs give the same
# model and the same printed scores.
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(ada, data, target, cv=5)
# Per-fold scores, then their average.
print(scores)
print(scores.mean())
In [124]:
# 5-fold cross-validation of `ada` on the per-(usuario, palabra) averages.
scores = cross_val_score(estimator=ada, X=tiempoMedioPalabra, y=targetTM, cv=5)
# Per-fold scores on the first line, their average on the second.
print(scores, scores.mean(), sep="\n")
In [125]:
# 5-fold cross-validation of `ada` on the per-(usuario, caracter) averages.
scores = cross_val_score(estimator=ada, X=caracter, y=targerCaracter, cv=5)
# Per-fold scores on the first line, their average on the second.
print(scores, scores.mean(), sep="\n")
In [126]:
# Author's open question: not sure this is right, because `caracter` still
# carries the user (as part of its group index).
# Train `ada` on the per-character averages and their labels.
ada.fit(X=caracter, y=targerCaracter)
Out[126]:
In [133]:
# Predicted labels for the per-character averages of the prediction set.
pred = ada.predict(X=caracterPred)
pred
Out[133]:
In [134]:
from sklearn.metrics import accuracy_score

# Fraction of prediction-set rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=targerCaracterPred, y_pred=pred)
print(accuracy)
In [128]:
# Mean accuracy of the fitted `ada` model on the prediction set.
# `score` expects (X, y): the features to predict on and their true labels.
# The previous call passed the training features as X and the prediction
# features as y.
score = ada.score(caracterPred, targerCaracterPred)
score
In [130]:
# Summary statistics of the columns in `caracter`.
caracter.describe()
Out[130]:
In [131]:
# Display the per-character averages of the prediction set.
caracterPred
Out[131]:
In [132]:
# Display the per-character averages of the training set.
caracter
Out[132]: