Análisis aplicación climMAPcore

Mediante el uso del lenguaje de programación Python se corroborarra la información que genera la aplicación climMAPcore para el Estado de Aguascalientes

Regresión Lineal



In [2]:

    
# librerias
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression



In [5]:

    
# leer el csv
data = pd.read_csv('../../data/db_join_wrf_tpro_10_25_tmid_10_20.csv')



In [6]:

    
# estructura del dataFrame
data.head()









    Out[6]:







  
    
      
      lat
      long
      problem
      incidencia
      anio
      mes
      dia
      ciclo
      prec
      tmax
      ...
      tpro
      velv
      dirv
      humr
      dpoint
      tmidnight
      condiciones
      tipo
      indicePresencia
      porcentajePresencia
    
  
  
    
      0
      27.344400
      -109.934357
      Roya Lineal
      0.01
      2014
      3
      17
      Ciclo 2013-2014
      0.0
      28.873
      ...
      18.8640
      6.24163
      309.96953
      30.4142
      1.033681
      18.136
      0
      N
      3.0
      0.75
    
    
      1
      27.263716
      -109.847138
      Roya Lineal
      0.01
      2014
      2
      20
      Ciclo 2013-2014
      0.0
      27.786
      ...
      20.0520
      2.30557
      37.89133
      46.7154
      8.250035
      13.019
      1
      N
      4.0
      1.00
    
    
      2
      27.225971
      -109.855972
      Roya Lineal
      0.01
      2014
      2
      27
      Ciclo 2013-2014
      0.0
      27.727
      ...
      19.0616
      6.98013
      312.18672
      59.7521
      11.009703
      14.826
      1
      N
      4.0
      1.00
    
    
      3
      27.198668
      -109.890648
      Roya Lineal
      0.04
      2014
      2
      4
      Ciclo 2013-2014
      0.0
      21.319
      ...
      14.0539
      5.99438
      340.75799
      62.9675
      7.051250
      13.144
      1
      N
      2.0
      0.50
    
    
      4
      27.474041
      -110.116669
      Roya Lineal
      0.01
      2014
      3
      4
      Ciclo 2013-2014
      0.0
      27.244
      ...
      18.7576
      6.22889
      273.46531
      63.5817
      11.664811
      14.686
      1
      N
      4.0
      1.00
    
  

5 rows × 21 columns



In [9]:

    
# columnas del dataframe
data.columns









    Out[9]:





Index(['lat', 'long', 'problem', 'incidencia', 'anio', 'mes', 'dia', 'ciclo',
       'prec', 'tmax', 'tmin', 'tpro', 'velv', 'dirv', 'humr', 'dpoint',
       'tmidnight', 'condiciones', 'tipo', 'indicePresencia',
       'porcentajePresencia'],
      dtype='object')



In [10]:

    
# información del dataFrame
data.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 743 entries, 0 to 742
Data columns (total 21 columns):
lat                    743 non-null float64
long                   743 non-null float64
problem                743 non-null object
incidencia             743 non-null float64
anio                   743 non-null int64
mes                    743 non-null int64
dia                    743 non-null int64
ciclo                  743 non-null object
prec                   743 non-null float64
tmax                   743 non-null float64
tmin                   743 non-null float64
tpro                   743 non-null float64
velv                   743 non-null float64
dirv                   743 non-null float64
humr                   743 non-null float64
dpoint                 743 non-null float64
tmidnight              743 non-null float64
condiciones            743 non-null int64
tipo                   743 non-null object
indicePresencia        743 non-null float64
porcentajePresencia    743 non-null float64
dtypes: float64(14), int64(4), object(3)
memory usage: 122.0+ KB



In [15]:

    
# utilizar solo las columnas con datos 
data = data[['incidencia', 'prec', 'tmax', 'tmin', 'tpro', 'humr', 'dpoint','tmidnight','indicePresencia','porcentajePresencia']]



In [17]:

    
# checar estructura del dataFrame
data.head()









    Out[17]:







  
    
      
      incidencia
      prec
      tmax
      tmin
      tpro
      humr
      dpoint
      tmidnight
      indicePresencia
      porcentajePresencia
    
  
  
    
      0
      0.01
      0.0
      28.873
      10.737
      18.8640
      30.4142
      1.033681
      18.136
      3.0
      0.75
    
    
      1
      0.01
      0.0
      27.786
      14.767
      20.0520
      46.7154
      8.250035
      13.019
      4.0
      1.00
    
    
      2
      0.01
      0.0
      27.727
      12.901
      19.0616
      59.7521
      11.009703
      14.826
      4.0
      1.00
    
    
      3
      0.04
      0.0
      21.319
      8.175
      14.0539
      62.9675
      7.051250
      13.144
      2.0
      0.50
    
    
      4
      0.01
      0.0
      27.244
      12.558
      18.7576
      63.5817
      11.664811
      14.686
      4.0
      1.00



In [18]:

    
rl = LinearRegression()
rl.fit(data, data['incidencia'])









    Out[18]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [19]:

    
# lista de coeficientes B para cada X
list(zip(data.columns, rl.coef_))









    Out[19]:





[('incidencia', 0.99999999999999933),
 ('prec', 1.4195794116328702e-16),
 ('tmax', 3.6645560094951169e-15),
 ('tmin', -3.7081788180924425e-15),
 ('tpro', -2.9153180745755238e-17),
 ('humr', -5.0822663501291872e-17),
 ('dpoint', 1.8689240692079128e-16),
 ('tmidnight', -3.7417229114604591e-15),
 ('indicePresencia', -8.7861408931829768e-17),
 ('porcentajePresencia', -2.4652326877976767e-17)]



In [21]:

    
# generando las predicciones
predicciones = rl.predict(data)
predicciones_df = pd.DataFrame(predicciones, columns=['Pred'])
# predicciones de las primeras 5 líneas
predicciones_df.head()



In [22]:

    
# calculando el desvio
np.mean(data['incidencia'] - predicciones)









    Out[22]:





7.111665824798592e-18

Regresión Logística



In [33]:

    
# creando un dataset de ejemplo
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=4)



In [34]:

    
# Importando el modelo
from sklearn.linear_model import LogisticRegression

rlog = LogisticRegression() # Creando el modelo



In [35]:

    
# Dividiendo el dataset en entrenamiento y evaluacion
X_entrenamiento = X[:-200]
X_evaluacion = X[-200:]
y_entrenamiento = y[:-200]
y_evaluacion = y[-200:]
rlog.fit(X_entrenamiento, y_entrenamiento) #ajustando el modelo









    Out[35]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [36]:

    
# Realizando las predicciones
y_predic_entrenamiento = rlog.predict(X_entrenamiento) 
y_predic_evaluacion = rlog.predict(X_evaluacion)



In [37]:

    
# Verificando la exactitud del modelo
entrenamiento = (y_predic_entrenamiento == y_entrenamiento).sum().astype(float) / y_entrenamiento.shape[0]
print("sobre datos de entrenamiento: {0:.2f}".format(entrenamiento))
evaluacion = (y_predic_evaluacion == y_evaluacion).sum().astype(float) / y_evaluacion.shape[0]
print("sobre datos de evaluación: {0:.2f}".format(evaluacion))









    



sobre datos de entrenamiento: 0.92
sobre datos de evaluación: 0.92

Arboles de decisión



In [47]:

    
# Creando un dataset de ejemplo
X, y = make_classification(1000, 20, n_informative=3)



In [48]:

    
# Importando el arbol de decisión
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree



In [49]:

    
ad = DecisionTreeClassifier(criterion='entropy', max_depth=3) # Creando el modelo
ad.fit(X, y) # Ajustando el modelo









    Out[49]:





DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')



In [50]:

    
#generando archivo para graficar el arbol
with open("mi_arbol.dot", 'w') as archivo_dot:
    tree.export_graphviz(ad, out_file = archivo_dot)



In [51]:

    
# utilizando el lenguaje dot para graficar el arbol.
!dot -Tjpeg mi_arbol.dot -o arbol_decision.jpeg



In [56]:

    
# verificando la precisión
print("precisión del modelo: {0: .2f}".format((y == ad.predict(X)).mean()))









    



precisión del modelo:  0.84

Random Forest



In [60]:

    
# Creando un dataset de ejemplo
X, y = make_classification(1000)



In [61]:

    
# Importando el random forest
from sklearn.ensemble import RandomForestClassifier



In [62]:

    
rf = RandomForestClassifier() # Creando el modelo
rf.fit(X, y) # Ajustando el modelo









    Out[62]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)



In [63]:

    
# verificando la precisión
print("precisión del modelo: {0: .2f}".format((y == rf.predict(X)).mean()))









    



precisión del modelo:  1.00

validación climMAPcore



In [88]:

    
# determinar los valores
X = data.iloc[:, 1:]
y = data['incidencia']



In [89]:

    
# Importando el random forest
from sklearn.ensemble import RandomForestClassifier



In [94]:

    
rf = RandomForestClassifier() # Creando el modelo
rf.fit(X,y)









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-94-6209c0fe50f9> in <module>()
      1 rf = RandomForestClassifier() # Creando el modelo
----> 2 rf.fit(X,y)

~/anaconda/lib/python3.6/site-packages/sklearn/ensemble/forest.py in fit(self, X, y, sample_weight)
    270         self.n_outputs_ = y.shape[1]
    271 
--> 272         y, expanded_class_weight = self._validate_y_class_weight(y)
    273 
    274         if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:

~/anaconda/lib/python3.6/site-packages/sklearn/ensemble/forest.py in _validate_y_class_weight(self, y)
    467 
    468     def _validate_y_class_weight(self, y):
--> 469         check_classification_targets(y)
    470 
    471         y = np.copy(y)

~/anaconda/lib/python3.6/site-packages/sklearn/utils/multiclass.py in check_classification_targets(y)
    170     if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
    171                       'multilabel-indicator', 'multilabel-sequences']:
--> 172         raise ValueError("Unknown label type: %r" % y_type)
    173 
    174 

ValueError: Unknown label type: 'continuous'



In [93]:

    
# verificando la precisión
print("precisión del modelo: {0: .2f}".format((y == rf.predict(X)).mean()))









    



---------------------------------------------------------------------------
NotFittedError                            Traceback (most recent call last)
<ipython-input-93-133db9c332bc> in <module>()
      1 # verificando la precisión
----> 2 print("precisión del modelo: {0: .2f}".format((y == rf.predict(X)).mean()))

~/anaconda/lib/python3.6/site-packages/sklearn/ensemble/forest.py in predict(self, X)
    534             The predicted classes.
    535         """
--> 536         proba = self.predict_proba(X)
    537 
    538         if self.n_outputs_ == 1:

~/anaconda/lib/python3.6/site-packages/sklearn/ensemble/forest.py in predict_proba(self, X)
    572             classes corresponds to that in the attribute `classes_`.
    573         """
--> 574         check_is_fitted(self, 'estimators_')
    575         # Check data
    576         X = self._validate_X_predict(X)

~/anaconda/lib/python3.6/site-packages/sklearn/utils/validation.py in check_is_fitted(estimator, attributes, msg, all_or_any)
    735 
    736     if not all_or_any([hasattr(estimator, attr) for attr in attributes]):
--> 737         raise NotFittedError(msg % {'name': type(estimator).__name__})
    738 
    739 

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

SVM o Máquinas de vectores de soporte



In [81]:

    
# importanto SVM
from sklearn import svm
from sklearn.svm import SVC



In [82]:

    
# determinar los valores a utilizar
data.columns









    Out[82]:





Index(['incidencia', 'prec', 'tmax', 'tmin', 'tpro', 'humr', 'dpoint',
       'tmidnight', 'indicePresencia', 'porcentajePresencia'],
      dtype='object')



In [83]:

    
# determinar la información
X = data.iloc[:, 1:]
y = data['incidencia']



In [84]:

    
# tamaño de la malla del gráfico
h = 0.02



In [86]:

    
# creando el SVM con sus diferentes métodos
C = 1.0  # parametro de regulacion SVM 
svc = svm.SVC(kernel='linear', C=C).fit(X, y)
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X, y)
poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X, y)
lin_svc = svm.LinearSVC(C=C).fit(X, y)









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-86-4cbb6696e2cb> in <module>()
      1 # creando el SVM con sus diferentes métodos
      2 C = 1.0  # parametro de regulacion SVM
----> 3 svc = svm.SVC(kernel='linear', C=C).fit(X, y)
      4 rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X, y)
      5 poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X, y)

~/anaconda/lib/python3.6/site-packages/sklearn/svm/base.py in fit(self, X, y, sample_weight)
    148 
    149         X, y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr')
--> 150         y = self._validate_targets(y)
    151 
    152         sample_weight = np.asarray([]

~/anaconda/lib/python3.6/site-packages/sklearn/svm/base.py in _validate_targets(self, y)
    498     def _validate_targets(self, y):
    499         y_ = column_or_1d(y, warn=True)
--> 500         check_classification_targets(y)
    501         cls, y = np.unique(y_, return_inverse=True)
    502         self.class_weight_ = compute_class_weight(self.class_weight, cls, y_)

~/anaconda/lib/python3.6/site-packages/sklearn/utils/multiclass.py in check_classification_targets(y)
    170     if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
    171                       'multilabel-indicator', 'multilabel-sequences']:
--> 172         raise ValueError("Unknown label type: %r" % y_type)
    173 
    174 

ValueError: Unknown label type: 'continuous'



In [ ]:

	lat	long	problem	incidencia	anio	mes	dia	ciclo	tmax	...	tpro	velv	dirv	humr	dpoint	tmidnight	condiciones	tipo	indicePresencia	porcentajePresencia
0	27.344400	-109.934357	Roya Lineal	0.01	2014	3	17	Ciclo 2013-2014	28.873	...	18.8640	6.24163	309.96953	30.4142	1.033681	18.136	0	N	3.0	0.75
1	27.263716	-109.847138	Roya Lineal	0.01	2014	2	20	Ciclo 2013-2014	27.786	...	20.0520	2.30557	37.89133	46.7154	8.250035	13.019	1	N	4.0	1.00
2	27.225971	-109.855972	Roya Lineal	0.01	2014	2	27	Ciclo 2013-2014	27.727	...	19.0616	6.98013	312.18672	59.7521	11.009703	14.826	1	N	4.0	1.00
3	27.198668	-109.890648	Roya Lineal	0.04	2014	2	4	Ciclo 2013-2014	21.319	...	14.0539	5.99438	340.75799	62.9675	7.051250	13.144	1	N	2.0	0.50
4	27.474041	-110.116669	Roya Lineal	0.01	2014	3	4	Ciclo 2013-2014	27.244	...	18.7576	6.22889	273.46531	63.5817	11.664811	14.686	1	N	4.0	1.00

	incidencia	tmax	tmin	tpro	humr	dpoint	tmidnight	indicePresencia	porcentajePresencia
0	0.01	28.873	10.737	18.8640	30.4142	1.033681	18.136	3.0	0.75
1	0.01	27.786	14.767	20.0520	46.7154	8.250035	13.019	4.0	1.00
2	0.01	27.727	12.901	19.0616	59.7521	11.009703	14.826	4.0	1.00
3	0.04	21.319	8.175	14.0539	62.9675	7.051250	13.144	2.0	0.50
4	0.01	27.244	12.558	18.7576	63.5817	11.664811	14.686	4.0	1.00