In [1]:
import numpy as np
import pandas as pd
In [2]:
# Load the dataset stored under tests/bugs/issue_22. The file is a
# gzip-compressed CSV; pandas infers the compression from the ".gz" suffix.
df = pd.read_csv(
    '../tests/bugs/issue_22/SCIKIT_MODEL_FIT_FAILURE.csv.gz'
)
In [3]:
# Preview the first five rows (5 is the default row count of head()).
df.head(n=5)
Out[3]:
Unnamed: 0
0
1
2
3
4
5
6
7
8
...
904
905
906
907
908
909
910
911
912
TGT
0
0
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
...
0.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
-141279.015625
1
1
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
...
0.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
-141276.625000
2
2
-141276.625000
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
...
0.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
-141272.812500
3
3
-141272.812500
-141276.625000
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
...
0.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
-141267.812500
4
4
-141267.812500
-141272.812500
-141276.625000
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
...
0.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
-141261.218750
5 rows × 915 columns
In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()
Out[4]:
Unnamed: 0
0
1
2
3
4
5
6
7
8
...
904
905
906
907
908
909
910
911
912
TGT
count
800.0000
800.000000
800.000000
800.000000
800.000000
800.000000
800.000000
800.000000
800.000000
800.000000
...
800.000000
800.000000
800.000000
800.000000
800.000000
800.000000
800.000000
800.000000
800.000000
800.000000
mean
399.5000
-528.537519
-1055.755781
-1581.656035
-2106.239765
-2629.508769
-3151.464257
-3672.107988
-4191.441992
-4709.467754
...
0.030000
0.955000
0.045000
0.978750
0.021250
0.977500
0.022500
0.981250
0.018750
0.000743
std
231.0844
125987.157626
125691.838195
125396.566851
125101.346122
124806.177829
124511.065101
124216.009835
123921.013305
123626.078004
...
0.170694
0.207434
0.207434
0.144307
0.144307
0.148396
0.148396
0.135726
0.135726
126282.523743
min
0.0000
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
-141279.015625
...
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
-141279.015625
25%
199.7500
-114838.421875
-115102.021484
-115364.121094
-115624.972656
-115884.519531
-116142.968750
-116399.972656
-116655.470703
-116909.720703
...
0.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
-114573.373047
50%
399.5000
-35728.421875
-36255.621094
-36781.419922
-37305.919922
-37829.220703
-38351.320312
-38871.919922
-39391.121094
-39909.021484
...
0.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
-35199.720703
75%
599.2500
96049.578125
95258.779297
94469.277344
93681.128906
92894.082031
92108.328125
91324.128906
90541.228516
89759.681641
...
0.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
1.000000
0.000000
96841.927734
max
799.0000
280495.593750
279441.187500
278387.968750
277336.187500
276285.375000
275235.968750
274188.187500
273141.593750
272096.375000
...
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
1.000000
281551.593750
8 rows × 915 columns
In [5]:
from sklearn import linear_model

# Ridge regression with its default settings; alpha=1.0 is the default
# regularization strength (it is the value echoed in the fitted model's repr).
lScikitModel = linear_model.Ridge(alpha=1.0)
In [6]:
# Column layout of df: the first column ('Unnamed: 0') is skipped,
# the last column ('TGT') is the regression target, and everything in
# between is used as the feature matrix.
feature_columns = df.columns[1:-1]
target_column = df.columns[-1]

X = df[feature_columns].values
y = df[target_column].values
In [7]:
# Fit with the default solver ('auto'); on this data the run emits the
# "Singular matrix in solving dual problem" UserWarning.
lScikitModel.fit(X, y)
/usr/lib/python3/dist-packages/sklearn/linear_model/ridge.py:154: UserWarning: Singular matrix in solving dual problem. Using least-squares solution instead.
warnings.warn("Singular matrix in solving dual problem. Using "
Out[7]:
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
normalize=False, random_state=None, solver='auto', tol=0.001)
In [8]:
# Same model, but with the solver forced to 'svd' instead of 'auto':
# with this solver the fit completes without the singular-matrix warning.
# Solver options are described in the scikit-learn documentation:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html
lScikitModel2 = linear_model.Ridge(alpha=1.0, solver='svd')
lScikitModel2.fit(X, y)
Out[8]:
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
normalize=False, random_state=None, solver='svd', tol=0.001)
In [9]:
# rowvar=True is NumPy's default: every row of X is treated as a variable,
# so the result holds the pairwise correlations between rows (samples) of X,
# not between its feature columns.
np.corrcoef(X, rowvar=True)
Out[9]:
array([[ 1. , 1. , 1. , ..., -0.9025477 ,
-0.90322368, -0.90389237],
[ 1. , 1. , 1. , ..., -0.9025477 ,
-0.90322368, -0.90389237],
[ 1. , 1. , 1. , ..., -0.90254764,
-0.90322361, -0.90389231],
...,
[-0.9025477 , -0.9025477 , -0.90254764, ..., 1. ,
0.99999876, 0.99999509],
[-0.90322368, -0.90322368, -0.90322361, ..., 0.99999876,
1. , 0.99999878],
[-0.90389237, -0.90389237, -0.90389231, ..., 0.99999509,
0.99999878, 1. ]])
In [ ]:
Content source: antoinecarme/pyaf
Similar notebooks: