In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the dataset stored under tests/bugs/issue_22 (per its file name, the
# data that triggers the scikit-learn model fit failure). No compression
# argument is passed, so pandas is left to handle the .gz file itself.
df = pd.read_csv(
    filepath_or_buffer='../tests/bugs/issue_22/SCIKIT_MODEL_FIT_FAILURE.csv.gz',
)

In [3]:
# Peek at the first five rows to check the column layout.
df.head(n=5)


Out[3]:
Unnamed: 0 0 1 2 3 4 5 6 7 8 ... 904 905 906 907 908 909 910 911 912 TGT
0 0 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 ... 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 -141279.015625
1 1 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 ... 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 -141276.625000
2 2 -141276.625000 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 ... 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 -141272.812500
3 3 -141272.812500 -141276.625000 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 ... 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 -141267.812500
4 4 -141267.812500 -141272.812500 -141276.625000 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 ... 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 -141261.218750

5 rows × 915 columns


In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe(percentiles=[0.25, 0.5, 0.75])


Out[4]:
Unnamed: 0 0 1 2 3 4 5 6 7 8 ... 904 905 906 907 908 909 910 911 912 TGT
count 800.0000 800.000000 800.000000 800.000000 800.000000 800.000000 800.000000 800.000000 800.000000 800.000000 ... 800.000000 800.000000 800.000000 800.000000 800.000000 800.000000 800.000000 800.000000 800.000000 800.000000
mean 399.5000 -528.537519 -1055.755781 -1581.656035 -2106.239765 -2629.508769 -3151.464257 -3672.107988 -4191.441992 -4709.467754 ... 0.030000 0.955000 0.045000 0.978750 0.021250 0.977500 0.022500 0.981250 0.018750 0.000743
std 231.0844 125987.157626 125691.838195 125396.566851 125101.346122 124806.177829 124511.065101 124216.009835 123921.013305 123626.078004 ... 0.170694 0.207434 0.207434 0.144307 0.144307 0.148396 0.148396 0.135726 0.135726 126282.523743
min 0.0000 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 -141279.015625 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -141279.015625
25% 199.7500 -114838.421875 -115102.021484 -115364.121094 -115624.972656 -115884.519531 -116142.968750 -116399.972656 -116655.470703 -116909.720703 ... 0.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 -114573.373047
50% 399.5000 -35728.421875 -36255.621094 -36781.419922 -37305.919922 -37829.220703 -38351.320312 -38871.919922 -39391.121094 -39909.021484 ... 0.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 -35199.720703
75% 599.2500 96049.578125 95258.779297 94469.277344 93681.128906 92894.082031 92108.328125 91324.128906 90541.228516 89759.681641 ... 0.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 96841.927734
max 799.0000 280495.593750 279441.187500 278387.968750 277336.187500 276285.375000 275235.968750 274188.187500 273141.593750 272096.375000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 281551.593750

8 rows × 915 columns


In [5]:
from sklearn import linear_model

# Ridge regression estimator built with no arguments, i.e. with
# scikit-learn's default hyper-parameters.
lScikitModel = linear_model.Ridge()

In [6]:
# Feature matrix: every column except the first one and the last one.
X = df.iloc[:, 1:-1].values
# Target vector: the last column of the frame.
y = df.iloc[:, -1].values

In [7]:
# Fit with the default solver. On this data scikit-learn emits a
# "Singular matrix in solving dual problem" UserWarning during the fit.
lScikitModel.fit(X=X, y=y)


/usr/lib/python3/dist-packages/sklearn/linear_model/ridge.py:154: UserWarning: Singular matrix in solving dual problem. Using least-squares solution instead.
  warnings.warn("Singular matrix in solving dual problem. Using "
Out[7]:
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [8]:
# Change the solver: forcing solver='svd' fits the same data without the
# singular-matrix warning.
# According to the Ridge documentation:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html

lScikitModel2 = linear_model.Ridge(solver='svd')
lScikitModel2.fit(X, y)


Out[8]:
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='svd', tol=0.001)

In [9]:
# Correlation matrix of X with rowvar=True (NumPy's default): each ROW of X
# is treated as a variable, so this correlates rows with one another rather
# than feature columns.
# NOTE(review): if the goal is to inspect collinearity between features,
# rowvar=False would be needed instead -- confirm the intent.
np.corrcoef(X, rowvar=True)


Out[9]:
array([[ 1.        ,  1.        ,  1.        , ..., -0.9025477 ,
        -0.90322368, -0.90389237],
       [ 1.        ,  1.        ,  1.        , ..., -0.9025477 ,
        -0.90322368, -0.90389237],
       [ 1.        ,  1.        ,  1.        , ..., -0.90254764,
        -0.90322361, -0.90389231],
       ..., 
       [-0.9025477 , -0.9025477 , -0.90254764, ...,  1.        ,
         0.99999876,  0.99999509],
       [-0.90322368, -0.90322368, -0.90322361, ...,  0.99999876,
         1.        ,  0.99999878],
       [-0.90389237, -0.90389237, -0.90389231, ...,  0.99999509,
         0.99999878,  1.        ]])

In [ ]: