Logistic Regression Demo

Shown the basic case and treatments for special cases:

  • imbalanced datasets
  • too many parameters (regularization)

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import metrics, cross_validation
from sklearn import datasets

In [6]:
# function to get data samples
def get_dataset(N_datapoints = 100000, class_ratio=0.5):
  num_observations_a = int(class_ratio * N_datapoints)
  num_observations_b = int((1 - class_ratio) * N_datapoints)
  np.random.seed(12)
  feature1 = np.random.multivariate_normal([0, 0], [[1, .75],[.75, 1]], num_observations_a)
  feature2 = np.random.multivariate_normal([1, 1], [[1, .75],[.75, 1]], num_observations_b)
  X = np.vstack((feature1, feature2)).astype(np.float32)
  y = np.hstack((np.zeros(num_observations_a), np.ones(num_observations_b)))
  y = np.reshape(y, (len(y), 1))
  data = np.concatenate((X, y), axis=1)
  df = pd.DataFrame(data, columns =['x1', 'x2', 'label'])
  return df

In [11]:
def get_dataset(N_datapoints = 100000, class_ratio=0.5, N_features_noise=0):
    num_observations_a = int(class_ratio * N_datapoints)
    num_observations_b = int((1 - class_ratio) * N_datapoints)
    np.random.seed(12)

    features = []
    feature1 = np.random.multivariate_normal([0, 0], [[1, .75],[.75, 1]], num_observations_a)
    feature2 = np.random.multivariate_normal([1, 1], [[1, .75],[.75, 1]], num_observations_b)
    # noise features of n datapoints
    features_noise = []
    num_observations_noise = 0
    noise_features = []
    for i in range(N_features_noise):
        num_observations_noise += N_datapoints
        noise_features.append(np.random.choice([0, 1], size=(num_observations_noise), p=[0.5, 0.5]))
        features_noise.append(np.random.multivariate_normal([0, 0], [[1, 0.],[0., 1]], N_datapoints))

    # collect all the features 
    features.extend([feature1])
    features.extend([feature2])
    features.extend(features_noise)

    X = np.vstack(features).astype(np.float32)
    y = np.hstack((
                 np.zeros(num_observations_a), 
                 np.ones(num_observations_b), 
                 noise_features
#                 np.random.choice([0, 1], size=(num_observations_noise), p=[0.5, 0.5])
                ))
    y = np.reshape(y, (len(y), 1))
    data = np.concatenate((X, y), axis=1)
    col_names = []
    col_names.extend(['x1'])
    col_names.extend(['x2'])
    for i in range(N_features_noise):
        col_names.extend(['x_noise_'+str(i)])
    col_names.extend(['label'])  
    print col_names
    print data.shape
    
    df = pd.DataFrame(data, columns=col_names)
    return df

In [12]:
df = get_dataset(N_datapoints = 100000, class_ratio=0.5, N_features_noise=10)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-12-5a88127ddadd> in <module>()
----> 1 df = get_dataset(N_datapoints = 100000, class_ratio=0.5, N_features_noise=10)

<ipython-input-11-b5a835e5a461> in get_dataset(N_datapoints, class_ratio, N_features_noise)
     29                 ))
     30     y = np.reshape(y, (len(y), 1))
---> 31     data = np.concatenate((X, y), axis=1)
     32     col_names = []
     33     col_names.extend(['x1'])

ValueError: all the input array dimensions except for the concatenation axis must match exactly

Scenario 1) Basic Case


In [5]:
# Not imbalanaced datasets (both classes same number of rows)
df = get_dataset(class_ratio=0.5)
fig, ax = plt.subplots()
plt.scatter(df['x1'].values, df['x2'].values, c=df['label'].values, alpha = .2)
display(fig)



In [6]:
predicted = cross_validation.cross_val_predict(LogisticRegression(), df[['x1', 'x2']], df['label'], cv=10)
print metrics.accuracy_score(df['label'], predicted)


0.70376

Scenario 2) Imbalanced Dataset


In [20]:
# Very imbalanced dataset (e.g. a study of fraud data)
df = get_dataset(class_ratio=0.98)
fig, ax = plt.subplots()
plt.scatter(df['x1'].values,df['x2'].values, c=df['label'].values, alpha = .2)
display(fig)


=> without any correction


In [21]:
predicted = cross_validation.cross_val_predict(LogisticRegression(), df[['x1', 'x2']], df['label'], cv=10)
print metrics.accuracy_score(df['label'], predicted)


0.98

!!! so 98% precision... as the input data..

=> with correction


In [22]:
# we correct for the imbalanced using the argument class_weight='balanced'
predicted = cross_validation.cross_val_predict(LogisticRegression(class_weight ='balanced'), df[['x1', 'x2']], df['label'], cv=10)
print metrics.accuracy_score(df['label'], predicted)


0.7086

Scenario 3) Too many (unrelated) features


In [56]:
df = get_dataset(N_datapoints = 100000, class_ratio=0.5, N_features_noise=10)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-56-5a88127ddadd> in <module>()
----> 1 df = get_dataset(N_datapoints = 100000, class_ratio=0.5, N_features_noise=10)

<ipython-input-53-e65d77b2c413> in get_dataset(N_datapoints, class_ratio, N_features_noise)
     33         col_names.extend('x_noise_'+str(i))
     34     col_names.extend('label')
---> 35     df = pd.DataFrame(data, columns =col_names)
     36     return df

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
    295             else:
    296                 mgr = self._init_ndarray(data, index, columns, dtype=dtype,
--> 297                                          copy=copy)
    298         elif isinstance(data, (list, types.GeneratorType)):
    299             if isinstance(data, types.GeneratorType):

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _init_ndarray(self, values, index, columns, dtype, copy)
    472             values = _possibly_infer_to_datetimelike(values)
    473 
--> 474         return create_block_manager_from_blocks([values], [columns, index])
    475 
    476     @property

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in create_block_manager_from_blocks(blocks, axes)
   4254         blocks = [getattr(b, 'values', b) for b in blocks]
   4255         tot_items = sum(b.shape[0] for b in blocks)
-> 4256         construction_error(tot_items, blocks[0].shape[1:], axes, e)
   4257 
   4258 

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in construction_error(tot_items, block_shape, axes, e)
   4231         raise ValueError("Empty data passed with indices specified.")
   4232     raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
-> 4233         passed, implied))
   4234 
   4235 

ValueError: Shape of passed values is (3, 1100000), indices imply (99, 1100000)

In [55]:
df_pl =df.sample(frac=0.01)
fig, ax = plt.subplots(1, 3, figsize=(16, 4))
df_pl.plot.scatter('x1', 'x2',c='label', ax=ax[0])
df_pl.plot.scatter('x1', 'x_noise_1',c='label', ax=ax[1])
df_pl.plot.scatter('x_noise_1', 'x_noise_2',c='label', ax=ax[2])


# ax[0].scatter(df_pl['x1'].values,df['x2'].values, c=df_pl['label'].values, alpha = .2)
# ax[1].scatter(df_pl['x1'].values,df['x3'].values, c=df_pl['label'].values, alpha = .2)
# ax[2].scatter(df_pl['x1'].values,df['x3'].values, c=df_pl['label'].values, alpha = .2)
display(fig)


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-55-46fb07642696> in <module>()
      2 fig, ax = plt.subplots(1, 3, figsize=(16, 4))
      3 df_pl.plot.scatter('x1', 'x2',c='label', ax=ax[0])
----> 4 df_pl.plot.scatter('x1', 'x_noise_1',c='label', ax=ax[1])
      5 df_pl.plot.scatter('x_noise_1', 'x_noise_2',c='label', ax=ax[2])
      6 

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/pandas/tools/plotting.pyc in scatter(self, x, y, s, c, **kwds)
   3950         axes : matplotlib.AxesSubplot or np.array of them
   3951         """
-> 3952         return self(kind='scatter', x=x, y=y, c=c, s=s, **kwds)
   3953 
   3954     def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None,

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/pandas/tools/plotting.pyc in __call__(self, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
   3772                           fontsize=fontsize, colormap=colormap, table=table,
   3773                           yerr=yerr, xerr=xerr, secondary_y=secondary_y,
-> 3774                           sort_columns=sort_columns, **kwds)
   3775     __call__.__doc__ = plot_frame.__doc__
   3776 

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/pandas/tools/plotting.pyc in plot_frame(data, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
   2641                  yerr=yerr, xerr=xerr,
   2642                  secondary_y=secondary_y, sort_columns=sort_columns,
-> 2643                  **kwds)
   2644 
   2645 

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/pandas/tools/plotting.pyc in _plot(data, x, y, subplots, ax, kind, **kwds)
   2468         plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
   2469 
-> 2470     plot_obj.generate()
   2471     plot_obj.draw()
   2472     return plot_obj.result

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/pandas/tools/plotting.pyc in generate(self)
   1041         self._compute_plot_data()
   1042         self._setup_subplots()
-> 1043         self._make_plot()
   1044         self._add_table()
   1045         self._make_legend()

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/pandas/tools/plotting.pyc in _make_plot(self)
   1617         else:
   1618             label = None
-> 1619         scatter = ax.scatter(data[x].values, data[y].values, c=c_values,
   1620                              label=label, cmap=cmap, **self.kwds)
   1621         if cb:

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1384         res = cache.get(item)
   1385         if res is None:
-> 1386             values = self._data.get(item)
   1387             res = self._box_item_values(item, values)
   1388             cache[item] = res

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   3541 
   3542             if not isnull(item):
-> 3543                 loc = self.items.get_loc(item)
   3544             else:
   3545                 indexer = np.arange(len(self.items))[isnull(self.items)]

/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/pandas/indexes/base.pyc in get_loc(self, key, method, tolerance)
   2134                 return self._engine.get_loc(key)
   2135             except KeyError:
-> 2136                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2137 
   2138         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()

KeyError: 'x_noise_1'

In [ ]: