This script generates profiles of pandas dataframes using the Pandas-Profiling library


In [1]:
import pandas_profiling
import pandas as pd


/usr/local/lib/python3.5/dist-packages/matplotlib/__init__.py:1401: UserWarning:  This call to matplotlib.use() has no effect
because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

  warnings.warn(_use_error_msg)

Generate profile just for the properties


In [2]:
props = pd.read_csv('../data/processed_properties.csv', 
                    header=0, index_col=0)
props = props.drop_duplicates()
props = props.dropna()
props.info()
props_profile = pandas_profiling.ProfileReport(props)
props_profile.to_file('props_profile.html')


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1828 entries, 0 to 1827
Data columns (total 37 columns):
rows                  1828 non-null int64
cols                  1828 non-null int64
min_nnz_row           1828 non-null int64
row_var               1828 non-null float64
col_var               1828 non-null float64
diag_var              1828 non-null float64
nnz                   1828 non-null int64
frob_norm             1828 non-null float64
symm_frob_norm        1828 non-null float64
antisymm_frob_norm    1828 non-null float64
one_norm              1828 non-null float64
inf_norm              1828 non-null float64
symm_inf_norm         1828 non-null float64
antisymm_inf_norm     1828 non-null float64
max_nnz_row           1828 non-null int64
trace                 1828 non-null float64
abs_trace             1828 non-null float64
min_nnz_row.1         1828 non-null int64
avg_nnz_row           1828 non-null int64
dummy_rows            1828 non-null int64
dummy_rows_kind       1828 non-null int64
num_value_symm_1      1828 non-null int64
nnz_pattern_symm_1    1828 non-null int64
num_value_symm_2      1828 non-null float64
nnz_pattern_symm_2    1828 non-null float64
row_diag_dom          1828 non-null int64
col_diag_dom          1828 non-null int64
diag_avg              1828 non-null float64
diag_sign             1828 non-null int64
diag_nnz              1828 non-null int64
lower_bw              1828 non-null int64
upper_bw              1828 non-null int64
row_log_val_spread    1828 non-null float64
col_log_val_spread    1828 non-null float64
symm                  1828 non-null int64
matrix                1828 non-null object
matrix_id             1828 non-null int64
dtypes: float64(17), int64(19), object(1)
memory usage: 542.7+ KB
/usr/local/lib/python3.5/dist-packages/pandas_profiling/base.py:59: RuntimeWarning: overflow encountered in long_scalars
  stats['range'] = stats['max'] - stats['min']

Generate profiles for the individual systems+properties


In [3]:
comet = pd.read_csv('../data/comet/comet_unprocessed_timings.csv', 
                    header=0, index_col=0)
comet = comet.drop_duplicates()
comet = comet.dropna()
comet.info()
comet_merged = pd.merge(comet, props, on='matrix')
comet_profile = pandas_profiling.ProfileReport(comet_merged)
comet_profile.to_file('comet_unprocessed_timings_profile.html')


<class 'pandas.core.frame.DataFrame'>
Int64Index: 96516 entries, 0 to 33593
Data columns (total 16 columns):
system         96516 non-null object
numprocs       96516 non-null int64
matrix         96516 non-null object
solver         96516 non-null object
prec           96516 non-null object
status         96516 non-null object
time           96516 non-null float64
iters          96516 non-null float64
resid          96516 non-null float64
system_id      96516 non-null int64
solver_id      96516 non-null int64
prec_id        96516 non-null int64
status_id      96516 non-null int64
good_or_bad    96516 non-null int64
new_time       96516 non-null float64
matrix_id      96516 non-null int64
dtypes: float64(4), int64(7), object(5)
memory usage: 12.5+ MB
/usr/local/lib/python3.5/dist-packages/pandas_profiling/base.py:59: RuntimeWarning: overflow encountered in long_scalars
  stats['range'] = stats['max'] - stats['min']
/usr/local/lib/python3.5/dist-packages/pandas_profiling/base.py:59: RuntimeWarning: overflow encountered in long_scalars
  stats['range'] = stats['max'] - stats['min']

In [4]:
janus = pd.read_csv('../data/janus/janus_unprocessed_timings.csv', 
                    header=0, index_col=0)
janus = janus.drop_duplicates()
janus = janus.dropna()
janus.info()
janus_merged = pd.merge(janus, props, on='matrix')
janus_profile = pandas_profiling.ProfileReport(janus_merged)
janus_profile.to_file('janus_unprocessed_timings_profile.html')


<class 'pandas.core.frame.DataFrame'>
Int64Index: 124529 entries, 0 to 42688
Data columns (total 16 columns):
system         124529 non-null object
numprocs       124529 non-null int64
matrix         124529 non-null object
solver         124529 non-null object
prec           124529 non-null object
status         124529 non-null object
time           124529 non-null float64
iters          124529 non-null float64
resid          124529 non-null float64
system_id      124529 non-null int64
solver_id      124529 non-null int64
prec_id        124529 non-null int64
status_id      124529 non-null int64
good_or_bad    124529 non-null int64
new_time       124529 non-null float64
matrix_id      124529 non-null int64
dtypes: float64(4), int64(7), object(5)
memory usage: 16.2+ MB
/usr/local/lib/python3.5/dist-packages/pandas_profiling/base.py:59: RuntimeWarning: overflow encountered in long_scalars
  stats['range'] = stats['max'] - stats['min']
/usr/local/lib/python3.5/dist-packages/pandas_profiling/base.py:59: RuntimeWarning: overflow encountered in long_scalars
  stats['range'] = stats['max'] - stats['min']

In [5]:
bridges = pd.read_csv('../data/bridges/bridges_unprocessed_timings.csv', 
                    header=0, index_col=0)
bridges = bridges.drop_duplicates()
bridges = bridges.dropna()
bridges.info()
bridges_merged = pd.merge(bridges, props, on='matrix')
bridges_profile = pandas_profiling.ProfileReport(bridges_merged)
bridges_profile.to_file('bridges_unprocessed_timings_profile.html')


<class 'pandas.core.frame.DataFrame'>
Int64Index: 114765 entries, 0 to 33513
Data columns (total 16 columns):
system         114765 non-null object
numprocs       114765 non-null int64
matrix         114765 non-null object
solver         114765 non-null object
prec           114765 non-null object
status         114765 non-null object
time           114765 non-null float64
iters          114765 non-null float64
resid          114765 non-null float64
system_id      114765 non-null int64
solver_id      114765 non-null int64
prec_id        114765 non-null int64
status_id      114765 non-null int64
good_or_bad    114765 non-null int64
new_time       114765 non-null float64
matrix_id      114765 non-null int64
dtypes: float64(4), int64(7), object(5)
memory usage: 14.9+ MB
/usr/local/lib/python3.5/dist-packages/pandas_profiling/base.py:59: RuntimeWarning: overflow encountered in long_scalars
  stats['range'] = stats['max'] - stats['min']
/usr/local/lib/python3.5/dist-packages/pandas_profiling/base.py:59: RuntimeWarning: overflow encountered in long_scalars
  stats['range'] = stats['max'] - stats['min']

Generate profiles for the combined times+properties


In [6]:
all_times = pd.concat([comet, bridges, janus], ignore_index=True)
all_times.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335810 entries, 0 to 335809
Data columns (total 16 columns):
system         335810 non-null object
numprocs       335810 non-null int64
matrix         335810 non-null object
solver         335810 non-null object
prec           335810 non-null object
status         335810 non-null object
time           335810 non-null float64
iters          335810 non-null float64
resid          335810 non-null float64
system_id      335810 non-null int64
solver_id      335810 non-null int64
prec_id        335810 non-null int64
status_id      335810 non-null int64
good_or_bad    335810 non-null int64
new_time       335810 non-null float64
matrix_id      335810 non-null int64
dtypes: float64(4), int64(7), object(5)
memory usage: 41.0+ MB

In [9]:
combined = pd.merge(props, all_times, on=['matrix','matrix_id'])
combined.info()
combined = combined.drop_duplicates()
combined = combined.dropna()
combined_profile = pandas_profiling.ProfileReport(combined)
combined_profile.to_file('unprocessed_combined_profile.html')


<class 'pandas.core.frame.DataFrame'>
Int64Index: 152755 entries, 0 to 152754
Data columns (total 51 columns):
rows                  152755 non-null int64
cols                  152755 non-null int64
min_nnz_row           152755 non-null int64
row_var               152755 non-null float64
col_var               152755 non-null float64
diag_var              152755 non-null float64
nnz                   152755 non-null int64
frob_norm             152755 non-null float64
symm_frob_norm        152755 non-null float64
antisymm_frob_norm    152755 non-null float64
one_norm              152755 non-null float64
inf_norm              152755 non-null float64
symm_inf_norm         152755 non-null float64
antisymm_inf_norm     152755 non-null float64
max_nnz_row           152755 non-null int64
trace                 152755 non-null float64
abs_trace             152755 non-null float64
min_nnz_row.1         152755 non-null int64
avg_nnz_row           152755 non-null int64
dummy_rows            152755 non-null int64
dummy_rows_kind       152755 non-null int64
num_value_symm_1      152755 non-null int64
nnz_pattern_symm_1    152755 non-null int64
num_value_symm_2      152755 non-null float64
nnz_pattern_symm_2    152755 non-null float64
row_diag_dom          152755 non-null int64
col_diag_dom          152755 non-null int64
diag_avg              152755 non-null float64
diag_sign             152755 non-null int64
diag_nnz              152755 non-null int64
lower_bw              152755 non-null int64
upper_bw              152755 non-null int64
row_log_val_spread    152755 non-null float64
col_log_val_spread    152755 non-null float64
symm                  152755 non-null int64
matrix                152755 non-null object
matrix_id             152755 non-null int64
system                152755 non-null object
numprocs              152755 non-null int64
solver                152755 non-null object
prec                  152755 non-null object
status                152755 non-null object
time                  152755 non-null float64
iters                 152755 non-null float64
resid                 152755 non-null float64
system_id             152755 non-null int64
solver_id             152755 non-null int64
prec_id               152755 non-null int64
status_id             152755 non-null int64
good_or_bad           152755 non-null int64
new_time              152755 non-null float64
dtypes: float64(21), int64(25), object(5)
memory usage: 60.6+ MB
/usr/local/lib/python3.5/dist-packages/pandas_profiling/base.py:59: RuntimeWarning: overflow encountered in long_scalars
  stats['range'] = stats['max'] - stats['min']

In [24]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

combined_new = combined.drop(['matrix', 'solver', 'prec', 
                              'status', 'system'], axis=1)
combined_new = combined_new.dropna()

X = combined_new.iloc[:,:-2]
y = combined_new.iloc[:, -1]

clf = RandomForestClassifier()
clf.fit(X, y)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-24-829494bcf779> in <module>()
     14 
     15 clf = RandomForestClassifier()
---> 16 clf.fit(X, y)

/usr/local/lib/python3.5/dist-packages/sklearn/ensemble/forest.py in fit(self, X, y, sample_weight)
    245         """
    246         # Validate or convert input data
--> 247         X = check_array(X, accept_sparse="csc", dtype=DTYPE)
    248         y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
    249         if issparse(X):

/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    405                              % (array.ndim, estimator_name))
    406         if force_all_finite:
--> 407             _assert_all_finite(array)
    408 
    409     shape_repr = _shape_repr(array.shape)

/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py in _assert_all_finite(X)
     56             and not np.isfinite(X).all()):
     57         raise ValueError("Input contains NaN, infinity"
---> 58                          " or a value too large for %r." % X.dtype)
     59 
     60 

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').