In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [22]:
import os
import urllib
# Where the raw data lives upstream and where it is stored locally.
root_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/"
full_url = root_url + "cylinder-bands/bands.data"
dataset_location = os.path.join("datasets", "cylinder-bands")
filename = 'cylinder-bands.csv'


def fetch_data(filename, full_url=full_url, dataset_location=dataset_location,
               force_download=False):
    """Download ``full_url`` into ``dataset_location`` under the name ``filename``.

    The target directory is created when missing. If the target file is
    already present the download is skipped, so re-running the notebook does
    not hit the network again; pass ``force_download=True`` to refresh it.

    Parameters
    ----------
    filename : str
        Name of the local file to write inside ``dataset_location``.
    full_url : str
        URL to download from.
    dataset_location : str
        Directory that receives the file.
    force_download : bool, default False
        Download even when the local file already exists.

    Returns
    -------
    None

    Errors raised by the download or by the filesystem calls propagate to the
    caller.
    """
    # ``import urllib`` alone does not guarantee that the ``urllib.request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(dataset_location, exist_ok=True)
    file_path = os.path.join(dataset_location, filename)

    if not force_download and os.path.isfile(file_path):
        return

    # Download to a side file first so that an interrupted transfer never
    # leaves a truncated file at the final path (which would then be mistaken
    # for a complete, cached download).
    partial_path = file_path + ".part"
    try:
        urllib.request.urlretrieve(full_url, partial_path)
        os.replace(partial_path, file_path)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)
In [23]:
# Use the shared `filename` constant rather than repeating the literal, so the
# fetch and load steps cannot drift apart.
fetch_data(filename)
In [24]:
def load_data(filename, dataset_location=dataset_location):
    """Read ``filename`` from ``dataset_location`` into a DataFrame.

    The file is parsed as CSV without a header row, so the resulting columns
    carry pandas' default integer labels.
    """
    return pd.read_csv(
        os.path.join(dataset_location, filename),
        header=None,
    )
In [25]:
# Load through the shared `filename` constant (the same one the later raw
# reload uses) instead of a second copy of the literal, then preview the rows.
cylinder_bands = load_data(filename)
cylinder_bands.head()
Out[25]:
In [26]:
# Column labels for the header-less frame returned by load_data; they are
# assigned positionally to cylinder_bands.columns further down (40 entries).
# NOTE(review): presumably this follows the attribute order documented with
# the dataset — verify the order and spellings against the dataset's
# description file.
attributes = [
'timestamp',
'cylinder number',
'customer',
'job number',
'grain screened',
'ink color',
'proof on ctd ink',
'blade mfg',
'cylinder divisional',
'paper type',
'ink type',
'direct stream',
'solvent type',
'type on cylinder',
'press type',
'press',
'unit number',
'cylinder size',
'paper mill location',
'plating tank',
'proof cut',
'viscosity',
'caliper',
'ink temperature',
'humidity',
'roughness',
'blade pressure',
'varnish pct',
'press speed',
'ink pct',
'solvent pct',
'ESA Voltage',
'ESA Amperage',
'wax',
'hardener',
'roller durometer',
'current density',
'anode space ratio',
'chrome content',
'band type',
]
In [27]:
# Sanity check: number of labels, to compare with the frame's column count
# before they are assigned as column names.
len(attributes)
Out[27]:
In [28]:
# Replace the default integer column labels with the names above (positional,
# so the list needs one entry per column). Mutates cylinder_bands in place.
cylinder_bands.columns = attributes
In [29]:
# Preview the first rows again, now with named columns.
cylinder_bands.head()
Out[29]:
In [30]:
# Summary of the frame as loaded, before any dtype conversion is applied.
cylinder_bands.info()
In [31]:
# Column groups used as the default arguments of contextual_convert.
# 'band type' (the last entry of `attributes`) is in none of the three groups,
# so contextual_convert leaves it as loaded.

# Columns parsed as dates (contextual_convert uses the format "%Y%m%d").
dt_attributes = [
'timestamp',
]
# Columns cast to the pandas 'category' dtype. Note that this includes
# number-like names such as 'job number', 'unit number' and 'cylinder size'.
cat_attributes = [
'cylinder number',
'customer',
'job number',
'grain screened',
'ink color',
'proof on ctd ink',
'blade mfg',
'cylinder divisional',
'paper type',
'ink type',
'direct stream',
'solvent type',
'type on cylinder',
'press type',
'press',
'unit number',
'cylinder size',
'paper mill location',
'plating tank',
]
# Columns converted with pd.to_numeric; values that cannot be parsed are
# coerced rather than raising (errors='coerce' in contextual_convert).
num_attributes = [
'proof cut',
'viscosity',
'caliper',
'ink temperature',
'humidity',
'roughness',
'blade pressure',
'varnish pct',
'press speed',
'ink pct',
'solvent pct',
'ESA Voltage',
'ESA Amperage',
'wax',
'hardener',
'roller durometer',
'current density',
'anode space ratio',
'chrome content',
]
In [32]:
def contextual_convert(df, dt_attributes=dt_attributes, num_attributes=num_attributes, cat_attributes=cat_attributes):
    """Cast columns of ``df`` in place according to the group they belong to.

    * ``dt_attributes``  -> ``pd.to_datetime`` with format ``"%Y%m%d"``
    * ``num_attributes`` -> ``pd.to_numeric``
    * ``cat_attributes`` -> ``astype('category')``

    Both parsers run with ``errors='coerce'``. A name listed in more than one
    group is handled by the first group that applies, in the order datetime,
    numeric, category. Nothing is returned; ``df`` itself is modified.
    """
    def as_datetime(column):
        return pd.to_datetime(column, format="%Y%m%d", errors='coerce')

    def as_number(column):
        return pd.to_numeric(column, errors='coerce')

    def as_category(column):
        return column.astype('category')

    # Register the lowest-priority group first; later updates overwrite
    # earlier ones, which yields the datetime > numeric > category precedence.
    converter_for = {}
    converter_for.update(dict.fromkeys(cat_attributes, as_category))
    converter_for.update(dict.fromkeys(num_attributes, as_number))
    converter_for.update(dict.fromkeys(dt_attributes, as_datetime))

    for name in dt_attributes + num_attributes + cat_attributes:
        df[name] = converter_for[name](df[name])
In [33]:
# Converts cylinder_bands in place using the default column groups; the
# function has no return value, so there is nothing to assign.
contextual_convert(cylinder_bands)
In [34]:
# Inspect the datetime column(s) after conversion.
cylinder_bands[dt_attributes].info()
In [35]:
# Inspect the categorical columns after conversion.
cylinder_bands[cat_attributes].info()
In [36]:
# Inspect the numeric columns after conversion.
cylinder_bands[num_attributes].info()
In [37]:
# Preview only: the upper-cased values are displayed, not assigned back.
cylinder_bands['customer'].str.upper()
Out[37]:
In [38]:
# Frequency of each distinct 'customer' value.
cylinder_bands['customer'].value_counts()
Out[38]:
In [53]:
def capitalize_columns(df, cols):
    """Upper-case the string values of the given columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    cols : str or iterable of str
        Column name(s) to upper-case. A single name may be passed as a plain
        string.

    Returns
    -------
    None

    Notes
    -----
    ``.str.upper()`` on a categorical column returns a non-categorical Series,
    so a column that was categorical on entry is cast back to ``'category'``
    afterwards. Its categories are rebuilt from the upper-cased values, which
    also merges values that differed only by case.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(cols, str):
        cols = [cols]

    for col in cols:
        was_categorical = isinstance(df[col].dtype, pd.CategoricalDtype)
        upper_cased = df[col].str.upper()
        if was_categorical:
            upper_cased = upper_cased.astype('category')
        df[col] = upper_cased
In [39]:
# Upper-case 'customer' and write it back.
# NOTE(review): the capitalize_columns(...) call in the following cell
# upper-cases 'customer' again, and this cell's execution count is lower than
# that of the function definition above it — this cell looks like a leftover
# from before the helper existed; confirm and consider removing it.
cylinder_bands['customer'] = cylinder_bands['customer'].str.upper()
In [54]:
# Upper-case both columns in place (the helper returns nothing).
capitalize_columns(cylinder_bands, ['customer', 'cylinder number'])
In [58]:
from category_encoders import OrdinalEncoder, OneHotEncoder
In [ ]:
from sklearn.base import BaseEstimator, TransformerMixin
# Transformer that picks a fixed set of DataFrame columns (for example the
# numerical or the categorical ones) and hands them on as an array.
# Written, per the original author's note, because Scikit-Learn did not
# handle DataFrames at the time.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select ``attribute_names`` from a DataFrame and return their values."""

    def __init__(self, attribute_names):
        # Stored under the same name as the constructor argument.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return the ``.values`` of ``X[self.attribute_names]``."""
        selected = X[self.attribute_names]
        return selected.values
In [65]:
# Frequency of each distinct 'cylinder size' value.
cylinder_bands['cylinder size'].value_counts()
Out[65]:
In [74]:
# Frequency of each distinct 'plating tank' value.
cylinder_bands['plating tank'].value_counts()
Out[74]:
In [67]:
# Show the rows whose 'cylinder size' equals the string "0.7".
cylinder_bands[cylinder_bands['cylinder size'] == "0.7"]
Out[67]:
In [68]:
# Reload the file from disk into a separate frame, untouched by the renaming
# and conversions applied to cylinder_bands above.
raw_cylinder_bands = load_data(filename)
In [70]:
# Look at the raw record at position 522.
# NOTE(review): 522 is presumably the position of the suspicious
# 'cylinder size' == "0.7" row found above — confirm, and consider deriving it
# from that filter instead of hard-coding it.
raw_cylinder_bands.iloc[522]
Out[70]: