Six Sigma Control Chart

Implement a basic $6\sigma$ control chart.

Detect a process that is out of control or has a drifting centerline.


In [2]:
import os
import sys

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
font = {'size': 15}
matplotlib.rc('font', **font)

curdir = !pwd
rootdir = os.path.abspath(curdir[0])
sys.path.append(rootdir)
import control_chart

def get_columns(columns, path=None):
    """Yield the requested fields from every non-blank row of a CSV file.

    Parameters
    ----------
    columns : iterable of int
        1-based column indices to extract from each row.
    path : str, optional
        CSV file to read.  Defaults to ``data.csv`` inside ``rootdir``.

    Yields
    ------
    list of str
        The raw text of the selected fields for one row, in the order given
        by ``columns``.

    Raises
    ------
    IndexError
        If a row has fewer fields than a requested column index.
    """
    if path is None:
        path = os.path.join(rootdir, 'data.csv')
    # Materialise once so a one-shot iterator can be reused for every row.
    wanted = [i - 1 for i in columns]
    with open(path) as f:
        for line in f:
            # Drop the line terminator so the last field does not carry "\n".
            line = line.rstrip('\r\n')
            if not line:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            fields = line.split(',')
            yield [fields[i] for i in wanted]

In [3]:
def timeseries(col, sort_by_val=False):
    """Plot one CSV column as blue dots, in file order or sorted by value.

    Parameters
    ----------
    col : int
        1-based column index, forwarded to ``get_columns``.
    sort_by_val : bool, optional
        When True the values are plotted in ascending order instead of the
        order in which they appear in the file.
    """
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same
    # dtype.  Each row yields a one-element list, so take element 0 to get a
    # flat 1-D array.
    data = np.array([row[0] for row in get_columns([col])], dtype=float)
    plt.figure(figsize=(15, 8))
    plt.subplot(111)
    plt.grid(lw=2)
    if sort_by_val:
        plt.plot(np.sort(data), 'bo')
    else:
        plt.plot(data, 'bo')
    plt.xlabel("Column %d" % col)
    plt.show()

In [4]:
timeseries(3)


Questions

  • Would you consider the variability to be normally distributed? $\left(\sim N(\mu, \sigma^2) \right)$
  • How can you determine how close to normally distributed it is?

In [5]:
def histogram(col):
    """Plot a 20-bin histogram of one CSV column.

    Parameters
    ----------
    col : int
        1-based column index, forwarded to ``get_columns``.
    """
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same
    # dtype.  Each row yields a one-element list, so take element 0 to get a
    # flat 1-D array.
    data = np.array([row[0] for row in get_columns([col])], dtype=float)
    plt.figure(figsize=(15, 8))
    plt.subplot(111)
    plt.grid(lw=2)
    plt.hist(data, bins=20)
    plt.xlabel("Column %d" % col)
    plt.show()
histogram(3)



In [6]:
timeseries(3, sort_by_val=True)


What does normally distributed (Gaussian) data look like?


In [7]:
def normdist(samples, seed=None):
    """Draw standard-normal samples and show their histogram and sorted values.

    Parameters
    ----------
    samples : int
        Number of values to draw.
    seed : int, optional
        Seed for a dedicated random generator, making the figure
        reproducible.  When omitted, values come from NumPy's global random
        state, exactly as before.
    """
    if seed is None:
        data = np.random.normal(size=samples)
    else:
        data = np.random.default_rng(seed).normal(size=samples)

    plt.figure(figsize=(15, 10))

    # Top panel: 20-bin histogram of the draws.
    plt.subplot(211)
    plt.grid(lw=2)
    plt.hist(data, bins=20)

    # Bottom panel: the same draws in ascending order.
    plt.subplot(212)
    plt.grid(lw=2)
    plt.plot(np.sort(data), 'bo')
    plt.show()

In [8]:
normdist(20)



In [9]:
normdist(1000)


What does the data look like?


In [10]:
# 1-based index of the data.csv column to chart (read below as `column - 1`).
column = 3
# Centerline and standard deviation used to draw the sigma bands and handed
# to control_chart as the `m` and `s` options.
mean = -3
stdev = 0.5

In [11]:
# Read the selected column up front so the file is closed before plotting.
with open(os.path.join(rootdir, 'data.csv')) as f:
    data = [float(line.split(',')[column - 1]) for line in f]

fig = plt.figure(figsize=(15, 8))

# Top panel: observations in file order, with the centerline and the
# +/-1, 2 and 3 sigma bands drawn on top.
ax = fig.add_subplot(211)
ax.plot(data, 'bo')
# One colour per distance from the centerline:
# 0 -> black, 1 -> green, 2 -> yellow, 3 -> red.
colors = 'kgyr'
for z in range(-3, 4):
    # axhline spans the full axes width regardless of the x-limits.
    ax.axhline(mean + z * stdev, color=colors[abs(z)], lw=4)
ax.grid(lw=2)
ax.set_title("Time Series Control Chart")
ax.set_xlabel("Sample index")
ax.set_ylabel("Column %d" % column)

# Bottom panel: distribution of the same values.
ax = fig.add_subplot(212)
ax.hist(data, bins=15)
ax.grid(lw=2)
ax.set_xlabel("Column %d" % column)
ax.set_ylabel("Count")

# Keep the added axis labels from overlapping between the two panels.
fig.tight_layout()
plt.show()


Run all points through the control chart tests and error out on any failures


In [12]:
# Collect the raw text of the chosen column (1-based) from every CSV row;
# the values are handed to control_chart unparsed.
with open(os.path.join(rootdir, 'data.csv')) as f:
    data = [line.split(',')[column - 1] for line in f]


class Opts:
    """Options object passed to control_chart: `m` and `s` carry the
    notebook's `mean` and `stdev`."""
    m = mean
    s = stdev


# The list is fully built, so the file no longer needs to be open here.
stream = control_chart.load_stream(data)
control_chart.control_chart(stream, Opts())


[-2.9159 -3.1631 -3.271  -3.0966 -3.1207 -4.199  -3.3572 -3.1461 -3.7104
 -3.1761 -3.4669] is 10/11 points < centerline
[-1.8734 -2.8812 -1.9988] is 2/3 points > 2 sigma
[-1.8734 -3.5352 -1.9988] is 2/3 points > 2 sigma
[-3.1486 -3.7457 -3.227  -3.0132 -3.7735 -3.5737 -3.249  -3.2668] is 8/8 points < centerline
[-3.1486 -3.7457 -3.227  -3.0132 -3.7735 -3.5737 -3.249  -3.1931 -3.4613
 -2.8118 -3.2668] is 10/11 points < centerline

In [12]: