In [1]:
import sqlite3
import pandas
con = sqlite3.connect('coal.db')
coal_labels = pandas.read_sql("select orispl_code, unitid, name, latitude, longitude, county, state, sum(CAST(gload as float)) as total_gen from data group by orispl_code, unitid", con)
con = sqlite3.connect('NG.db')
NG_labels = pandas.read_sql("select orispl_code, unitid, name, latitude, longitude, county, state, sum(CAST(gload as float)) as total_gen from data group by orispl_code, unitid", con)
coal_labels.to_csv('coal_labels.csv')
NG_labels.to_csv('NG_labels.csv')

In [2]:
import pandas
coal_labels = pandas.read_csv('coal_labels.csv')
NG_labels = pandas.read_csv('NG_labels.csv')

In [3]:
# Step 1: loading data
import csv
import numpy as np
import os.path
to_ml = []
to_plot = []
labels = []
for i, row in coal_labels.iterrows():
    for year in range(2001, 2017):
        file_name = 'cf/' + str(row['orispl_code']) + '_' + row['unitid'] + '_' + str(year) + '.csv'
        if not os.path.isfile(file_name): continue
        with open(file_name, 'rb') as csvfile:
            dr = csv.DictReader(csvfile)
            to_insert = []
            for row in dr:
                to_insert.append(float( row['capacity_factor_hr'] if row['capacity_factor_hr'] is not '' else 0))
                if int(row['']) % 24 is 23:
                    to_plot.append(to_insert)
                    to_ml.append(to_insert + np.gradient(to_insert).tolist())
                    labels.append((row['op_date'], row['name'], row['unitid'], 'Coal'))
                    to_insert = []
import cPickle as pickle
with open('labels.p', 'wb') as fp:
    pickle.dump(labels, fp)
with open('to_plot.p', 'wb') as fp:
    pickle.dump(to_plot, fp)
with open('to_ml.p', 'wb') as fp:
    pickle.dump(to_ml, fp)

In [5]:
import cPickle as pickle
with open('labels.p', 'rb') as fp:
    labels = pickle.load(fp)
with open('to_plot.p', 'rb') as fp:
    to_plot = pickle.load(fp)   
with open('to_ml.p', 'rb') as fp:
    to_ml = pickle.load(fp)

In [6]:
import csv
import numpy as np
import os.path
for i, row in NG_labels.iterrows():
    for year in range(2001, 2017):
        file_name = 'cf/' + str(row['orispl_code']) + '_' + row['unitid'].replace('*', '') + '_' + str(year) + '.csv'
        if not os.path.isfile(file_name): continue
        with open(file_name, 'rb') as csvfile:
            dr = csv.DictReader(csvfile)
            to_insert = []
            for row in dr:
                to_insert.append(float( row['capacity_factor_hr'] if row['capacity_factor_hr'] is not '' else 0))
                if int(row['']) % 24 is 23:
                    to_plot.append(to_insert)
                    to_ml.append(to_insert + np.gradient(to_insert).tolist())
                    labels.append((row['op_date'], row['name'], row['unitid'], 'Natural Gas'))
                    to_insert = []
import cPickle as pickle
with open('labels.p', 'wb') as fp:
    pickle.dump(labels, fp)
with open('to_plot.p', 'wb') as fp:
    pickle.dump(to_plot, fp)
with open('to_ml.p', 'wb') as fp:
    pickle.dump(to_ml, fp)


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-6-8e98709a4a17> in <module>()
     13                 if int(row['']) % 24 is 23:
     14                     to_plot.append(to_insert)
---> 15                     to_ml.append(to_insert + np.gradient(to_insert).tolist())
     16                     labels.append((row['op_date'], row['name'], row['unitid'], 'Natural Gas'))
     17                     to_insert = []

MemoryError: 

In [1]:
# Lets you pick up where you left off before - using stored labels and data and such
import cPickle as pickle
with open('labels.p', 'rb') as fp:
    labels = pickle.load(fp)
with open('to_plot.p', 'rb') as fp:
    to_plot = pickle.load(fp)   
with open('to_ml.p', 'rb') as fp:
    to_ml = pickle.load(fp)

In [ ]:
# Step 2: normalize data
from sklearn.preprocessing import normalize
normalized = normalize(to_ml)

In [ ]:
# Step 3: run kmeans
clusters = 20
from sklearn.cluster import MiniBatchKMeans
kmeans = MiniBatchKMeans(n_clusters=clusters, random_state=0).fit(normalized)

In [ ]:
output = kmeans.labels_

In [ ]:
import shutil
import os.path
if os.path.exists('kmeans_' + str(clusters)):
    shutil.rmtree('kmeans_' + str(clusters))

In [ ]:
# Save images to inspect results
import os.path
import matplotlib.pyplot as plt
import random
counts = clusters * [0]
totals = clusters * [np.zeros(24)]
for i in range(clusters):
    if not os.path.exists('kmeans_' + str(clusters)+ '/' + str(i)):
        os.makedirs('kmeans_' + str(clusters) + '/' + str(i))
for i in range(len(normalized)):
    counts[output[i]] += 1
    totals[output[i]] = to_plot[i] + totals[output[i]]
    # Sample .1% of days (~500 total) to verify that clusters are working 
    if random.randint(1, 1000) is not 100: continue
    plt.gcf().clear()
    plt.axis([0, 24, 0, 1])
    plt.xlabel('Hour')
    plt.ylabel('Capacity Factor')
    plt.plot(to_plot[i])
    plt.title(str(labels[i][1]) + " unit " + str(labels[i][2]) + " operation on " + str(labels[i][0]) + " (" + labels[i][3] + ")")
    plt.savefig('kmeans_' + str(clusters) + '/' + str(output[i]) + '/' + str(labels[i][1]) + '_' + str(labels[i][2]) + '_' + str(labels[i][0]) + '.png')

In [ ]:
for i in range(clusters):
    values = totals[i] / counts[i]
    plt.gcf().clear()
    plt.axis([0, 24, 0, 1])
    plt.plot(values)
    plt.title("Cluster " + str(i) + " Average Shape")
    plt.xlabel('Hour')
    plt.ylabel('Capacity Factor')
    plt.savefig('kmeans_' + str(clusters) + '/average' + str(i) + ".png")
    plt.show()

In [10]:
def get_counts(filename):
    to_test = []
    with open(filename, 'rb') as csvfile:
        dr = csv.DictReader(csvfile)
        to_insert = []
        for row in dr:
            to_insert.append(float(row['capacity_factor']))
            if int(row['']) % 24 is 23:
                to_test.append(to_insert + np.gradient(to_insert).tolist())
                to_insert = []
    if to_test == []: return clusters * [0]
    result = kmeans.predict(normalize(to_test))
    counts = clusters * [0]
    for i in range(len(result)):
        counts[result[i]] += 1
    return counts

In [13]:
def get_year(year):
    to_test = []
    for i, row in plant_labels.iterrows():
        file_name = 'cf/' + str(row['orispl_code']) + '_' + row['unitid'] + '_' + str(year) + '.csv'
        if not os.path.isfile(file_name): continue
        with open(file_name, 'rb') as csvfile:
            dr = csv.DictReader(csvfile)
            to_insert = []
            for row in dr:
                to_insert.append(float( row['capacity_factor'] if row['capacity_factor'] is not '' else 0))
                if int(row['']) % 24 is 23:
                    to_test.append(to_insert + np.gradient(to_insert).tolist())
                    to_insert = []
    result = kmeans.predict(normalize(to_test))
    counts = clusters * [0]
    for i in range(len(result)):
        counts[result[i]] += 1
    return counts

def get_plant_year(year, orispl_code):
    to_test = []
    unitids = []
    for i, row in plant_labels.iterrows():
        if row['orispl_code'] == orispl_code:
            unitids.append(row['unitid'])
    for unitid in unitids:
        file_name = 'cf/' + str(orispl_code) + '_' + str(unitid) + '_' + str(year) + '.csv'
        if not os.path.isfile(file_name): continue
        with open(file_name, 'rb') as csvfile:
            dr = csv.DictReader(csvfile)
            to_insert = []
            for row in dr:
                to_insert.append(float( row['capacity_factor'] if row['capacity_factor'] is not '' else 0))
                if int(row['']) % 24 is 23:
                    to_test.append(to_insert + np.gradient(to_insert).tolist())
                    to_insert = []
    result = kmeans.predict(normalize(to_test))
    counts = clusters * [0]
    for i in range(len(result)):
        counts[result[i]] += 1
    return counts

def get_unit_year(year, orispl_code, unitid):
    return get_counts('cf/' + str(orispl_code) + '_' + str(unitid) + '_' + str(year) + '.csv')

In [14]:
code = 4941
for year in [2001, 2006, 2011, 2016]:
    for entry in get_year(year):
        print entry


3261
2517
472
191
283
1289
223
354
282
78
2295
1379
317
616
1675
36
7891
357
9605
218
2665
3446
451
185
375
631
174
330
159
70
2241
1313
261
507
1707
9
8416
324
9934
165
3351
4437
995
232
411
2420
191
501
270
110
3602
1183
304
846
4661
22
5169
405
5542
269
2595
6539
2466
418
262
1763
137
328
81
156
2111
1543
256
617
2895
15
3676
308
4254
386

In [33]:
print len(normalized)


535393

In [124]:
code = 4941
unitid = 3
for year in [2001, 2006, 2011, 2016]:
    for entry in get_unit_year(year, code, unitid):
        print entry


22
37
62
10
4
0
193
1
1
1
12
3
5
3
0
5
1
0
0
5
29
32
56
5
4
0
211
0
1
6
0
3
9
1
2
3
0
2
0
1
71
6
53
3
8
0
166
2
6
9
15
3
10
6
0
5
1
1
0
0
71
33
8
62
6
1
11
12
20
20
32
4
7
11
7
8
1
50
0
2

In [20]:
def get_month_counts(filename):
    to_test = {}
    counts = {}
    for month in range(1, 13):
        to_test[month] = []
        counts[month] = clusters * [0]
    with open(filename, 'rb') as csvfile:
        dr = csv.DictReader(csvfile)
        to_insert = []
        for row in dr:
            to_insert.append(float(row['capacity_factor']))
            if int(row['op_hour']) is 23:
                month = int(row['op_date'][0:2])
                to_test[month].append(to_insert + np.gradient(to_insert).tolist())
                to_insert = []
    for month in range(1, 13):
        if to_test[month] == []: return counts
        result = kmeans.predict(normalize(to_test[month]))
        for i in range(len(result)):
            counts[month][result[i]] += 1
    return counts

In [21]:
dictionary = {}
for year in range(2002, 2017):
    dictionary[year] = {}
    for i, row in plant_labels.iterrows():
        file_name = 'cf/' + str(row['orispl_code']) + '_' + str(row['unitid']) + '_' + str(year) + '.csv'
        if not os.path.isfile(file_name): continue
        counts = get_month_counts(file_name)
        count = 0
        for month in range(1, 13):
            dictionary[year][month] = []
            count += sum(counts[month])
        if count is 0: continue
        for month in range(1, 13):
            percent = float(counts[month][6])/sum(counts[month])
            dictionary[year][month].append(percent)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-21-82bcdd40b313> in <module>()
      5         file_name = 'cf/' + str(row['orispl_code']) + '_' + str(row['unitid']) + '_' + str(year) + '.csv'
      6         if not os.path.isfile(file_name): continue
----> 7         counts = get_month_counts(file_name)
      8         count = 0
      9         for month in range(1, 13):

<ipython-input-20-73c4fa0119e8> in get_month_counts(filename)
     16     for month in range(1, 13):
     17         if to_test[month] == []: return counts
---> 18         result = kmeans.predict(normalize(to_test[month]))
     19         for i in range(len(result)):
     20             counts[month][result[i]] += 1

c:\python27\lib\site-packages\sklearn\preprocessing\data.pyc in normalize(X, norm, axis, copy, return_norm)
   1342 
   1343     X = check_array(X, sparse_format, copy=copy, warn_on_dtype=True,
-> 1344                     estimator='the normalize function', dtype=FLOAT_DTYPES)
   1345     if axis == 0:
   1346         X = X.T

c:\python27\lib\site-packages\sklearn\utils\validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    380                                       force_all_finite)
    381     else:
--> 382         array = np.array(array, dtype=dtype, order=order, copy=copy)
    383 
    384         if ensure_2d:

ValueError: setting an array element with a sequence.

In [22]:
print month
print to_test


12
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-22-7d3c048e912d> in <module>()
      1 print month
----> 2 print to_test

NameError: name 'to_test' is not defined

In [116]:
dictionary = {}
for year in range(2001, 2017):
    dictionary[year] =[]
    for i, row in plant_labels.iterrows():
        file_name = 'cf/' + str(row['orispl_code']) + '_' + str(row['unitid']) + '_' + str(year) + '.csv'
        if not os.path.isfile(file_name): continue
        counts = get_counts(file_name)
        if sum(counts) == 0: continue
        percent = float(counts[6])/sum(counts)
        if percent > 0.7 and year > 2010:
            print year, row
        dictionary[year].append(percent)


2011 orispl_code    113
unitid           1
Name: 1, dtype: object
2011 orispl_code    55479
unitid           001
Name: 96, dtype: object
2011 orispl_code    56319
unitid           001
Name: 99, dtype: object
2011 orispl_code    56596
unitid           001
Name: 100, dtype: object
2012 orispl_code    7504
unitid          001
Name: 80, dtype: object
2012 orispl_code    55479
unitid           001
Name: 96, dtype: object
2012 orispl_code    56319
unitid           001
Name: 99, dtype: object
2012 orispl_code    56596
unitid           001
Name: 100, dtype: object
2013 orispl_code    6761
unitid          101
Name: 79, dtype: object
2013 orispl_code    7504
unitid          001
Name: 80, dtype: object
2013 orispl_code    55479
unitid           001
Name: 96, dtype: object
2013 orispl_code    56319
unitid           001
Name: 99, dtype: object
2013 orispl_code    56596
unitid           001
Name: 100, dtype: object
2013 orispl_code    56609
unitid            01
Name: 101, dtype: object
2014 orispl_code    7504
unitid          001
Name: 80, dtype: object
2014 orispl_code    55479
unitid           001
Name: 96, dtype: object
2014 orispl_code    56319
unitid           001
Name: 99, dtype: object
2014 orispl_code    56596
unitid           001
Name: 100, dtype: object
2014 orispl_code    56609
unitid            01
Name: 101, dtype: object
2015 orispl_code    7504
unitid          001
Name: 80, dtype: object
2015 orispl_code    8219
unitid            1
Name: 88, dtype: object
2015 orispl_code    55479
unitid           001
Name: 96, dtype: object
2015 orispl_code    56319
unitid           001
Name: 99, dtype: object
2015 orispl_code    56596
unitid           001
Name: 100, dtype: object
2015 orispl_code    56609
unitid            01
Name: 101, dtype: object
2016 orispl_code    50951
unitid             1
Name: 95, dtype: object
2016 orispl_code    55479
unitid           001
Name: 96, dtype: object
2016 orispl_code    56596
unitid           001
Name: 100, dtype: object

In [102]:
to_csv = {}
field_names = []
field_names.append("Year")
for min_percentage in range(10):
    field_names.append(str(float(min_percentage) / 10) + " <= x < " + str(float(min_percentage + 1) / 10))

for year in dictionary:
    to_csv[year] = {}
    for min_percentage in range(10):
        count = 0
        for entry in dictionary[year]:
            if entry >= float(min_percentage) / 10 and entry < float(min_percentage + 1) / 10:
                count += 1
        to_csv[year][field_names[min_percentage + 1]] = count
    to_csv[year]['Year'] = year
with open('percent_baseload.csv', 'wb') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    for year in to_csv:
        writer.writerow(to_csv[year])

In [2]:
year_labels = {}
for year in [2001, 2006, 2011, 2016]:
    year_labels[year] = pandas.read_sql("select orispl_code, unitid, name, latitude, longitude, county, state, sum(CAST(gload as float)) as total_gen from data where year is " + str(year) + " group by orispl_code, unitid", con)

In [14]:
# Making a map visualization
import csv
fieldnames = ['Name', 'Unit ID', 'Latitude', 'Longitude', 'County', '% of days in baseload', 'Generation']
for year in [2001, 2006, 2011, 2016]:
    with open('k-means/baseload_map_' + str(year) + '.csv', 'wb') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for i, row in year_labels[year].iterrows():
            file_name = 'cf/' + str(row['orispl_code']) + '_' + str(row['unitid']) + '_' + str(year) + '.csv'
            if not os.path.isfile(file_name): continue
            counts = get_counts(file_name)
            if sum(counts) == 0: continue
            percent = float(counts[6])/sum(counts)
            writer.writerow({'Name':row['name'], 'Unit ID':row['unitid'], 'Latitude':row['latitude'], 'Longitude': row['longitude'], 'County':row['county'] + ", " + row['state'], '% of days in baseload': percent, 'Generation': row['total_gen']})

In [143]:


In [ ]: