notebook.community

Edit and run



In [1]:

    
import sqlite3
import pandas
con = sqlite3.connect('coal.db')
coal_labels = pandas.read_sql("select orispl_code, unitid, name, latitude, longitude, county, state, sum(CAST(gload as float)) as total_gen from data group by orispl_code, unitid", con)
con = sqlite3.connect('NG.db')
NG_labels = pandas.read_sql("select orispl_code, unitid, name, latitude, longitude, county, state, sum(CAST(gload as float)) as total_gen from data group by orispl_code, unitid", con)
coal_labels.to_csv('coal_labels.csv')
NG_labels.to_csv('NG_labels.csv')



In [2]:

    
import pandas
coal_labels = pandas.read_csv('coal_labels.csv')
NG_labels = pandas.read_csv('NG_labels.csv')



In [3]:

    
# Step 1: loading data
import csv
import numpy as np
import os.path
to_ml = []
to_plot = []
labels = []
for i, row in coal_labels.iterrows():
    for year in range(2001, 2017):
        file_name = 'cf/' + str(row['orispl_code']) + '_' + row['unitid'] + '_' + str(year) + '.csv'
        if not os.path.isfile(file_name): continue
        with open(file_name, 'rb') as csvfile:
            dr = csv.DictReader(csvfile)
            to_insert = []
            for row in dr:
                to_insert.append(float( row['capacity_factor_hr'] if row['capacity_factor_hr'] is not '' else 0))
                if int(row['']) % 24 is 23:
                    to_plot.append(to_insert)
                    to_ml.append(to_insert + np.gradient(to_insert).tolist())
                    labels.append((row['op_date'], row['name'], row['unitid'], 'Coal'))
                    to_insert = []
import cPickle as pickle
with open('labels.p', 'wb') as fp:
    pickle.dump(labels, fp)
with open('to_plot.p', 'wb') as fp:
    pickle.dump(to_plot, fp)
with open('to_ml.p', 'wb') as fp:
    pickle.dump(to_ml, fp)



In [5]:

    
import cPickle as pickle
with open('labels.p', 'rb') as fp:
    labels = pickle.load(fp)
with open('to_plot.p', 'rb') as fp:
    to_plot = pickle.load(fp)   
with open('to_ml.p', 'rb') as fp:
    to_ml = pickle.load(fp)



In [6]:

    
import csv
import numpy as np
import os.path
for i, row in NG_labels.iterrows():
    for year in range(2001, 2017):
        file_name = 'cf/' + str(row['orispl_code']) + '_' + row['unitid'].replace('*', '') + '_' + str(year) + '.csv'
        if not os.path.isfile(file_name): continue
        with open(file_name, 'rb') as csvfile:
            dr = csv.DictReader(csvfile)
            to_insert = []
            for row in dr:
                to_insert.append(float( row['capacity_factor_hr'] if row['capacity_factor_hr'] is not '' else 0))
                if int(row['']) % 24 is 23:
                    to_plot.append(to_insert)
                    to_ml.append(to_insert + np.gradient(to_insert).tolist())
                    labels.append((row['op_date'], row['name'], row['unitid'], 'Natural Gas'))
                    to_insert = []
import cPickle as pickle
with open('labels.p', 'wb') as fp:
    pickle.dump(labels, fp)
with open('to_plot.p', 'wb') as fp:
    pickle.dump(to_plot, fp)
with open('to_ml.p', 'wb') as fp:
    pickle.dump(to_ml, fp)









    



---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-6-8e98709a4a17> in <module>()
     13                 if int(row['']) % 24 is 23:
     14                     to_plot.append(to_insert)
---> 15                     to_ml.append(to_insert + np.gradient(to_insert).tolist())
     16                     labels.append((row['op_date'], row['name'], row['unitid'], 'Natural Gas'))
     17                     to_insert = []

MemoryError:



In [1]:

    
# Lets you pick up where you left off before - using stored labels and data and such
import cPickle as pickle
with open('labels.p', 'rb') as fp:
    labels = pickle.load(fp)
with open('to_plot.p', 'rb') as fp:
    to_plot = pickle.load(fp)   
with open('to_ml.p', 'rb') as fp:
    to_ml = pickle.load(fp)



In [ ]:

    
# Step 2: normalize data
from sklearn.preprocessing import normalize
normalized = normalize(to_ml)



In [ ]:

    
# Step 3: run kmeans
clusters = 20
from sklearn.cluster import MiniBatchKMeans
kmeans = MiniBatchKMeans(n_clusters=clusters, random_state=0).fit(normalized)



In [ ]:

    
output = kmeans.labels_



In [ ]:

    
import shutil
import os.path
if os.path.exists('kmeans_' + str(clusters)):
    shutil.rmtree('kmeans_' + str(clusters))



In [ ]:

    
# Save images to inspect results
import os.path
import matplotlib.pyplot as plt
import random
counts = clusters * [0]
totals = clusters * [np.zeros(24)]
for i in range(clusters):
    if not os.path.exists('kmeans_' + str(clusters)+ '/' + str(i)):
        os.makedirs('kmeans_' + str(clusters) + '/' + str(i))
for i in range(len(normalized)):
    counts[output[i]] += 1
    totals[output[i]] = to_plot[i] + totals[output[i]]
    # Sample .1% of days (~500 total) to verify that clusters are working 
    if random.randint(1, 1000) is not 100: continue
    plt.gcf().clear()
    plt.axis([0, 24, 0, 1])
    plt.xlabel('Hour')
    plt.ylabel('Capacity Factor')
    plt.plot(to_plot[i])
    plt.title(str(labels[i][1]) + " unit " + str(labels[i][2]) + " operation on " + str(labels[i][0]) + " (" + labels[i][3] + ")")
    plt.savefig('kmeans_' + str(clusters) + '/' + str(output[i]) + '/' + str(labels[i][1]) + '_' + str(labels[i][2]) + '_' + str(labels[i][0]) + '.png')



In [ ]:

    
for i in range(clusters):
    values = totals[i] / counts[i]
    plt.gcf().clear()
    plt.axis([0, 24, 0, 1])
    plt.plot(values)
    plt.title("Cluster " + str(i) + " Average Shape")
    plt.xlabel('Hour')
    plt.ylabel('Capacity Factor')
    plt.savefig('kmeans_' + str(clusters) + '/average' + str(i) + ".png")
    plt.show()



In [10]:

    
def get_counts(filename):
    to_test = []
    with open(filename, 'rb') as csvfile:
        dr = csv.DictReader(csvfile)
        to_insert = []
        for row in dr:
            to_insert.append(float(row['capacity_factor']))
            if int(row['']) % 24 is 23:
                to_test.append(to_insert + np.gradient(to_insert).tolist())
                to_insert = []
    if to_test == []: return clusters * [0]
    result = kmeans.predict(normalize(to_test))
    counts = clusters * [0]
    for i in range(len(result)):
        counts[result[i]] += 1
    return counts



In [13]:

    
def get_year(year):
    to_test = []
    for i, row in plant_labels.iterrows():
        file_name = 'cf/' + str(row['orispl_code']) + '_' + row['unitid'] + '_' + str(year) + '.csv'
        if not os.path.isfile(file_name): continue
        with open(file_name, 'rb') as csvfile:
            dr = csv.DictReader(csvfile)
            to_insert = []
            for row in dr:
                to_insert.append(float( row['capacity_factor'] if row['capacity_factor'] is not '' else 0))
                if int(row['']) % 24 is 23:
                    to_test.append(to_insert + np.gradient(to_insert).tolist())
                    to_insert = []
    result = kmeans.predict(normalize(to_test))
    counts = clusters * [0]
    for i in range(len(result)):
        counts[result[i]] += 1
    return counts

def get_plant_year(year, orispl_code):
    to_test = []
    unitids = []
    for i, row in plant_labels.iterrows():
        if row['orispl_code'] == orispl_code:
            unitids.append(row['unitid'])
    for unitid in unitids:
        file_name = 'cf/' + str(orispl_code) + '_' + str(unitid) + '_' + str(year) + '.csv'
        if not os.path.isfile(file_name): continue
        with open(file_name, 'rb') as csvfile:
            dr = csv.DictReader(csvfile)
            to_insert = []
            for row in dr:
                to_insert.append(float( row['capacity_factor'] if row['capacity_factor'] is not '' else 0))
                if int(row['']) % 24 is 23:
                    to_test.append(to_insert + np.gradient(to_insert).tolist())
                    to_insert = []
    result = kmeans.predict(normalize(to_test))
    counts = clusters * [0]
    for i in range(len(result)):
        counts[result[i]] += 1
    return counts

def get_unit_year(year, orispl_code, unitid):
    return get_counts('cf/' + str(orispl_code) + '_' + str(unitid) + '_' + str(year) + '.csv')



In [14]:

    
code = 4941
for year in [2001, 2006, 2011, 2016]:
    for entry in get_year(year):
        print entry



In [33]:

    
print len(normalized)



In [124]:

    
code = 4941
unitid = 3
for year in [2001, 2006, 2011, 2016]:
    for entry in get_unit_year(year, code, unitid):
        print entry



In [20]:

    
def get_month_counts(filename):
    to_test = {}
    counts = {}
    for month in range(1, 13):
        to_test[month] = []
        counts[month] = clusters * [0]
    with open(filename, 'rb') as csvfile:
        dr = csv.DictReader(csvfile)
        to_insert = []
        for row in dr:
            to_insert.append(float(row['capacity_factor']))
            if int(row['op_hour']) is 23:
                month = int(row['op_date'][0:2])
                to_test[month].append(to_insert + np.gradient(to_insert).tolist())
                to_insert = []
    for month in range(1, 13):
        if to_test[month] == []: return counts
        result = kmeans.predict(normalize(to_test[month]))
        for i in range(len(result)):
            counts[month][result[i]] += 1
    return counts



In [21]:

    
dictionary = {}
for year in range(2002, 2017):
    dictionary[year] = {}
    for i, row in plant_labels.iterrows():
        file_name = 'cf/' + str(row['orispl_code']) + '_' + str(row['unitid']) + '_' + str(year) + '.csv'
        if not os.path.isfile(file_name): continue
        counts = get_month_counts(file_name)
        count = 0
        for month in range(1, 13):
            dictionary[year][month] = []
            count += sum(counts[month])
        if count is 0: continue
        for month in range(1, 13):
            percent = float(counts[month][6])/sum(counts[month])
            dictionary[year][month].append(percent)









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-21-82bcdd40b313> in <module>()
      5         file_name = 'cf/' + str(row['orispl_code']) + '_' + str(row['unitid']) + '_' + str(year) + '.csv'
      6         if not os.path.isfile(file_name): continue
----> 7         counts = get_month_counts(file_name)
      8         count = 0
      9         for month in range(1, 13):

<ipython-input-20-73c4fa0119e8> in get_month_counts(filename)
     16     for month in range(1, 13):
     17         if to_test[month] == []: return counts
---> 18         result = kmeans.predict(normalize(to_test[month]))
     19         for i in range(len(result)):
     20             counts[month][result[i]] += 1

c:\python27\lib\site-packages\sklearn\preprocessing\data.pyc in normalize(X, norm, axis, copy, return_norm)
   1342 
   1343     X = check_array(X, sparse_format, copy=copy, warn_on_dtype=True,
-> 1344                     estimator='the normalize function', dtype=FLOAT_DTYPES)
   1345     if axis == 0:
   1346         X = X.T

c:\python27\lib\site-packages\sklearn\utils\validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    380                                       force_all_finite)
    381     else:
--> 382         array = np.array(array, dtype=dtype, order=order, copy=copy)
    383 
    384         if ensure_2d:

ValueError: setting an array element with a sequence.



In [22]:

    
print month
print to_test









    



12






    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-22-7d3c048e912d> in <module>()
      1 print month
----> 2 print to_test

NameError: name 'to_test' is not defined



In [116]:

    
dictionary = {}
for year in range(2001, 2017):
    dictionary[year] =[]
    for i, row in plant_labels.iterrows():
        file_name = 'cf/' + str(row['orispl_code']) + '_' + str(row['unitid']) + '_' + str(year) + '.csv'
        if not os.path.isfile(file_name): continue
        counts = get_counts(file_name)
        if sum(counts) == 0: continue
        percent = float(counts[6])/sum(counts)
        if percent > 0.7 and year > 2010:
            print year, row
        dictionary[year].append(percent)









    



2011 orispl_code    113
unitid           1
Name: 1, dtype: object
2011 orispl_code    55479
unitid           001
Name: 96, dtype: object
2011 orispl_code    56319
unitid           001
Name: 99, dtype: object
2011 orispl_code    56596
unitid           001
Name: 100, dtype: object
2012 orispl_code    7504
unitid          001
Name: 80, dtype: object
2012 orispl_code    55479
unitid           001
Name: 96, dtype: object
2012 orispl_code    56319
unitid           001
Name: 99, dtype: object
2012 orispl_code    56596
unitid           001
Name: 100, dtype: object
2013 orispl_code    6761
unitid          101
Name: 79, dtype: object
2013 orispl_code    7504
unitid          001
Name: 80, dtype: object
2013 orispl_code    55479
unitid           001
Name: 96, dtype: object
2013 orispl_code    56319
unitid           001
Name: 99, dtype: object
2013 orispl_code    56596
unitid           001
Name: 100, dtype: object
2013 orispl_code    56609
unitid            01
Name: 101, dtype: object
2014 orispl_code    7504
unitid          001
Name: 80, dtype: object
2014 orispl_code    55479
unitid           001
Name: 96, dtype: object
2014 orispl_code    56319
unitid           001
Name: 99, dtype: object
2014 orispl_code    56596
unitid           001
Name: 100, dtype: object
2014 orispl_code    56609
unitid            01
Name: 101, dtype: object
2015 orispl_code    7504
unitid          001
Name: 80, dtype: object
2015 orispl_code    8219
unitid            1
Name: 88, dtype: object
2015 orispl_code    55479
unitid           001
Name: 96, dtype: object
2015 orispl_code    56319
unitid           001
Name: 99, dtype: object
2015 orispl_code    56596
unitid           001
Name: 100, dtype: object
2015 orispl_code    56609
unitid            01
Name: 101, dtype: object
2016 orispl_code    50951
unitid             1
Name: 95, dtype: object
2016 orispl_code    55479
unitid           001
Name: 96, dtype: object
2016 orispl_code    56596
unitid           001
Name: 100, dtype: object



In [102]:

    
to_csv = {}
field_names = []
field_names.append("Year")
for min_percentage in range(10):
    field_names.append(str(float(min_percentage) / 10) + " <= x < " + str(float(min_percentage + 1) / 10))

for year in dictionary:
    to_csv[year] = {}
    for min_percentage in range(10):
        count = 0
        for entry in dictionary[year]:
            if entry >= float(min_percentage) / 10 and entry < float(min_percentage + 1) / 10:
                count += 1
        to_csv[year][field_names[min_percentage + 1]] = count
    to_csv[year]['Year'] = year
with open('percent_baseload.csv', 'wb') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    for year in to_csv:
        writer.writerow(to_csv[year])



In [2]:

    
year_labels = {}
for year in [2001, 2006, 2011, 2016]:
    year_labels[year] = pandas.read_sql("select orispl_code, unitid, name, latitude, longitude, county, state, sum(CAST(gload as float)) as total_gen from data where year is " + str(year) + " group by orispl_code, unitid", con)



In [14]:

    
# Making a map visualization
import csv
fieldnames = ['Name', 'Unit ID', 'Latitude', 'Longitude', 'County', '% of days in baseload', 'Generation']
for year in [2001, 2006, 2011, 2016]:
    with open('k-means/baseload_map_' + str(year) + '.csv', 'wb') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for i, row in year_labels[year].iterrows():
            file_name = 'cf/' + str(row['orispl_code']) + '_' + str(row['unitid']) + '_' + str(year) + '.csv'
            if not os.path.isfile(file_name): continue
            counts = get_counts(file_name)
            if sum(counts) == 0: continue
            percent = float(counts[6])/sum(counts)
            writer.writerow({'Name':row['name'], 'Unit ID':row['unitid'], 'Latitude':row['latitude'], 'Longitude': row['longitude'], 'County':row['county'] + ", " + row['state'], '% of days in baseload': percent, 'Generation': row['total_gen']})



In [143]:



In [ ]: