In [1]:
# Build unit-level label tables (one row per generating unit) from the coal
# and natural-gas SQLite databases, then cache them as CSV so later cells can
# reload them without re-querying.
import sqlite3
import pandas

# gload is stored as text in the db, hence the CAST before summing.
LABEL_QUERY = ("select orispl_code, unitid, name, latitude, longitude, "
               "county, state, sum(CAST(gload as float)) as total_gen "
               "from data group by orispl_code, unitid")

con = sqlite3.connect('coal.db')
coal_labels = pandas.read_sql(LABEL_QUERY, con)
# BUG FIX: the original rebound `con` without closing, leaking the coal
# connection.  Close it before opening the NG database.
con.close()
con = sqlite3.connect('NG.db')
NG_labels = pandas.read_sql(LABEL_QUERY, con)
# NOTE(review): `con` is intentionally left open -- a later cell reuses it
# for per-year queries against NG.db.
coal_labels.to_csv('coal_labels.csv')
NG_labels.to_csv('NG_labels.csv')
In [2]:
# Reload the cached unit-label tables written by the previous cell so the
# notebook can resume without touching the SQLite databases.
import pandas
coal_labels = pandas.read_csv('coal_labels.csv')
NG_labels = pandas.read_csv('NG_labels.csv')
In [3]:
# Step 1: loading data
# Build the working datasets from hourly capacity-factor CSVs for each coal
# unit.  Every complete day (24 hourly rows) contributes:
#   to_plot : the raw 24-hour capacity-factor profile (for plotting)
#   to_ml   : the profile concatenated with its gradient (clustering features)
#   labels  : (op_date, plant name, unit id, fuel type) metadata for that day
import csv
import numpy as np
import os.path
to_ml = []
to_plot = []
labels = []
# BUG FIX: the outer loop variable was also named `row` and got clobbered by
# the inner CSV loop, so `file_name` for later years was built from a stale
# CSV dict instead of the dataframe row.  Renamed to `unit`.
for i, unit in coal_labels.iterrows():
    for year in range(2001, 2017):
        file_name = 'cf/' + str(unit['orispl_code']) + '_' + unit['unitid'] + '_' + str(year) + '.csv'
        if not os.path.isfile(file_name):
            continue
        with open(file_name, 'rb') as csvfile:  # py2 csv wants binary mode
            dr = csv.DictReader(csvfile)
            to_insert = []
            for row in dr:
                # Missing readings count as zero output.
                # (was `is not ''` -- identity comparison on strings is unreliable)
                to_insert.append(float(row['capacity_factor_hr'] if row['capacity_factor_hr'] != '' else 0))
                # The unnamed first column is the running hour index; every
                # 24th row closes out a day.  (was `is 23` -- use `==`)
                if int(row['']) % 24 == 23:
                    to_plot.append(to_insert)
                    to_ml.append(to_insert + np.gradient(to_insert).tolist())
                    labels.append((row['op_date'], row['name'], row['unitid'], 'Coal'))
                    to_insert = []
import cPickle as pickle
with open('labels.p', 'wb') as fp:
    pickle.dump(labels, fp)
with open('to_plot.p', 'wb') as fp:
    pickle.dump(to_plot, fp)
with open('to_ml.p', 'wb') as fp:
    pickle.dump(to_ml, fp)
In [5]:
# Restore the cached day labels, raw 24-hour profiles, and ML feature rows
# pickled by the loading step above.
import cPickle as pickle
with open('labels.p', 'rb') as fp:
    labels = pickle.load(fp)
with open('to_plot.p', 'rb') as fp:
    to_plot = pickle.load(fp)
with open('to_ml.p', 'rb') as fp:
    to_ml = pickle.load(fp)
In [6]:
# Append the natural-gas unit days to the coal datasets loaded above.
# Same construction as the coal loader; NG unit ids may carry a '*' suffix
# that is stripped for the file name.
import csv
import numpy as np
import os.path
# BUG FIX: renamed the outer iterrows variable so the inner CSV loop no
# longer clobbers it (the original reused `row` for both).
for i, unit in NG_labels.iterrows():
    for year in range(2001, 2017):
        file_name = 'cf/' + str(unit['orispl_code']) + '_' + unit['unitid'].replace('*', '') + '_' + str(year) + '.csv'
        if not os.path.isfile(file_name):
            continue
        with open(file_name, 'rb') as csvfile:  # py2 csv wants binary mode
            dr = csv.DictReader(csvfile)
            to_insert = []
            for row in dr:
                # Missing readings count as zero output.  (was `is not ''`)
                to_insert.append(float(row['capacity_factor_hr'] if row['capacity_factor_hr'] != '' else 0))
                # Every 24th hourly row closes out a day.  (was `is 23`)
                if int(row['']) % 24 == 23:
                    to_plot.append(to_insert)
                    to_ml.append(to_insert + np.gradient(to_insert).tolist())
                    labels.append((row['op_date'], row['name'], row['unitid'], 'Natural Gas'))
                    to_insert = []
import cPickle as pickle
with open('labels.p', 'wb') as fp:
    pickle.dump(labels, fp)
with open('to_plot.p', 'wb') as fp:
    pickle.dump(to_plot, fp)
with open('to_ml.p', 'wb') as fp:
    pickle.dump(to_ml, fp)
In [1]:
# Lets you pick up where you left off before - using stored labels and data and such
# (reloads the pickles written by the coal/NG loading cells).
import cPickle as pickle
with open('labels.p', 'rb') as fp:
    labels = pickle.load(fp)
with open('to_plot.p', 'rb') as fp:
    to_plot = pickle.load(fp)
with open('to_ml.p', 'rb') as fp:
    to_ml = pickle.load(fp)
In [ ]:
# Step 2: normalize data
# L2-normalize each day's feature row so clustering compares profile shape
# rather than absolute magnitude.
from sklearn.preprocessing import normalize
normalized = normalize(to_ml)
In [ ]:
# Step 3: run kmeans
# MiniBatchKMeans scales to the full day-profile matrix; the fixed
# random_state keeps cluster assignments reproducible across runs.
clusters = 20
from sklearn.cluster import MiniBatchKMeans
kmeans = MiniBatchKMeans(n_clusters=clusters, random_state=0).fit(normalized)
In [ ]:
output = kmeans.labels_
In [ ]:
# Wipe any image output left over from a previous run with this cluster count.
import shutil
import os.path
if os.path.exists('kmeans_%d' % clusters):
    shutil.rmtree('kmeans_%d' % clusters)
In [ ]:
# Save images to inspect results: tally day counts and profile sums per
# cluster, and save a random ~0.1% sample of day plots into per-cluster
# folders for eyeballing.
import os.path
import matplotlib.pyplot as plt
import random
counts = clusters * [0]
# All entries initially alias one zeros array, but the update below REBINDS
# totals[k] (it does not mutate in place), so the aliasing is harmless.
totals = clusters * [np.zeros(24)]
for i in range(clusters):
    if not os.path.exists('kmeans_' + str(clusters) + '/' + str(i)):
        os.makedirs('kmeans_' + str(clusters) + '/' + str(i))
for i in range(len(normalized)):
    counts[output[i]] += 1
    totals[output[i]] = to_plot[i] + totals[output[i]]
    # Sample .1% of days (~500 total) to verify that clusters are working.
    # (was `is not 100` -- identity comparison on ints; use `!=`)
    if random.randint(1, 1000) != 100:
        continue
    plt.gcf().clear()
    plt.axis([0, 24, 0, 1])
    plt.xlabel('Hour')
    plt.ylabel('Capacity Factor')
    plt.plot(to_plot[i])
    plt.title(str(labels[i][1]) + " unit " + str(labels[i][2]) + " operation on " + str(labels[i][0]) + " (" + labels[i][3] + ")")
    plt.savefig('kmeans_' + str(clusters) + '/' + str(output[i]) + '/' + str(labels[i][1]) + '_' + str(labels[i][2]) + '_' + str(labels[i][0]) + '.png')
In [ ]:
# Render and save the average daily capacity-factor profile of each cluster.
for cluster_id in range(clusters):
    mean_shape = totals[cluster_id] / counts[cluster_id]
    plt.gcf().clear()
    plt.axis([0, 24, 0, 1])
    plt.plot(mean_shape)
    plt.title("Cluster " + str(cluster_id) + " Average Shape")
    plt.xlabel('Hour')
    plt.ylabel('Capacity Factor')
    plt.savefig('kmeans_' + str(clusters) + '/average' + str(cluster_id) + ".png")
    plt.show()
In [10]:
def get_counts(filename):
    """Classify every complete day in a unit-year CSV into a cluster.

    Returns a list of length `clusters`: days assigned to each cluster.
    Relies on notebook globals: csv, np, clusters, kmeans, normalize.
    """
    to_test = []
    with open(filename, 'rb') as csvfile:  # py2 csv wants binary mode
        dr = csv.DictReader(csvfile)
        to_insert = []
        for row in dr:
            to_insert.append(float(row['capacity_factor']))
            # Unnamed first column is the running row index; every 24th row
            # closes a day.  (was `is 23` -- use `==`)
            if int(row['']) % 24 == 23:
                to_test.append(to_insert + np.gradient(to_insert).tolist())
                to_insert = []
    if to_test == []:
        return clusters * [0]
    result = kmeans.predict(normalize(to_test))
    counts = clusters * [0]
    for i in range(len(result)):
        counts[result[i]] += 1
    return counts
In [13]:
def get_year(year):
    """Per-cluster day counts across ALL units in plant_labels for one year.

    Returns a list of length `clusters`.  Relies on notebook globals:
    plant_labels, csv, np, os, clusters, kmeans, normalize.
    """
    to_test = []
    # BUG FIX: the outer loop variable was `row` and got clobbered by the
    # inner CSV loop, corrupting `file_name` on later iterations.
    for i, unit in plant_labels.iterrows():
        file_name = 'cf/' + str(unit['orispl_code']) + '_' + unit['unitid'] + '_' + str(year) + '.csv'
        if not os.path.isfile(file_name):
            continue
        with open(file_name, 'rb') as csvfile:
            dr = csv.DictReader(csvfile)
            to_insert = []
            for row in dr:
                # Missing readings count as zero.  (was `is not ''`)
                to_insert.append(float(row['capacity_factor'] if row['capacity_factor'] != '' else 0))
                if int(row['']) % 24 == 23:  # was `is 23`
                    to_test.append(to_insert + np.gradient(to_insert).tolist())
                    to_insert = []
    result = kmeans.predict(normalize(to_test))
    counts = clusters * [0]
    for i in range(len(result)):
        counts[result[i]] += 1
    return counts
def get_plant_year(year, orispl_code):
    """Per-cluster day counts for every unit of one plant in one year.

    Returns a list of length `clusters`.  Relies on notebook globals:
    plant_labels, csv, np, os, clusters, kmeans, normalize.
    """
    to_test = []
    unitids = []
    for i, unit in plant_labels.iterrows():
        if unit['orispl_code'] == orispl_code:
            unitids.append(unit['unitid'])
    for unitid in unitids:
        file_name = 'cf/' + str(orispl_code) + '_' + str(unitid) + '_' + str(year) + '.csv'
        if not os.path.isfile(file_name):
            continue
        with open(file_name, 'rb') as csvfile:
            dr = csv.DictReader(csvfile)
            to_insert = []
            for row in dr:
                # Missing readings count as zero.  (was `is not ''`)
                to_insert.append(float(row['capacity_factor'] if row['capacity_factor'] != '' else 0))
                if int(row['']) % 24 == 23:  # was `is 23`
                    to_test.append(to_insert + np.gradient(to_insert).tolist())
                    to_insert = []
    result = kmeans.predict(normalize(to_test))
    counts = clusters * [0]
    for i in range(len(result)):
        counts[result[i]] += 1
    return counts
def get_unit_year(year, orispl_code, unitid):
    """Per-cluster day counts for one unit-year (delegates to get_counts)."""
    file_name = 'cf/%s_%s_%s.csv' % (orispl_code, unitid, year)
    return get_counts(file_name)
In [14]:
# Print the yearly cluster distribution across all units, one count per line.
# NOTE(review): `code` is unused in this cell (a later cell reuses the name).
code = 4941
for year in [2001, 2006, 2011, 2016]:
    for entry in get_year(year):
        print entry
In [33]:
print len(normalized)
In [124]:
# Cluster distribution over time for a single unit (plant 4941, unit 3).
code = 4941
unitid = 3
for year in [2001, 2006, 2011, 2016]:
    for entry in get_unit_year(year, code, unitid):
        print entry
In [20]:
def get_month_counts(filename):
    """Classify each complete day in a unit-year CSV, bucketed by month.

    Returns {month (1-12): per-cluster day-count list of length `clusters`};
    months with no complete days keep an all-zero list.  Relies on notebook
    globals: csv, np, clusters, kmeans, normalize.
    """
    to_test = {}
    counts = {}
    for month in range(1, 13):
        to_test[month] = []
        counts[month] = clusters * [0]
    with open(filename, 'rb') as csvfile:
        dr = csv.DictReader(csvfile)
        to_insert = []
        for row in dr:
            to_insert.append(float(row['capacity_factor']))
            if int(row['op_hour']) == 23:  # was `is 23`
                # op_date is assumed to start MM... -- TODO confirm format
                month = int(row['op_date'][0:2])
                to_test[month].append(to_insert + np.gradient(to_insert).tolist())
                to_insert = []
    for month in range(1, 13):
        # BUG FIX: the original `return counts` here abandoned all remaining
        # months as soon as one month had no data; skip the month instead.
        if to_test[month] == []:
            continue
        result = kmeans.predict(normalize(to_test[month]))
        for i in range(len(result)):
            counts[month][result[i]] += 1
    return counts
In [21]:
# For each year and month, collect the fraction of each unit's classified
# days that fall in cluster 6 (the "baseload" shape).
dictionary = {}
for year in range(2002, 2017):
    dictionary[year] = {}
    # BUG FIX: initialise the month lists once per year; the original reset
    # them inside the unit loop, so only the last unit's value survived.
    for month in range(1, 13):
        dictionary[year][month] = []
    for i, row in plant_labels.iterrows():
        file_name = 'cf/' + str(row['orispl_code']) + '_' + str(row['unitid']) + '_' + str(year) + '.csv'
        if not os.path.isfile(file_name):
            continue
        counts = get_month_counts(file_name)
        count = 0
        for month in range(1, 13):
            count += sum(counts[month])
        # Skip units with no classified days this year.  (was `count is 0`)
        if count == 0:
            continue
        for month in range(1, 13):
            month_total = sum(counts[month])
            # BUG FIX: guard months with no data -- the original divided by
            # zero here whenever one month was empty but the year was not.
            if month_total == 0:
                continue
            percent = float(counts[month][6]) / month_total
            dictionary[year][month].append(percent)
In [22]:
# Debugging leftover: inspects loop variables from the cell above.
# NOTE(review): `to_test` is local to get_month_counts and will NameError on
# a fresh kernel -- consider deleting this cell.
print month
print to_test
In [116]:
dictionary = {}
for year in range(2001, 2017):
dictionary[year] =[]
for i, row in plant_labels.iterrows():
file_name = 'cf/' + str(row['orispl_code']) + '_' + str(row['unitid']) + '_' + str(year) + '.csv'
if not os.path.isfile(file_name): continue
counts = get_counts(file_name)
if sum(counts) == 0: continue
percent = float(counts[6])/sum(counts)
if percent > 0.7 and year > 2010:
print year, row
dictionary[year].append(percent)
In [102]:
# Histogram the per-unit baseload fractions into ten 0.1-wide buckets per
# year and write the table to percent_baseload.csv.
to_csv = {}
field_names = []
field_names.append("Year")
for min_percentage in range(10):
    field_names.append(str(float(min_percentage) / 10) + " <= x < " + str(float(min_percentage + 1) / 10))
for year in dictionary:
    to_csv[year] = {}
    for min_percentage in range(10):
        lo = float(min_percentage) / 10
        hi = float(min_percentage + 1) / 10
        count = 0
        for entry in dictionary[year]:
            # BUG FIX: close the top bucket so a unit with exactly 100%
            # baseload days (entry == 1.0) is counted instead of dropped.
            if (lo <= entry < hi) or (min_percentage == 9 and entry == 1.0):
                count += 1
        to_csv[year][field_names[min_percentage + 1]] = count
    to_csv[year]['Year'] = year
with open('percent_baseload.csv', 'wb') as csvfile:  # py2 csv wants binary mode
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()
    for year in to_csv:
        writer.writerow(to_csv[year])
In [2]:
# Unit-level label tables restricted to single snapshot years (reuses the
# open sqlite connection `con` from the first cell).
year_labels = {}
for year in [2001, 2006, 2011, 2016]:
    # Parameterized query instead of string concatenation; also use plain
    # `=` rather than SQLite's `IS` operator for equality.
    year_labels[year] = pandas.read_sql(
        "select orispl_code, unitid, name, latitude, longitude, county, "
        "state, sum(CAST(gload as float)) as total_gen from data "
        "where year = ? group by orispl_code, unitid",
        con, params=(year,))
In [14]:
# Making a map visualization
# Making a map visualization: one CSV per snapshot year with location,
# baseload fraction (cluster 6), and total generation for every unit that
# has capacity-factor data on disk.
import csv
fieldnames = ['Name', 'Unit ID', 'Latitude', 'Longitude', 'County', '% of days in baseload', 'Generation']
for year in [2001, 2006, 2011, 2016]:
    with open('k-means/baseload_map_' + str(year) + '.csv', 'wb') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for i, row in year_labels[year].iterrows():
            file_name = 'cf/' + str(row['orispl_code']) + '_' + str(row['unitid']) + '_' + str(year) + '.csv'
            if not os.path.isfile(file_name):
                continue
            counts = get_counts(file_name)
            total_days = sum(counts)
            if total_days == 0:
                continue
            writer.writerow({
                'Name': row['name'],
                'Unit ID': row['unitid'],
                'Latitude': row['latitude'],
                'Longitude': row['longitude'],
                'County': row['county'] + ", " + row['state'],
                '% of days in baseload': float(counts[6]) / total_days,
                'Generation': row['total_gen'],
            })
In [143]:
In [ ]: