In [1]:
import sqlite3
import pandas
# Open the coal database; `con` stays open as a module-level handle.
con = sqlite3.connect('coal.db')

# One row per distinct (orispl_code, unitid) pair present in the `data` table.
labels = pandas.read_sql(
    sql="select orispl_code, unitid from data group by orispl_code, unitid",
    con=con,
)
In [2]:
import matplotlib.pyplot as plt
import numpy as np
# Map every (orispl_code, unitid) pair to the plant name stored in its
# per-year CSV files under cf/.  Units whose files contain no rows get no
# entry in `names`.
names = {}
for _, row in labels.iterrows():
    plant_code = row['orispl_code']
    unit_id = row['unitid']
    # Walk the years newest-first and stop at the first file that has rows:
    # this yields the same name as scanning 2001..2016 and keeping the last
    # one seen, without reading every file.
    for year in reversed(range(2001, 2017)):
        # format() also copes with unit ids that are not strings.
        path = 'cf/{}_{}_{}.csv'.format(plant_code, unit_id, year)
        # Only the first value of the `name` column is needed, so avoid
        # loading the whole file.
        first_row = pandas.read_csv(path, usecols=['name'], nrows=1)
        if first_row.empty:
            continue
        names[(plant_code, unit_id)] = first_row['name'].iloc[0]
        break
In [129]:
# Build capacity-factor statistics per unit and year.
#
# Layout: data[(plant name, unitid)][year] ->
#     (mean, std, std of mean-normalised values, sample count, samples)
# compare_years() reads element 4 (the samples) of these tuples.  A year is
# only stored once it has samples, so `year in data[unit]` means "has data".
#
# NOTE(review): the three trailing `break`s stop after the first non-empty
# date of the first year of the first named unit.  This looks like debugging
# scaffolding -- remove them once the per-date logic is settled.
# TODO: the earlier per-year variant also considered dropping zero readings
# (col[col != 0]) before aggregating; decide whether that filter is wanted.
data = {}
for _, row in labels.iterrows():
    unit_id = row['unitid']
    plant_name = names.get((row['orispl_code'], unit_id))
    if plant_name is None:
        # `names` only has entries for units with at least one non-empty CSV.
        continue
    key = (plant_name, unit_id)
    data[key] = {}
    for year in range(2001, 2017):
        frame = pandas.read_csv(
            'cf/{}_{}_{}.csv'.format(row['orispl_code'], unit_id, year))
        capacity = frame.capacity_factor
        for date in frame.op_date.unique():
            print(date)
            # Slice into a fresh variable: overwriting `capacity` here would
            # make every later date filter an already-filtered series.
            day_values = capacity[frame.op_date == date]
            if len(day_values) == 0:
                continue
            mean = np.average(day_values)
            data[key][year] = (
                mean,
                np.std(day_values),
                np.std(day_values / mean),
                len(day_values),
                day_values,
            )
            print(day_values)
            break
        break
    break
In [120]:
import math
import scipy.stats as stats
def _show_pvalue_hist(pvalues, test_label, year1, year2):
    """Show a histogram (20 bin edges over [0, 1]) of one test's p-values."""
    plt.gcf().clear()
    plt.hist(pvalues, bins=np.linspace(0, 1, 20))
    plt.xlabel('P value from ' + test_label + ' test')
    plt.ylabel('Frequency')
    plt.title('P value from ' + test_label + ' test between years '
              + str(year1) + " and " + str(year2))
    plt.show()


def _show_pass_hist(passes, test_label, alpha):
    """Show how many units had a p-value at or above ``alpha`` for one test."""
    plt.gcf().clear()
    # Cast to int so the histogram does not depend on how the installed
    # numpy treats boolean input; the axis is labelled 1/0 anyway.
    plt.hist([int(passed) for passed in passes])
    plt.title('P value from ' + test_label + ' test >= ' + str(alpha))
    plt.ylabel('Frequency')
    plt.xlabel('True (1) or False (0)')
    plt.show()


def compare_years(year1, year2, alpha):
    """Compare each unit's capacity-factor samples between two years.

    Iterates over the module-level ``data`` dict.  For every unit that has a
    populated entry for both years, runs ``scipy.stats.ttest_ind`` with
    ``equal_var=False`` and ``scipy.stats.levene`` on the two sample sets
    (element 4 of each entry).  Units whose t-test p-value is NaN are skipped.
    Prints the list of t-test p-values and shows four histograms: the p-value
    distribution and the pass/fail counts for each test.

    Args:
        year1: First year key to look up in ``data[unit]``.
        year2: Second year key to look up in ``data[unit]``.
        alpha: Threshold; a unit "passes" a test when its p-value >= alpha.

    Returns:
        None.
    """
    ttests = []
    passes_t = []
    ftests = []
    passes_f = []
    for unit in data:
        earlier = data[unit].get(year1)
        later = data[unit].get(year2)
        # Skip missing years and empty placeholder entries alike.
        if not earlier or not later:
            continue
        # Run each test once and reuse the p-value.
        t_pvalue = stats.ttest_ind(later[4], earlier[4], equal_var=False).pvalue
        if math.isnan(t_pvalue):
            continue
        levene_pvalue = stats.levene(later[4], earlier[4]).pvalue
        passes_t.append(t_pvalue >= alpha)
        ttests.append(t_pvalue)
        passes_f.append(levene_pvalue >= alpha)
        ftests.append(levene_pvalue)
    print(ttests)
    _show_pvalue_hist(ttests, 't', year1, year2)
    _show_pass_hist(passes_t, 't', alpha)
    # NOTE: the plots labelled 'f test' show Levene-test p-values.
    _show_pvalue_hist(ftests, 'f', year1, year2)
    _show_pass_hist(passes_f, 'f', alpha)
In [115]:
# Compare 2001 with 2002, using 0.05 as the p-value threshold.
compare_years(year1=2001, year2=2002, alpha=0.05)
In [117]:
# Compare 2001 with 2002, using 0.05 as the p-value threshold.
compare_years(year1=2001, year2=2002, alpha=0.05)
In [121]:
# Compare 2001 with 2002, using 0.05 as the p-value threshold.
compare_years(year1=2001, year2=2002, alpha=0.05)
In [ ]: