In [1]:
import sqlite3
import pandas
# Open the local SQLite database that holds the raw generation records.
con = sqlite3.connect('coal.db')

# One row per distinct (orispl_code, unitid) pair present in the `data` table;
# the cells below iterate over these pairs.
unit_query = "select orispl_code, unitid from data group by orispl_code, unitid"
labels = pandas.read_sql(unit_query, con)
In [2]:
import matplotlib.pyplot as plt
import numpy as np
# Map each (orispl_code, unitid) pair to the `name` value found in its yearly
# capacity-factor CSVs.  Every year from 2001 through 2016 is read, so when
# several years have rows the value from the latest such year is kept.
names = {}
for i, row in labels.iterrows():
    key = (row['orispl_code'], row['unitid'])
    for year in range(2001, 2017):
        # Files are laid out as cf/<orispl_code>_<unitid>_<year>.csv
        file_stem = 'cf/' + str(row['orispl_code']) + '_' + row['unitid'] + '_' + str(year)
        frame = pandas.read_csv(file_stem + '.csv')
        # Skip years with no rows.  Compare by value: `is 0` is an identity
        # check that only happened to work through CPython's small-int cache.
        if len(frame.name) == 0:
            continue
        name = frame.name[0]
        names[key] = name
        # Disabled: per-unit, per-year histogram of capacity factors saved as PNG.
        # plt.gcf().clear()
        # data = frame.capacity_factor
        # plt.hist(data, bins=np.linspace(0, 1, 10))
        # plt.xlabel('Capacity factor - hourly generation / max hourly generation')
        # plt.ylabel('Frequency')
        # plt.title('Capacity Factors at ' + name + ', unit ' + row['unitid'] + ' in ' + str(year))
        # plt.savefig('cf/' + str(row['orispl_code']) + '_' + row['unitid'] + '_' + str(year) + '.png')
In [13]:
# Build per-day capacity-factor statistics for every unit and year:
#   data[(name, unitid)][year] -> list of tuples, one per distinct op_date:
#       (mean, std, std of the mean-normalised values, number of rows, raw values)
# NOTE(review): units are keyed by (name, unitid); if two plants can share a
# name and a unitid their entries would overwrite each other -- confirm.
data = {}
for i, row in labels.iterrows():
    key = (names[(row['orispl_code'], row['unitid'])], row['unitid'])
    data[key] = {}
    for year in range(2001, 2017):
        data[key][year] = []
        frame = pandas.read_csv('cf/' + str(row['orispl_code']) + '_' + row['unitid'] + '_' + str(year) + '.csv')
        cfs = frame.capacity_factor
        dates = frame.op_date.unique()
        for date in dates:
            col = cfs[(frame.op_date == date)]
            # An empty selection is possible (a missing op_date never compares
            # equal to itself).  Compare by value: `is 0` is an identity check
            # that only worked through CPython's small-int cache.
            if len(col) == 0:
                continue
            daily_mean = np.average(col)
            # The third entry divides by the daily mean, so it is NaN/inf for a
            # day whose mean is 0; compare_years only reads the first two entries.
            data[key][year].append((daily_mean, np.std(col), np.std(col / daily_mean), len(col), col))
            # col = col[(col != 0)]  Disabled: this would drop the zero values, keeping only nonzero readings
In [32]:
import math
import scipy.stats as stats
def _plot_pvalue_summary(pvalues, passes, test_name, year1, year2, alpha):
    """Show a histogram of p-values, then a histogram of the pass/fail flags."""
    plt.gcf().clear()
    plt.hist(pvalues, bins=np.linspace(0, 1, 20))
    plt.xlabel('P value from ' + test_name)
    plt.ylabel('Frequency')
    plt.title('P value from ' + test_name + ' between years ' + str(year1) + " and " + str(year2))
    plt.show()
    plt.gcf().clear()
    # Plot the flags as explicit 0/1 integers, matching the axis label, rather
    # than handing boolean values to the histogram.
    plt.hist([int(flag) for flag in passes])
    plt.title('P value from ' + test_name + ' >= ' + str(alpha))
    plt.ylabel('Frequency')
    plt.xlabel('True (1) or False (0)')
    plt.show()


def compare_years(year1, year2, alpha = .05):
    """Compare every unit's daily capacity-factor statistics between two years.

    Reads the module-level ``data`` dict, where ``data[unit][year]`` is a list
    of per-day tuples whose first element is the daily mean and whose second
    element is the daily standard deviation.

    For each unit that has entries for both years, two unequal-variance
    t-tests (``stats.ttest_ind(..., equal_var=False)``) are run: one on the
    daily means and one on the daily standard deviations.  Units for which
    either p-value is NaN are skipped.

    Four histograms are shown: the p-values and the ``p >= alpha`` flags for
    the means (labelled "t test"), then the same for the standard deviations.
    NOTE(review): the second pair is labelled "f test" in the plots although
    it is computed with a t-test on the daily standard deviations; the labels
    are kept as they were.

    Finally the units whose standard-deviation p-value is >= ``alpha`` are
    printed.

    Parameters:
        year1: key of the first year to compare.
        year2: key of the second year to compare.
        alpha: threshold a p-value must reach to be counted as passing.

    Returns:
        None.  Results are shown as plots and printed.
    """
    ttests = []
    passes_t = []
    ftests = []
    passes_f = []
    passes_list = []
    for unit in data:
        per_year = data[unit]
        # Only units with an entry for both years can be compared.
        if year1 not in per_year or year2 not in per_year:
            continue
        averages_min = [day[0] for day in per_year[year1]]
        stds_min = [day[1] for day in per_year[year1]]
        averages_max = [day[0] for day in per_year[year2]]
        stds_max = [day[1] for day in per_year[year2]]
        mean_ttest = stats.ttest_ind(averages_max, averages_min, equal_var=False).pvalue
        std_ttest = stats.ttest_ind(stds_max, stds_min, equal_var=False).pvalue
        # Drop the unit entirely when either test could not produce a p-value.
        if math.isnan(mean_ttest) or math.isnan(std_ttest):
            continue
        passes_t.append(mean_ttest >= alpha)
        ttests.append(mean_ttest)
        passes_f.append(std_ttest >= alpha)
        if std_ttest >= alpha:
            passes_list.append(unit)
        ftests.append(std_ttest)
    _plot_pvalue_summary(ttests, passes_t, 't test', year1, year2, alpha)
    _plot_pvalue_summary(ftests, passes_f, 'f test', year1, year2, alpha)
    # Function-call form prints the same thing on Python 2 and is valid on Python 3.
    print(passes_list)
# compare_years(2001, 2011)
In [38]:
# Compare 2001 against 2012 with a much stricter threshold than the default of .05.
compare_years(2001, 2012, alpha=0.0001)
In [ ]: