Residential Primary Energy Consumption, by Year and Fuel Type (2009)

Following the "Data Analysis in Python with Pandas" video (https://www.youtube.com/watch?v=w26x-z-BdWQ) and other resources.


In [1]:
from pandas import *
import pandas

# pandas.set_printoptions(...) is deprecated; use pandas.set_option instead.
# set_option takes the option name as a string, then the value -- the earlier
# keyword-style attempts were the broken ones:
#   pandas.set_option(notebook_repr_html=False)   # broken
#   pandas.set_option(max_rows=5000)              # broken
# Working form (turn off the html table repr; raise the row limit):
pandas.set_option('display.notebook_repr_html', False)
pandas.set_option('display.max_rows', 500)   # summary view vs. everything -- the terminal takes time to buffer

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import pandas
pandas.__version__


Out[2]:
'0.14.1'

In [3]:
Res_Primary_E_Consumption = read_csv('ResPrimaryEnergyConsumptionBtu2.csv')
#Res_Primary_E_Consumption = read_csv('Res Primary Energy Consumption Btu.csv')

In [5]:
# Drop a column-- for previous file
#http://stackoverflow.com/questions/13411544/delete-column-from-pandas-dataframe
# DF = DF.drop('column_name', 1)
#Res_Primary_E_Consumption = Res_Primary_E_Consumption.drop('Unnamed: 11', 1)
# how to do this permanently? put it with the open file command?
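
A minimal sketch answering the "permanently" question: drop() returns a new frame, so the drop sticks once you reassign (or pass inplace=True), and doing it right after read_csv keeps it with the load step. The 'Unnamed: 11' column came from the previous file and may not exist here, hence the guard.

In [ ]:
# Sketch -- assumes the stray column is named 'Unnamed: 11' (hypothetical here)
df = read_csv('ResPrimaryEnergyConsumptionBtu2.csv')
if 'Unnamed: 11' in df.columns:
    df = df.drop('Unnamed: 11', axis=1)   # reassigning makes the drop permanent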

In [6]:
#Res_Primary_E_Consumption.head(10)
#locking up. can't scroll.
# I hate technology.
# remove the html?

In [7]:
#Res_Primary_E_Consumption.tail(10)

In [8]:
# names
#why not showing the summary info as his first example did?
#names.head()
#names[names.year == 1880].tail()
#boolean array in numpy-- scalar operation.
#why not showing the summary info as his first example did?
#boys = names[names.sex == 'boy']
#girls = names[names.sex == 'girl']

In [4]:
# Video pattern: names.groupby(['year','sex']).size().ix[2000]
Res_Primary_E_Consumption.groupby(['Year','NaturalGas']).size().ix[2000]
# Returns a Series with a hierarchical index; .ix[2000] selects the Year == 2000 slice.


Out[4]:
NaturalGas
5.07          1
dtype: int64

In [5]:
#names.groupby(['sex','year']).size()
#names.groupby(['year','sex']).size().ix[2000]
Res_Primary_E_Consumption.groupby(['Year','NaturalGas']).size().ix[1980]
#a hierarchical index
# a series.


Out[5]:
NaturalGas
4.79          1
dtype: int64

In [6]:
#names.groupby(['sex','year']).size()
#names.groupby(['year','sex']).size().ix[2000]
Res_Primary_E_Consumption.groupby(['Year','NaturalGas_perc']).size().ix[1980]
#a hierarchical index
# a series. 

#this variable name works. and it seems to accept the highest percent 30%.
# but it seems to have a problem with the datatype later.


Out[6]:
NaturalGas_perc
30%                1
dtype: int64

In [109]:
#boys[boys.year== 2000].prop.idxmax()
# gives the index of the maximum value: idmax
#boys.ix[boys[boys.year== 2000].prop.idxmax()]
# for the whole row
Res_Primary_E_Consumption.ix[Res_Primary_E_Consumption[Res_Primary_E_Consumption.Year== 2000].TOTAL.idxmax()]


Out[109]:
Year                       2000
NaturalGas                 5.07
NaturalGas_perc             25%
Petroleum                  1.52
Petroleum_perc               7%
Coal                       0.01
Coal_perc                    0%
Renewable                  0.49
Renewable_perc               2%
Electricity_Sales          4.07
Electricity_Losses          9.2
Electricity_Total         13.27
Electricity_Total_perc      65%
TOTAL                     20.36
TOTAL_perc                 100%
Change_since2010              -
Name: 20, dtype: object

In [7]:
def get_max_record(group):
    return group.ix[group.TOTAL.idxmax()]
get_max_record(Res_Primary_E_Consumption)


Out[7]:
Year                       2035
NaturalGas                 4.76
NaturalGas_perc             20%
Petroleum                  0.87
Petroleum_perc               4%
Coal                       0.01
Coal_perc                    0%
Renewable                  0.55
Renewable_perc               2%
Electricity_Sales          5.94
Electricity_Losses        11.56
Electricity_Total          17.5
Electricity_Total_perc      74%
TOTAL                     23.69
TOTAL_perc                 100%
Change_since2010           0.3%
Name: 55, dtype: object

In [14]:
# Rename columns -- done earlier for this file; kept commented for reference.
# http://stackoverflow.com/questions/11346283/renaming-columns-in-pandas
# Writing into the values array (df.columns.values[0] = 'XX') mutates the index
# in place and can misbehave; rename() by position is the safer route:
#cols = Res_Primary_E_Consumption.columns
#Res_Primary_E_Consumption = Res_Primary_E_Consumption.rename(columns={
#    cols[1]:  'NaturalGas',
#    cols[2]:  'NaturalGas_perc',
#    cols[-3]: 'MYTOTAL',
#    cols[15]: 'Growth Rate(2010-Year)',
#})


In [1]:
# Split by year and take the max record in each group -- the video's pattern:
#
#def get_max_record(group):
#    return group.ix[group.prop.idxmax()]
#result = boys.groupby('year').apply(get_max_record)
# (full data still not displaying because of the html repr)
# In the video: diversity of names going up over time.

def get_max_record(group):
    return group.ix[group.TOTAL.idxmax()]
result =Res_Primary_E_Consumption.groupby('Year').apply(get_max_record)

# The NameError below: this cell ran in a freshly restarted kernel (note the
# counter reset to In [1]), so Res_Primary_E_Consumption wasn't loaded yet;
# re-running the read_csv cell fixes it. (The AttributeError noted here
# earlier was separate: TOTAL had temporarily been renamed to YESTOTAL, and
# the error went away once the name was changed back.)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-dcd6aec99d2e> in <module>()
      9 def get_max_record(group):
     10     return group.ix[group.TOTAL.idxmax()]
---> 11 result =Res_Primary_E_Consumption.groupby('Year').apply(get_max_record)
     12 
     13 #why am I getting this error? The TOTAL column has been renamed to YESTOTAL.

NameError: name 'Res_Primary_E_Consumption' is not defined

In [9]:
# split by year-- proportion in each year and stack them.
# 
#def get_max_record(group):
 #   return group.ix[group.prop.idxmax()]
#result =boys.groupby('year').apply(get_max_record)     
# my full data is not showing due to html, which I cannot remove.
# diversity of data going up.

def get_max_record(group):
    return group.ix[group.NaturalGas.idxmax()]
NG_result =Res_Primary_E_Consumption.groupby('Year').apply(get_max_record)

In [21]:
# Failed variants of the split-apply pattern, kept for the record:
#
#def get_max_record(group):
#    return group.ix[group.NaturalGas.idxmax()]
# AttributeError: 'DataFrame' object has no attribute 'NaturalGas'
# -- likely raised in an earlier session, before the column carried the
#    name NaturalGas; the same call works in In [9] above.
#
#    return group.ix[group.NaturalGas_perc.idxmax()]
# ValueError: invalid literal for float(): 30%
# -- the *_perc columns hold strings like '30%', so idxmax() can't coerce
#    them to floats; convert them first (sketch below).
#
#    return group.ix[group.Renewable_perc.idxmax()]
# -- same story for Renewable_perc.
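
A minimal sketch of that conversion, assuming every *_perc value looks like '30%' (the added _num column name is my own):

In [ ]:
# Strip the '%' and convert to a float share so numeric ops like idxmax() work
Res_Primary_E_Consumption['NaturalGas_perc_num'] = (
    Res_Primary_E_Consumption['NaturalGas_perc'].str.rstrip('%').astype(float) / 100)
Res_Primary_E_Consumption.groupby('Year').apply(
    lambda g: g.ix[g.NaturalGas_perc_num.idxmax()]).head()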

Total change over time


In [18]:
result.TOTAL


Out[18]:
Year
1980    15.72
1981    15.23
1982    15.48
1983    15.38
1984    15.90
1985    16.02
1986    15.94
1987    16.21
1988    17.12
1989    17.76
1990    16.91
1991    17.37
1992    17.31
1993    18.18
1994    18.07
1995    18.49
1996    19.47
1997    18.93
1998    18.92
1999    19.52
2000    20.36
2001    20.00
2002    20.74
2003    21.06
2004    21.05
2005    21.58
2006    20.64
2007    21.49
2008    21.53
2009    20.99
2010    22.07
2011    22.10
2012    21.77
2013    21.26
2014    21.15
2015    21.06
2016    21.13
2017    21.28
2018    21.41
2019    21.56
2020    21.66
2021    21.77
2022    21.89
2023    22.04
2024    22.19
2025    22.39
2026    22.51
2027    22.65
2028    22.78
2029    22.91
2030    23.06
2031    23.19
2032    23.30
2033    23.43
2034    23.56
2035    23.69
Name: TOTAL, Length: 56, dtype: float64

In [19]:
NG_result.NaturalGas


Out[19]:
Year
1980    4.79
1981    4.57
1982    4.68
1983    4.45
1984    4.64
1985    4.51
1986    4.38
1987    4.40
1988    4.72
1989    4.88
1990    4.47
1991    4.64
1992    4.78
1993    5.05
1994    4.94
1995    4.94
1996    5.35
1997    5.09
1998    4.64
1999    4.82
2000    5.07
2001    4.87
2002    4.98
2003    5.20
2004    4.97
2005    4.94
2006    4.46
2007    4.83
2008    5.00
2009    4.88
2010    5.06
2011    5.05
2012    5.17
2013    4.99
2014    5.00
2015    4.99
2016    4.98
2017    4.98
2018    4.96
2019    4.95
2020    4.95
2021    4.93
2022    4.92
2023    4.90
2024    4.89
2025    4.88
2026    4.87
2027    4.86
2028    4.85
2029    4.85
2030    4.84
2031    4.83
2032    4.81
2033    4.79
2034    4.78
2035    4.76
Name: NaturalGas, Length: 56, dtype: float64

In [13]:
# Regression test -- should give R^2 = 1.00, since the fuel components sum to TOTAL
# (Coal is left out of the formula; at ~0.01-0.04 it is absorbed by the intercept).
# df name: Res_Primary_E_Consumption
# Inspired by http://allendowney.blogspot.com/
import statsmodels.formula.api as smf
model = smf.ols('TOTAL ~ NaturalGas + Petroleum + Renewable + Electricity_Total', data=Res_Primary_E_Consumption)
results = model.fit()
results.summary()


Out[13]:
                      OLS Regression Results
Dep. Variable:     TOTAL              R-squared:            1.000
Model:             OLS                Adj. R-squared:       1.000
Method:            Least Squares      F-statistic:          1.510e+06
Date:              Tue, 28 Oct 2014   Prob (F-statistic):   1.11e-128
Time:              13:21:09           Log-Likelihood:       196.34
No. Observations:  56                 AIC:                  -382.7
Df Residuals:      51                 BIC:                  -372.6
Df Model:          4

                      coef   std err        t   P>|t|   [95.0% Conf. Int.]
Intercept           0.0365     0.041    0.897   0.374     -0.045    0.118
NaturalGas          0.9912     0.007  148.044   0.000      0.978    1.005
Petroleum           1.0144     0.010  106.295   0.000      0.995    1.034
Renewable           1.0358     0.012   88.663   0.000      1.012    1.059
Electricity_Total   0.9986     0.001  875.085   0.000      0.996    1.001

Omnibus:           1.606              Durbin-Watson:        1.869
Prob(Omnibus):     0.448              Jarque-Bera (JB):     1.333
Skew:              0.199              Prob(JB):             0.513
Kurtosis:          2.357              Cond. No.             607.
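
A quicker way to verify the additivity the regression was testing -- a sketch, assuming the component columns are numeric as loaded:

In [ ]:
# Max absolute gap between the sum of the components and TOTAL
# (should be ~0 up to rounding in the source table)
components = (Res_Primary_E_Consumption.NaturalGas
              + Res_Primary_E_Consumption.Petroleum
              + Res_Primary_E_Consumption.Coal
              + Res_Primary_E_Consumption.Renewable
              + Res_Primary_E_Consumption.Electricity_Total)
(components - Res_Primary_E_Consumption.TOTAL).abs().max()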

In [14]:
result.TOTAL.plot()
# Roughly linear increase after 2010, where the series switches from
# historical data to model projections.


Out[14]:
<matplotlib.axes.AxesSubplot at 0x7fdccd9ee710>

In [15]:
result.NaturalGas.plot()
# Natural gas is roughly flat historically and drifts slightly downward
# in the post-2010 projections.


Out[15]:
<matplotlib.axes.AxesSubplot at 0x7fdcf0b46390>

In [16]:
result[:50]


Out[16]:
Year NaturalGas NaturalGas_perc Petroleum Petroleum_perc Coal Coal_perc Renewable Renewable_perc Electricity_Sales Electricity_Losses Electricity_Total Electricity_Total_perc TOTAL TOTAL_perc Change_since2010
Year
1980 1980 4.79 30% 1.72 11% 0.03 0% 0.85 5% 2.45 5.89 8.33 53% 15.72 100% -
1981 1981 4.57 30% 1.52 10% 0.03 0% 0.87 6% 2.46 5.77 8.24 54% 15.23 100% -
1982 1982 4.68 30% 1.42 9% 0.03 0% 0.97 6% 2.49 5.89 8.38 54% 15.48 100% -
1983 1983 4.45 29% 1.33 9% 0.03 0% 0.97 6% 2.56 6.03 8.59 56% 15.38 100% -
1984 1984 4.64 29% 1.51 10% 0.04 0% 0.98 6% 2.66 6.07 8.73 55% 15.90 100% -
1985 1985 4.51 28% 1.55 10% 0.04 0% 1.01 6% 2.71 6.21 8.92 56% 16.02 100% -
1986 1986 4.38 28% 1.52 10% 0.04 0% 0.92 6% 2.79 6.27 9.07 57% 15.94 100% -
1987 1987 4.40 27% 1.60 10% 0.04 0% 0.85 5% 2.90 6.42 9.32 58% 16.21 100% -
1988 1988 4.72 28% 1.66 10% 0.04 0% 0.91 5% 3.05 6.75 9.79 57% 17.12 100% -
1989 1989 4.88 28% 1.64 9% 0.03 0% 0.97 5% 3.09 7.13 10.22 58% 17.76 100% -
1990 1990 4.47 26% 1.37 8% 0.03 0% 0.64 4% 3.15 7.24 10.39 61% 16.91 100% -
1991 1991 4.64 27% 1.36 8% 0.03 0% 0.67 4% 3.26 7.42 10.68 61% 17.37 100% -
1992 1992 4.78 28% 1.39 8% 0.03 0% 0.70 4% 3.19 7.22 10.41 60% 17.31 100% -
1993 1993 5.05 28% 1.42 8% 0.03 0% 0.61 3% 3.39 7.68 11.08 61% 18.18 100% -
1994 1994 4.94 27% 1.39 8% 0.02 0% 0.58 3% 3.44 7.70 11.14 62% 18.07 100% -
1995 1995 4.94 27% 1.35 7% 0.02 0% 0.58 3% 3.56 8.03 11.59 63% 18.49 100% -
1996 1996 5.35 27% 1.46 7% 0.02 0% 0.60 3% 3.69 8.35 12.04 62% 19.47 100% -
1997 1997 5.09 27% 1.40 7% 0.02 0% 0.49 3% 3.67 8.27 11.94 63% 18.93 100% -
1998 1998 4.64 25% 1.28 7% 0.01 0% 0.44 2% 3.86 8.69 12.55 66% 18.92 100% -
1999 1999 4.82 25% 1.43 7% 0.01 0% 0.46 2% 3.91 8.88 12.79 66% 19.52 100% -
2000 2000 5.07 25% 1.52 7% 0.01 0% 0.49 2% 4.07 9.20 13.27 65% 20.36 100% -
2001 2001 4.87 24% 1.50 7% 0.01 0% 0.43 2% 4.10 9.08 13.18 66% 20.00 100% -
2002 2002 4.98 24% 1.43 7% 0.01 0% 0.44 2% 4.32 9.57 13.89 67% 20.74 100% -
2003 2003 5.20 25% 1.49 7% 0.01 0% 0.46 2% 4.35 9.55 13.90 66% 21.06 100% -
2004 2004 4.97 24% 1.49 7% 0.01 0% 0.47 2% 4.41 9.70 14.10 67% 21.05 100% -
2005 2005 4.94 23% 1.42 7% 0.01 0% 0.49 2% 4.64 10.08 14.72 68% 21.58 100% -
2006 2006 4.46 22% 1.20 6% 0.01 0% 0.45 2% 4.61 9.91 14.53 70% 20.64 100% -
2007 2007 4.83 22% 1.22 6% 0.01 0% 0.50 2% 4.75 10.19 14.94 69% 21.49 100% -
2008 2008 5.00 23% 1.21 6% 0.01 0% 0.53 2% 4.71 10.08 14.79 69% 21.53 100% -
2009 2009 4.88 23% 1.14 5% 0.01 0% 0.52 2% 4.66 9.80 14.45 69% 20.99 100% -
2010 2010 5.06 23% 1.22 6% 0.01 0% 0.45 2% 4.95 10.39 15.34 69% 22.07 100% -
2011 2011 5.05 23% 1.19 5% 0.01 0% 0.47 2% 4.94 10.45 15.39 70% 22.10 100% 0.1%
2012 2012 5.17 24% 1.19 5% 0.01 0% 0.50 2% 4.84 10.07 14.91 68% 21.77 100% -0.7%
2013 2013 4.99 23% 1.13 5% 0.01 0% 0.48 2% 4.79 9.86 14.66 69% 21.26 100% -1.2%
2014 2014 5.00 24% 1.11 5% 0.01 0% 0.49 2% 4.79 9.75 14.54 69% 21.15 100% -1.1%
2015 2015 4.99 24% 1.08 5% 0.01 0% 0.51 2% 4.79 9.68 14.47 69% 21.06 100% -0.9%
2016 2016 4.98 24% 1.06 5% 0.01 0% 0.53 2% 4.82 9.72 14.54 69% 21.13 100% -0.7%
2017 2017 4.98 23% 1.05 5% 0.01 0% 0.53 2% 4.87 9.85 14.72 69% 21.28 100% -0.5%
2018 2018 4.96 23% 1.03 5% 0.01 0% 0.53 2% 4.93 9.95 14.88 69% 21.41 100% -0.4%
2019 2019 4.95 23% 1.02 5% 0.01 0% 0.53 2% 4.98 10.06 15.05 70% 21.56 100% -0.3%
2020 2020 4.95 23% 1.01 5% 0.01 0% 0.54 2% 5.02 10.15 15.17 70% 21.66 100% -0.2%
2021 2021 4.93 23% 1.00 5% 0.01 0% 0.54 2% 5.07 10.22 15.29 70% 21.77 100% -0.1%
2022 2022 4.92 22% 0.99 5% 0.01 0% 0.54 2% 5.13 10.31 15.44 71% 21.89 100% -0.1%
2023 2023 4.90 22% 0.98 4% 0.01 0% 0.54 2% 5.18 10.44 15.62 71% 22.04 100% 0.0%
2024 2024 4.89 22% 0.97 4% 0.01 0% 0.54 2% 5.24 10.55 15.79 71% 22.19 100% 0.0%
2025 2025 4.88 22% 0.95 4% 0.01 0% 0.54 2% 5.30 10.70 16.00 71% 22.39 100% 0.1%
2026 2026 4.87 22% 0.95 4% 0.01 0% 0.54 2% 5.37 10.78 16.14 72% 22.51 100% 0.1%
2027 2027 4.86 21% 0.94 4% 0.01 0% 0.54 2% 5.43 10.88 16.31 72% 22.65 100% 0.2%
2028 2028 4.85 21% 0.93 4% 0.01 0% 0.54 2% 5.49 10.96 16.45 72% 22.78 100% 0.2%
2029 2029 4.85 21% 0.92 4% 0.01 0% 0.55 2% 5.56 11.03 16.59 72% 22.91 100% 0.2%

In [17]:
# df.[variablename].cumsum()

Res_Primary_E_Consumption.TOTAL.cumsum()[:130]
#first 130


Out[17]:
0       15.72
1       30.95
2       46.43
3       61.81
4       77.71
5       93.73
6      109.67
7      125.88
8      143.00
9      160.76
10     177.67
11     195.04
12     212.35
13     230.53
14     248.60
15     267.09
16     286.56
17     305.49
18     324.41
19     343.93
20     364.29
21     384.29
22     405.03
23     426.09
24     447.14
25     468.72
26     489.36
27     510.85
28     532.38
29     553.37
30     575.44
31     597.54
32     619.31
33     640.57
34     661.72
35     682.78
36     703.91
37     725.19
38     746.60
39     768.16
40     789.82
41     811.59
42     833.48
43     855.52
44     877.71
45     900.10
46     922.61
47     945.26
48     968.04
49     990.95
50    1014.01
51    1037.20
52    1060.50
53    1083.93
54    1107.49
55    1131.18
Name: TOTAL, Length: 56, dtype: float64
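
A small variant, as a sketch: index by Year first, so the cumulative series is labeled by year rather than by row position and can be plotted directly.

In [ ]:
Res_Primary_E_Consumption.set_index('Year').TOTAL.cumsum().plot()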

In [20]:
TestData2 = read_csv('TestData2.csv')

In [21]:
# pandas' join function is merge (see `merge?` for the docstring)
merge(Res_Primary_E_Consumption, TestData2, on=['Year'])
# Simple join on Year; inner join is the default.


Out[21]:
Year NaturalGas NaturalGas_perc Petroleum Petroleum_perc Coal Coal_perc Renewable Renewable_perc Electricity_Sales Electricity_Losses Electricity_Total Electricity_Total_perc TOTAL TOTAL_perc Change_since2010 Col1 Col2 Col3
0 1980 4.79 30% 1.72 11% 0.03 0% 0.85 5% 2.45 5.89 8.33 53% 15.72 100% - 1 0.229849 1
1 1981 4.57 30% 1.52 10% 0.03 0% 0.87 6% 2.46 5.77 8.24 54% 15.23 100% - 2 0.003063 2
2 1982 4.68 30% 1.42 9% 0.03 0% 0.97 6% 2.49 5.89 8.38 54% 15.48 100% - 3 0.418250 1
3 1983 4.45 29% 1.33 9% 0.03 0% 0.97 6% 2.56 6.03 8.59 56% 15.38 100% - 4 0.689310 2
4 1984 4.64 29% 1.51 10% 0.04 0% 0.98 6% 2.66 6.07 8.73 55% 15.90 100% - 5 0.432489 3
5 1985 4.51 28% 1.55 10% 0.04 0% 1.01 6% 2.71 6.21 8.92 56% 16.02 100% - 6 0.987474 4
6 1986 4.38 28% 1.52 10% 0.04 0% 0.92 6% 2.79 6.27 9.07 57% 15.94 100% - 7 0.209367 5
7 1987 4.40 27% 1.60 10% 0.04 0% 0.85 5% 2.90 6.42 9.32 58% 16.21 100% - 8 0.582228 6
8 1988 4.72 28% 1.66 10% 0.04 0% 0.91 5% 3.05 6.75 9.79 57% 17.12 100% - 9 0.638592 7
9 1989 4.88 28% 1.64 9% 0.03 0% 0.97 5% 3.09 7.13 10.22 58% 17.76 100% - 10 0.286306 8
10 1990 4.47 26% 1.37 8% 0.03 0% 0.64 4% 3.15 7.24 10.39 61% 16.91 100% - 11 0.359775 9
11 1991 4.64 27% 1.36 8% 0.03 0% 0.67 4% 3.26 7.42 10.68 61% 17.37 100% - 12 0.130956 10
12 1992 4.78 28% 1.39 8% 0.03 0% 0.70 4% 3.19 7.22 10.41 60% 17.31 100% - 13 0.323788 11
13 1993 5.05 28% 1.42 8% 0.03 0% 0.61 3% 3.39 7.68 11.08 61% 18.18 100% - 14 0.053721 12
14 1994 4.94 27% 1.39 8% 0.02 0% 0.58 3% 3.44 7.70 11.14 62% 18.07 100% - 15 0.663227 13
15 1995 4.94 27% 1.35 7% 0.02 0% 0.58 3% 3.56 8.03 11.59 63% 18.49 100% - 16 0.633252 14
16 1996 5.35 27% 1.46 7% 0.02 0% 0.60 3% 3.69 8.35 12.04 62% 19.47 100% - 17 0.891918 15
17 1997 5.09 27% 1.40 7% 0.02 0% 0.49 3% 3.67 8.27 11.94 63% 18.93 100% - 18 0.495942 16
18 1998 4.64 25% 1.28 7% 0.01 0% 0.44 2% 3.86 8.69 12.55 66% 18.92 100% - 19 0.811847 17
19 1999 4.82 25% 1.43 7% 0.01 0% 0.46 2% 3.91 8.88 12.79 66% 19.52 100% - 20 0.609916 18
20 2000 5.07 25% 1.52 7% 0.01 0% 0.49 2% 4.07 9.20 13.27 65% 20.36 100% - 21 0.643762 19
21 2001 4.87 24% 1.50 7% 0.01 0% 0.43 2% 4.10 9.08 13.18 66% 20.00 100% - 22 0.505611 20
22 2002 4.98 24% 1.43 7% 0.01 0% 0.44 2% 4.32 9.57 13.89 67% 20.74 100% - 23 0.007572 21
23 2003 5.20 25% 1.49 7% 0.01 0% 0.46 2% 4.35 9.55 13.90 66% 21.06 100% - 24 0.336711 22
24 2004 4.97 24% 1.49 7% 0.01 0% 0.47 2% 4.41 9.70 14.10 67% 21.05 100% - 25 0.560270 23
25 2005 4.94 23% 1.42 7% 0.01 0% 0.49 2% 4.64 10.08 14.72 68% 21.58 100% - 26 0.925150 24
26 2006 4.46 22% 1.20 6% 0.01 0% 0.45 2% 4.61 9.91 14.53 70% 20.64 100% - 27 0.843730 25
27 2007 4.83 22% 1.22 6% 0.01 0% 0.50 2% 4.75 10.19 14.94 69% 21.49 100% - 28 0.308464 26
28 2008 5.00 23% 1.21 6% 0.01 0% 0.53 2% 4.71 10.08 14.79 69% 21.53 100% - 29 0.446437 27
29 2009 4.88 23% 1.14 5% 0.01 0% 0.52 2% 4.66 9.80 14.45 69% 20.99 100% - 30 0.866504 28
30 2010 5.06 23% 1.22 6% 0.01 0% 0.45 2% 4.95 10.39 15.34 69% 22.07 100% - 31 0.881132 29
31 2011 5.05 23% 1.19 5% 0.01 0% 0.47 2% 4.94 10.45 15.39 70% 22.10 100% 0.1% 32 0.347649 30
32 2012 5.17 24% 1.19 5% 0.01 0% 0.50 2% 4.84 10.07 14.91 68% 21.77 100% -0.7% 33 0.285232 31
33 2013 4.99 23% 1.13 5% 0.01 0% 0.48 2% 4.79 9.86 14.66 69% 21.26 100% -1.2% 34 0.429977 32
34 2014 5.00 24% 1.11 5% 0.01 0% 0.49 2% 4.79 9.75 14.54 69% 21.15 100% -1.1% 35 0.834912 33
35 2015 4.99 24% 1.08 5% 0.01 0% 0.51 2% 4.79 9.68 14.47 69% 21.06 100% -0.9% 36 0.851379 34
36 2016 4.98 24% 1.06 5% 0.01 0% 0.53 2% 4.82 9.72 14.54 69% 21.13 100% -0.7% 37 0.573889 35
37 2017 4.98 23% 1.05 5% 0.01 0% 0.53 2% 4.87 9.85 14.72 69% 21.28 100% -0.5% 38 0.035307 36
38 2018 4.96 23% 1.03 5% 0.01 0% 0.53 2% 4.93 9.95 14.88 69% 21.41 100% -0.4% 39 0.759066 37
39 2019 4.95 23% 1.02 5% 0.01 0% 0.53 2% 4.98 10.06 15.05 70% 21.56 100% -0.3% 40 0.211777 38
40 2020 4.95 23% 1.01 5% 0.01 0% 0.54 2% 5.02 10.15 15.17 70% 21.66 100% -0.2% 41 0.641565 39
41 2021 4.93 23% 1.00 5% 0.01 0% 0.54 2% 5.07 10.22 15.29 70% 21.77 100% -0.1% 42 0.537387 40
42 2022 4.92 22% 0.99 5% 0.01 0% 0.54 2% 5.13 10.31 15.44 71% 21.89 100% -0.1% 43 0.947203 41
43 2023 4.90 22% 0.98 4% 0.01 0% 0.54 2% 5.18 10.44 15.62 71% 22.04 100% 0.0% 44 0.878931 42
44 2024 4.89 22% 0.97 4% 0.01 0% 0.54 2% 5.24 10.55 15.79 71% 22.19 100% 0.0% 45 0.344067 43
45 2025 4.88 22% 0.95 4% 0.01 0% 0.54 2% 5.30 10.70 16.00 71% 22.39 100% 0.1% 46 0.559808 44
46 2026 4.87 22% 0.95 4% 0.01 0% 0.54 2% 5.37 10.78 16.14 72% 22.51 100% 0.1% 47 0.088891 45
47 2027 4.86 21% 0.94 4% 0.01 0% 0.54 2% 5.43 10.88 16.31 72% 22.65 100% 0.2% 48 0.551840 46
48 2028 4.85 21% 0.93 4% 0.01 0% 0.54 2% 5.49 10.96 16.45 72% 22.78 100% 0.2% 49 0.249141 47
49 2029 4.85 21% 0.92 4% 0.01 0% 0.55 2% 5.56 11.03 16.59 72% 22.91 100% 0.2% 50 0.014454 48
50 2030 4.84 21% 0.91 4% 0.01 0% 0.55 2% 5.63 11.12 16.76 73% 23.06 100% 0.2% 51 0.084236 49
51 2031 4.83 21% 0.90 4% 0.01 0% 0.55 2% 5.70 11.20 16.90 73% 23.19 100% 0.2% 52 0.301971 50
52 2032 4.81 21% 0.90 4% 0.01 0% 0.55 2% 5.76 11.28 17.04 73% 23.30 100% 0.2% 53 0.683238 51
53 2033 4.79 20% 0.89 4% 0.01 0% 0.55 2% 5.82 11.37 17.19 73% 23.43 100% 0.3% 54 0.841487 52
54 2034 4.78 20% 0.88 4% 0.01 0% 0.55 2% 5.88 11.47 17.35 74% 23.56 100% 0.3% 55 0.103193 53
55 2035 4.76 20% 0.87 4% 0.01 0% 0.55 2% 5.94 11.56 17.50 74% 23.69 100% 0.3% 56 0.127020 54
# Note: many-to-many joins produce the Cartesian product of the matching rows -- see the sketch below.
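
A toy demonstration of that behavior (frames of my own making):

In [ ]:
# Two rows with Year == 1980 on each side -> 2 x 2 = 4 rows in the result
left  = DataFrame({'Year': [1980, 1980], 'a': [1, 2]})
right = DataFrame({'Year': [1980, 1980], 'b': [3, 4]})
merge(left, right, on='Year')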

In [51]:
merged = merge(Res_Primary_E_Consumption, TestData2, on= ['Year'])

In [72]:
from __future__ import division   # true division under Python 2, for the share columns below

In [93]:
import numpy as np   # np isn't provided by `from pandas import *`
merged['NG'] = np.float32(merged.NaturalGas)

In [95]:
merged['TTL']  = np.float32(merged.TOTAL)

In [101]:
merged['RNW']  = np.float32(merged.Renewable)

In [98]:
# Create a new proportion variable
merged['NG_percent'] = (merged.NG / merged.TTL)

In [106]:
#Create a new variable
merged['RNW_percent'] = (merged.RNW / merged.TTL)

In [23]:
TestData3 = read_csv('TestData3_missingsomeyears.csv')

In [24]:
merge(Res_Primary_E_Consumption, TestData3, on=['Year'])
# Check how merge handles missing data -- expected it to keep every year and
# leave blanks, but no: the default inner join keeps only rows whose Year
# appears in *both* frames (TestData3 stops at 2009).
# To keep the unmatched rows, pass how='left'/'right'/'outer' -- sketch below.

Out[24]:
Year NaturalGas NaturalGas_perc Petroleum Petroleum_perc Coal Coal_perc Renewable Renewable_perc Electricity_Sales Electricity_Losses Electricity_Total Electricity_Total_perc TOTAL TOTAL_perc Change_since2010 Col1 Col2 Col3
0 1980 4.79 30% 1.72 11% 0.03 0% 0.85 5% 2.45 5.89 8.33 53% 15.72 100% - 1 0.229849 1
1 1981 4.57 30% 1.52 10% 0.03 0% 0.87 6% 2.46 5.77 8.24 54% 15.23 100% - 2 0.003063 2
2 1982 4.68 30% 1.42 9% 0.03 0% 0.97 6% 2.49 5.89 8.38 54% 15.48 100% - 3 0.418250 1
3 1983 4.45 29% 1.33 9% 0.03 0% 0.97 6% 2.56 6.03 8.59 56% 15.38 100% - 4 0.689310 2
4 1984 4.64 29% 1.51 10% 0.04 0% 0.98 6% 2.66 6.07 8.73 55% 15.90 100% - 5 0.432489 3
5 1985 4.51 28% 1.55 10% 0.04 0% 1.01 6% 2.71 6.21 8.92 56% 16.02 100% - 6 0.987474 4
6 1986 4.38 28% 1.52 10% 0.04 0% 0.92 6% 2.79 6.27 9.07 57% 15.94 100% - 7 0.209367 5
7 1987 4.40 27% 1.60 10% 0.04 0% 0.85 5% 2.90 6.42 9.32 58% 16.21 100% - 8 0.582228 6
8 1988 4.72 28% 1.66 10% 0.04 0% 0.91 5% 3.05 6.75 9.79 57% 17.12 100% - 9 0.638592 7
9 1989 4.88 28% 1.64 9% 0.03 0% 0.97 5% 3.09 7.13 10.22 58% 17.76 100% - 10 0.286306 8
10 1990 4.47 26% 1.37 8% 0.03 0% 0.64 4% 3.15 7.24 10.39 61% 16.91 100% - 11 0.359775 9
11 1991 4.64 27% 1.36 8% 0.03 0% 0.67 4% 3.26 7.42 10.68 61% 17.37 100% - 12 0.130956 10
12 1992 4.78 28% 1.39 8% 0.03 0% 0.70 4% 3.19 7.22 10.41 60% 17.31 100% - 13 0.323788 11
13 1993 5.05 28% 1.42 8% 0.03 0% 0.61 3% 3.39 7.68 11.08 61% 18.18 100% - 14 0.053721 12
14 1994 4.94 27% 1.39 8% 0.02 0% 0.58 3% 3.44 7.70 11.14 62% 18.07 100% - 15 0.663227 13
15 1995 4.94 27% 1.35 7% 0.02 0% 0.58 3% 3.56 8.03 11.59 63% 18.49 100% - 16 0.633252 14
16 1996 5.35 27% 1.46 7% 0.02 0% 0.60 3% 3.69 8.35 12.04 62% 19.47 100% - 17 0.891918 15
17 1997 5.09 27% 1.40 7% 0.02 0% 0.49 3% 3.67 8.27 11.94 63% 18.93 100% - 18 0.495942 16
18 1998 4.64 25% 1.28 7% 0.01 0% 0.44 2% 3.86 8.69 12.55 66% 18.92 100% - 19 0.811847 17
19 1999 4.82 25% 1.43 7% 0.01 0% 0.46 2% 3.91 8.88 12.79 66% 19.52 100% - 20 0.609916 18
20 2000 5.07 25% 1.52 7% 0.01 0% 0.49 2% 4.07 9.20 13.27 65% 20.36 100% - 21 0.643762 19
21 2001 4.87 24% 1.50 7% 0.01 0% 0.43 2% 4.10 9.08 13.18 66% 20.00 100% - 22 0.505611 20
22 2002 4.98 24% 1.43 7% 0.01 0% 0.44 2% 4.32 9.57 13.89 67% 20.74 100% - 23 0.007572 21
23 2003 5.20 25% 1.49 7% 0.01 0% 0.46 2% 4.35 9.55 13.90 66% 21.06 100% - 24 0.336711 22
24 2004 4.97 24% 1.49 7% 0.01 0% 0.47 2% 4.41 9.70 14.10 67% 21.05 100% - 25 0.560270 23
25 2005 4.94 23% 1.42 7% 0.01 0% 0.49 2% 4.64 10.08 14.72 68% 21.58 100% - 26 0.925150 24
26 2006 4.46 22% 1.20 6% 0.01 0% 0.45 2% 4.61 9.91 14.53 70% 20.64 100% - 27 0.843730 25
27 2007 4.83 22% 1.22 6% 0.01 0% 0.50 2% 4.75 10.19 14.94 69% 21.49 100% - 28 0.308464 26
28 2008 5.00 23% 1.21 6% 0.01 0% 0.53 2% 4.71 10.08 14.79 69% 21.53 100% - 29 0.446437 27
29 2009 4.88 23% 1.14 5% 0.01 0% 0.52 2% 4.66 9.80 14.45 69% 20.99 100% - 30 0.866504 28

In [25]:
merged_data3 = merge(Res_Primary_E_Consumption, TestData3, on= ['Year'])

In [26]:
merged_data3.tail(10)


Out[26]:
Year NaturalGas NaturalGas_perc Petroleum Petroleum_perc Coal Coal_perc Renewable Renewable_perc Electricity_Sales Electricity_Losses Electricity_Total Electricity_Total_perc TOTAL TOTAL_perc Change_since2010 Col1 Col2 Col3
20 2000 5.07 25% 1.52 7% 0.01 0% 0.49 2% 4.07 9.20 13.27 65% 20.36 100% - 21 0.643762 19
21 2001 4.87 24% 1.50 7% 0.01 0% 0.43 2% 4.10 9.08 13.18 66% 20.00 100% - 22 0.505611 20
22 2002 4.98 24% 1.43 7% 0.01 0% 0.44 2% 4.32 9.57 13.89 67% 20.74 100% - 23 0.007572 21
23 2003 5.20 25% 1.49 7% 0.01 0% 0.46 2% 4.35 9.55 13.90 66% 21.06 100% - 24 0.336711 22
24 2004 4.97 24% 1.49 7% 0.01 0% 0.47 2% 4.41 9.70 14.10 67% 21.05 100% - 25 0.560270 23
25 2005 4.94 23% 1.42 7% 0.01 0% 0.49 2% 4.64 10.08 14.72 68% 21.58 100% - 26 0.925150 24
26 2006 4.46 22% 1.20 6% 0.01 0% 0.45 2% 4.61 9.91 14.53 70% 20.64 100% - 27 0.843730 25
27 2007 4.83 22% 1.22 6% 0.01 0% 0.50 2% 4.75 10.19 14.94 69% 21.49 100% - 28 0.308464 26
28 2008 5.00 23% 1.21 6% 0.01 0% 0.53 2% 4.71 10.08 14.79 69% 21.53 100% - 29 0.446437 27
29 2009 4.88 23% 1.14 5% 0.01 0% 0.52 2% 4.66 9.80 14.45 69% 20.99 100% - 30 0.866504 28

In [53]:
# Test column TOTALtest (adapted from the video's persons = prop * births step)
merged['TOTALtest'] = np.floor(merged.TOTAL * merged.Col2)
# The products are decimal numbers, so np.floor rounds them down

In [ ]:
# Other merge options:
#
# Does the argument order matter? No -- only the column order of the
# result changes:
#merge(TestData3, Res_Primary_E_Consumption, on=['Year'])
#
# The default inner join keeps the *intersection* of the keys; the outer
# join is the one that keeps the union. Left/right/full outer joins all
# go through the how= argument -- sketch below.
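
A sketch of those how= variants, reusing the frames already loaded above:

In [ ]:
# how='left' keeps every year from the left frame, filling NaN where
# TestData3 has no match; 'right' mirrors that; 'outer' keeps the union.
left_join  = merge(Res_Primary_E_Consumption, TestData3, on=['Year'], how='left')
right_join = merge(Res_Primary_E_Consumption, TestData3, on=['Year'], how='right')
outer_join = merge(Res_Primary_E_Consumption, TestData3, on=['Year'], how='outer')
left_join.tail()   # post-2009 rows show NaN in the TestData3 columns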

In [99]:
merged.head()


Out[99]:
Year NaturalGas NaturalGas_perc Petroleum Petroleum_perc Coal Coal_perc Renewable Renewable_perc Electricity_Sales ... TOTAL TOTAL_perc Change_since2010 Col1 Col2 Col3 TOTALtest NG_percent NG TTL
0 1980 4.79 30% 1.72 11% 0.03 0% 0.85 5% 2.45 ... 15.72 100% - 1 0.229849 1 75 0.304707 4.79 15.72
1 1981 4.57 30% 1.52 10% 0.03 0% 0.87 6% 2.46 ... 15.23 100% - 2 0.003063 2 69 0.300066 4.57 15.23
2 1982 4.68 30% 1.42 9% 0.03 0% 0.97 6% 2.49 ... 15.48 100% - 3 0.418250 1 72 0.302326 4.68 15.48
3 1983 4.45 29% 1.33 9% 0.03 0% 0.97 6% 2.56 ... 15.38 100% - 4 0.689310 2 68 0.289337 4.45 15.38
4 1984 4.64 29% 1.51 10% 0.04 0% 0.98 6% 2.66 ... 15.90 100% - 5 0.432489 3 73 0.291824 4.64 15.90

5 rows × 23 columns



In [57]:
merged.groupby(['Year', 'TOTALtest'])['Col3'].sum()
# For each (Year, TOTALtest) combination: the sum of Col3.
# The result carries a hierarchical (two-level) index.


Out[57]:
Year  TOTALtest
1980  3             1
1981  0             2
1982  6             1
1983  10            2
1984  6             3
1985  15            4
1986  3             5
1987  9             6
1988  10            7
1989  5             8
1990  6             9
1991  2            10
1992  5            11
1993  0            12
1994  11           13
1995  11           14
1996  17           15
1997  9            16
1998  15           17
1999  11           18
2000  13           19
2001  10           20
2002  0            21
2003  7            22
2004  11           23
2005  19           24
2006  17           25
2007  6            26
2008  9            27
2009  18           28
2010  19           29
2011  7            30
2012  6            31
2013  9            32
2014  17           33
2015  17           34
2016  12           35
2017  0            36
2018  16           37
2019  4            38
2020  13           39
2021  11           40
2022  20           41
2023  19           42
2024  7            43
2025  12           44
2026  2            45
2027  12           46
2028  5            47
2029  0            48
2030  1            49
2031  7            50
2032  15           51
2033  19           52
2034  2            53
2035  3            54
Name: Col3, Length: 56, dtype: int64
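
One more step, as a sketch: unstack() pivots the inner index level out into columns (sparse here, since each Year pairs with a single TOTALtest value).

In [ ]:
merged.groupby(['Year', 'TOTALtest'])['Col3'].sum().unstack()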

In [142]:
# Regression test with a made-up independent variable -- Col2 is random
# noise, so R^2 should be ~0 (and is).
import statsmodels.formula.api as smf
model = smf.ols('merged.TOTAL ~ merged.Col2', data=merged)
results_merged = model.fit()
results_merged.summary()


Out[142]:
                      OLS Regression Results
Dep. Variable:     merged.TOTAL       R-squared:            0.000
Model:             OLS                Adj. R-squared:       -0.018
Method:            Least Squares      F-statistic:          0.008511
Date:              Sun, 26 Oct 2014   Prob (F-statistic):   0.927
Time:              15:35:14           Log-Likelihood:       -130.76
No. Observations:  56                 AIC:                  265.5
Df Residuals:      54                 BIC:                  269.6
Df Model:          1

                      coef   std err        t   P>|t|   [95.0% Conf. Int.]
Intercept          20.1465     0.669   30.101   0.000     18.805   21.488
merged.Col2         0.1100     1.193    0.092   0.927     -2.282    2.502

Omnibus:           7.837              Durbin-Watson:        0.032
Prob(Omnibus):     0.020              Jarque-Bera (JB):     5.402
Skew:              -0.616             Prob(JB):             0.0671
Kurtosis:          2.107              Cond. No.             4.38


In [84]:
#result = _   # underscore in ipython: the output of the last statement, because we don't want to compute again the same thing.

In [108]:
#result['Change_since2010'].plot()
#result['TTL'].plot()

In [140]:
#import numpy as np
#import matplotlib.pyplot as plt

# Warm-up from the matplotlib pyplot tutorial:
# evenly sampled time at 200ms intervals
#t = np.arange(0., 5., 0.2)
#plt.plot(merged.Year, merged.TOTAL, 'r--')
#plt.show()

# Red squares = TOTAL; dashed lines = the individual fuels.
plt.plot(merged.Year, merged.TOTAL, 'rs',
         merged.Year, merged.NaturalGas, 'b--',
         merged.Year, merged.Coal, 'k--',
         merged.Year, merged.Renewable, 'g--',
         merged.Year, merged.Electricity_Total, 'm--')
plt.ylabel('Quadrillion Btu')   # totals near 20 are quadrillions of Btu, not MMBtu
plt.xlabel('Year')
plt.title('US Residential Energy Use', fontsize=16)
plt.show()

# Frame-level alternative:
#result[['TOTAL', 'NaturalGas', 'Petroleum', 'Coal', 'Renewable', 'Electricity_Total']].plot()
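
A sketch of the same figure with labeled series and a legend, so the five lines are identified on the plot itself rather than by color-code comments:

In [ ]:
for col, style in [('TOTAL', 'rs'), ('NaturalGas', 'b--'), ('Coal', 'k--'),
                   ('Renewable', 'g--'), ('Electricity_Total', 'm--')]:
    plt.plot(merged.Year, merged[col], style, label=col)
plt.ylabel('Quadrillion Btu')
plt.xlabel('Year')
plt.title('US Residential Energy Use', fontsize=16)
plt.legend(loc='upper left')
plt.show()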



In [96]:
# These three cells are from the video's baby-names walkthrough; `girls` and
# `births` are never defined in this notebook, so they are left commented out.
#mgirls = merge(girls, births)  # inner join by default

In [97]:
#mgirls['persons'] = np.floor(mgirls.prop * mgirls.births)

In [98]:
#gpersons = mgirls.set_index(['year', 'name']).persons   # select out persons

In [144]:
type(merged.TTL)  # One dimensional series.


Out[144]:
pandas.core.series.Series

In [145]:
result = _   # underscore in IPython: the output of the last statement, so nothing is recomputed.
# Caution: the last output here was the type object from In [144], so this
# clobbers the `result` DataFrame built earlier.

In [146]:
# DataFrame/Series .mean(): mean of the values for the requested axis
merged.TTL.mean()


Out[146]:
20.199642811502731

In [33]:
# Frame-wide mean: one value per numeric column (string columns like
# the *_perc ones are skipped)
Res_Primary_E_Consumption.mean()


Out[33]:
Year                  2007.500000
NaturalGas               4.853214
Petroleum                1.232321
Coal                     0.016607
Renewable                0.599643
Electricity_Sales        4.341429
Electricity_Losses       9.158393
Electricity_Total       13.500179
TOTAL                   20.199643
dtype: float64

In [34]:
Res_Primary_E_Consumption.TOTAL.mean()


Out[34]:
20.199642857142855
From the DataFrame.mean docstring:

    DataFrame.mean(axis=None, skipna=None, level=None, numeric_only=None, **kwargs)
    Return the mean of the values for the requested axis.

    Parameters:
        axis : {index (0), columns (1)}
        skipna : boolean, default True -- exclude NA/null values; if an entire
            row/column is NA, the result will be NA
        level : int or level name, default None -- if the axis is a MultiIndex
            (hierarchical), count along a particular level, collapsing into a Series
        numeric_only : boolean, default None -- include only float, int, boolean
            data; if None, will attempt to use everything, then use only numeric data

    Returns:
        mean : Series or DataFrame (if level specified)

In [35]:
Res_Primary_E_Consumption.TOTAL.mean(axis=None, skipna=None, level=None, numeric_only=None)


Out[35]:
20.199642857142855
