Following The Data Analysis in Python with Pandas Video https://www.youtube.com/watch?v=w26x-z-BdWQ And other resources
In [1]:
from pandas import *
#turn off html
#pandas.set_option(notebook_repr_html=False)
#pandas.set_printoptions(...) is deprecated. Instead, use pandas.set_option.
#figure this out later
#still broken
#pandas.set_option(max_rows=5000)
#also broken.
import pandas
pandas.__version__
#NnOTpandas.set_printoptions(notebook_repr_html=False) # turn off html, i.e., without table.
#NOTpandas.set_printoptions(max_rows=500) # getting the summary of the view versus getting everything. Terminal takes time to buffer.
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
import pandas
pandas.__version__
Out[2]:
In [3]:
Res_Primary_E_Consumption = read_csv('ResPrimaryEnergyConsumptionBtu2.csv')
#Res_Primary_E_Consumption = read_csv('Res Primary Energy Consumption Btu.csv')
In [5]:
# Drop a column-- for previous file
#http://stackoverflow.com/questions/13411544/delete-column-from-pandas-dataframe
# DF = DF.drop('column_name', 1)
#Res_Primary_E_Consumption = Res_Primary_E_Consumption.drop('Unnamed: 11', 1)
# how to do this permanently? put it with the open file command?
In [6]:
#Res_Primary_E_Consumption.head(10)
#locking up. can't scroll.
# I hate technology.
# remove the html?
In [7]:
#Res_Primary_E_Consumption.tail(10)
In [8]:
# names
#why not showing the summary info as his first example did?
#names.head()
#names[names.year == 1880].tail()
#boolean array in numpy-- scalar operation.
#why not showing the summary info as his first example did?
#boys = names[names.sex == 'boy']
#girls = names[names.sex == 'girl']
In [4]:
#names.groupby(['sex','year']).size()
#names.groupby(['year','sex']).size().ix[2000]
Res_Primary_E_Consumption.groupby(['Year','NaturalGas']).size().ix[2000]
#a hierarchical index
# a series.
Out[4]:
In [5]:
#names.groupby(['sex','year']).size()
#names.groupby(['year','sex']).size().ix[2000]
Res_Primary_E_Consumption.groupby(['Year','NaturalGas']).size().ix[1980]
#a hierarchical index
# a series.
Out[5]:
In [6]:
#names.groupby(['sex','year']).size()
#names.groupby(['year','sex']).size().ix[2000]
Res_Primary_E_Consumption.groupby(['Year','NaturalGas_perc']).size().ix[1980]
#a hierarchical index
# a series.
#this variable name works. and it seems to accept the highest percent 30%.
# but it seems to have a problem with the datatype later.
Out[6]:
In [109]:
#boys[boys.year== 2000].prop.idxmax()
# gives the index of the maximum value: idmax
#boys.ix[boys[boys.year== 2000].prop.idxmax()]
# for the whole row
Res_Primary_E_Consumption.ix[Res_Primary_E_Consumption[Res_Primary_E_Consumption.Year== 2000].TOTAL.idxmax()]
Out[109]:
In [7]:
def get_max_record(group):
return group.ix[group.TOTAL.idxmax()]
get_max_record(Res_Primary_E_Consumption)
Out[7]:
In [13]:
In [14]:
#Rename a column
# http://stackoverflow.com/questions/11346283/renaming-columns-in-pandas
# df.columns.values[0]='XX'
#Res_Primary_E_Consumption.columns.values[1]='NaturalGas'
In [15]:
#Rename a column
# http://stackoverflow.com/questions/11346283/renaming-columns-in-pandas
# df.columns.values[0]='XX'
#Res_Primary_E_Consumption.columns.values[-3]='MYTOTAL'
In [16]:
#Rename a column
# http://stackoverflow.com/questions/11346283/renaming-columns-in-pandas
# df.columns.values[0]='XX'
#Res_Primary_E_Consumption.columns.values[2]='NaturalGas_perc'
In [17]:
#Rename a column
# http://stackoverflow.com/questions/11346283/renaming-columns-in-pandas
# df.columns.values[0]='XX'
#Res_Primary_E_Consumption.columns.values[15]='Growth Rate(2010-Year)'
In [17]:
In [1]:
# split by year-- proportion in each year and stack them.
#
#def get_max_record(group):
# return group.ix[group.prop.idxmax()]
#result =boys.groupby('year').apply(get_max_record)
# my full data is not showing due to html, which I cannot remove.
# diversity of data going up.
def get_max_record(group):
return group.ix[group.TOTAL.idxmax()]
result =Res_Primary_E_Consumption.groupby('Year').apply(get_max_record)
#why am I getting this error? The TOTAL column has been renamed to YESTOTAL.
#AttributeError: 'DataFrame' object has no attribute 'YESTOTAL'
#YESTOTAL works beow with result.
#no error when returned to TOTAL.
In [9]:
# split by year-- proportion in each year and stack them.
#
#def get_max_record(group):
# return group.ix[group.prop.idxmax()]
#result =boys.groupby('year').apply(get_max_record)
# my full data is not showing due to html, which I cannot remove.
# diversity of data going up.
def get_max_record(group):
return group.ix[group.NaturalGas.idxmax()]
NG_result =Res_Primary_E_Consumption.groupby('Year').apply(get_max_record)
In [21]:
# split by year-- proportion in each year and stack them.
#
#def get_max_record(group):
# return group.ix[group.prop.idxmax()]
#result =boys.groupby('year').apply(get_max_record)
# my full data is not showing due to html, which I cannot remove.
# diversity of data going up.
# NaturalGas
##def get_max_record(group):
# return group.ix[group.NaturalGas_perc.idxmax()]
## return group.ix[group.Renewable_perc.idxmax()]
##NG_result =Res_Primary_E_Consumption.groupby('Year').apply(get_max_record)
#Renewable_perc
# return group.ix[group.NaturalGas.idxmax()]
# ??? AttributeError: 'DataFrame' object has no attribute 'NaturalGas'
# Why Not? It has TOTAL
# return group.ix[group.NaturalGas_perc.idxmax()]
#ValueError: invalid literal for float(): 30%
# does it have this. but is it yeilding an error on this?
In [18]:
result.TOTAL
Out[18]:
In [19]:
NG_result.NaturalGas
Out[19]:
In [13]:
# Regression Test-- Should be R^2= 1.00 since the independent variables add to TOTAL.
#df name: Res_Primary_E_Consumption
#http://allendowney.blogspot.com/
# from http://allendowney.blogspot.com/
#NaturalGas Petroleum TOTAL
import statsmodels.formula.api as smf
model = smf.ols('TOTAL ~ NaturalGas + Petroleum + Renewable + Electricity_Total', data=Res_Primary_E_Consumption)
results = model.fit()
results.summary()
Out[13]:
In [14]:
result.TOTAL.plot()
# linear increase since modeled after 2010.
# but hits later
Out[14]:
In [15]:
result.NaturalGas.plot()
# linear increase since modeled after 2010.
# but hits later
Out[15]:
In [16]:
result[:50]
Out[16]:
In [17]:
# df.[variablename].cumsum()
Res_Primary_E_Consumption.TOTAL.cumsum()[:130]
#first 130
Out[17]:
In [ ]:
In [20]:
TestData2 = read_csv('TestData2.csv')
In [21]:
# pandas join function is merge
#merge?
merge(Res_Primary_E_Consumption, TestData2, on= [ 'Year'])
#simple join on year
#inner join is default
Out[21]:
In [51]:
merged = merge(Res_Primary_E_Consumption, TestData2, on= ['Year'])
In [72]:
from __future__ import division
In [93]:
merged['NG'] = np.float32(merged.NaturalGas)
In [95]:
merged['TTL'] = np.float32(merged.TOTAL)
In [101]:
merged['RNW'] = np.float32(merged.Renewable)
In [101]:
merged['RNW'] = np.float32(merged.Renewable)
In [98]:
# Create a new proportion variable
merged['NG_percent'] = (merged.NG / merged.TTL)
In [106]:
#Create a new variable
merged['RNW_percent'] = (merged.RNW / merged.TTL)
In [23]:
TestData3 = read_csv('TestData3_missingsomeyears.csv')
In [24]:
# pandas join function is merge
#merge?
merge(Res_Primary_E_Consumption, TestData3, on= [ 'Year'])
#simple join on year
#inner join is default
# check for merge of missing data-- should have all and accept some blank data
### But no-- it just takes those columns that match.
# need to find the pandas command for those that
Out[24]:
In [25]:
merged_data3 = merge(Res_Primary_E_Consumption, TestData3, on= ['Year'])
In [26]:
merged_data3.tail(10)
Out[26]:
In [53]:
# total test.. Create a new column called TOTALtest,
# need to add the number of persons
merged['TOTALtest'] = np.floor(merged.TOTAL * merged.Col2)
#decimal numbers so use npl.floor
In [ ]:
#TRY other merge options.
# pandas join function is merge
#merge?
# DOes the order matter? NO
#merge(TestData3, Res_Primary_E_Consumption, on= [ 'Year'])
#simple join on year
#inner join is default
# it just takes those columns that match.
# inner is union?
# need to find the pandas command for those that
#how to right outer and lerft outer joins in pandas?
In [99]:
merged.head()
Out[99]:
In [ ]:
# total test.. Create a new column called TOTALtest,
# need to add the number of persons
merged['TOTALtest'] = np.floor(merged.TOTAL * merged.Col2)
#decimal numbers so use npl.floor
In [57]:
merged.groupby(['Year', 'TOTALtest'])['Col3'].sum()
#for each year TOTALtest combo-- the total number of Col3.
#hierarchical labeling.
Out[57]:
In [142]:
# Regression Test--
#made up independent variable.
import statsmodels.formula.api as smf
model = smf.ols('merged.TOTAL ~ merged.Col2', data=merged)
results_merged = model.fit()
results_merged.summary()
Out[142]:
In [ ]:
In [84]:
#result = _ # underscore in ipython: the output of the last statement, because we don't want to compute again the same thing.
In [108]:
#result['Change_since2010'].plot()
#result['TTL'].plot()
In [140]:
#import numpy as np
#import matplotlib.pyplot as plt
# evenly sampled time at 200ms intervals
#t = np.arange(0., 5., 0.2)
# red dashes, blue squares and green triangles
#plt.plot(merged.Year, merged.TOTAL, 'r--')
#plt.show()
plt.plot(merged.Year, merged.TOTAL, 'rs', merged.Year,merged.NaturalGas, 'b--', merged.Year,merged.Coal, 'k--',merged.Year,merged.Renewable, 'g--',
merged.Year,merged.Electricity_Total, 'm--')
plt.ylabel('MMBtu')
plt.xlabel('Year')
plt.title('US Residential Energy Use (MMBtu)',fontsize=16)
plt.show()
#result['merged.TOTAL',
#'NaturalGas',
#'Petroleum',
#'Coal',
#'Renewable',
#'Electricity_Total'].plot()
In [96]:
mgirls = merge(girls, births) # inner join by default.
In [97]:
mgirls['persons'] = np.floor(mgirls.prop * mgirls.births)
In [98]:
gpersons = mgirls.set_index(['year', 'name']).persons # Select out persons
In [144]:
type(merged.TTL) # One dimensional series.
Out[144]:
In [145]:
result = _ # underscore in ipython: the output of the last statement, because we don't want to compute again the same thing.
In [146]:
#pandas.DataFrame.mean
#DataFrame.mean(axis=None, skipna=None, level=None, numeric_only=None, **kwargs)
#Return the mean of the values for the requested axis
merged.TTL.mean()
Out[146]:
In [33]:
#pandas.DataFrame.mean
#DataFrame.mean(axis=None, skipna=None, level=None, numeric_only=None, **kwargs)
#Return the mean of the values for the requested axis
Res_Primary_E_Consumption.mean()
Out[33]:
In [34]:
#pandas.DataFrame.mean
#DataFrame.mean(axis=None, skipna=None, level=None, numeric_only=None, **kwargs)
#Return the mean of the values for the requested axis
Res_Primary_E_Consumption.TOTAL.mean()
Out[34]:
In [35]:
Res_Primary_E_Consumption.TOTAL.mean(axis=None, skipna=None, level=None, numeric_only=None)
Out[35]:
In [ ]: