12752 Data Driven Final Project

Analyzing Fuel Oil Consumption in U.S. Households Census by Region and Division


In [171]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
import copy
import sklearn
from sklearn import tree, feature_selection
from sklearn.feature_selection import SelectFromModel
from sklearn.cluster import KMeans
%matplotlib inline

In [30]:
f = open('/Users/apple/Desktop/data driven/project/dataset/oil consumption time series.csv')
t = pd.read_csv(f,sep=',', header='infer', parse_dates=[1])

In [31]:
t = t.set_index('Year')
del t.index.name

In [115]:
# Here is a part of example of the dataset
t


Out[115]:
Total Consumption Northeast Midwest South West Total Households Total Residential Buildings Total Floorspace Oil Price
1980 1524.12 1069.04 169.00 247.68 38.40 15.4 11.6 29.7 116.72
1981 1273.76 921.15 182.85 140.56 29.20 14.6 11.0 28.9 102.30
1982 1196.53 831.60 158.88 177.97 28.08 15.5 12.2 30.0 90.65
1984 1252.42 924.35 127.66 161.00 39.41 17.5 13.8 32.0 71.36
1987 1223.04 873.60 160.58 170.20 18.66 17.4 14.0 33.3 41.37
1990 1043.45 780.53 129.64 113.60 19.68 16.3 13.5 33.2 50.43
1993 1076.43 779.73 134.32 127.10 35.28 13.8 11.6 29.8 33.59
1997 1062.79 830.66 108.90 97.73 25.50 13.2 11.0 23.2 31.71
2001 754.47 601.37 62.53 74.52 16.05 11.2 9.4 26.0 35.81

In [33]:
t.index.tolist()


Out[33]:
[1980, 1981, 1982, 1984, 1987, 1990, 1993, 1997, 2001]

In [112]:
fig = plt.figure(figsize=(10,5))

plt.plot(t['Total Consumption'],label='Total Consumption')
plt.plot(t['Northeast'],'r',label='Northeast Consumption')
plt.plot(t['Midwest'],'g',label='Midwest Consumption')
plt.plot(t['South'],'black',label='South Consumption')
plt.plot(t['West'],'purple',label='West Consumption')

# plt.title('Oil Consumption Census by Region and Division,1980-2001')
plt.xlabel('Year')
plt.ylabel('Oil Consumption (trillion Btu)')
            
fig.tight_layout()
plt.legend()
plt.show()



In [35]:
Consumption=[]
for i in range (9):
    val=float(t['Total Consumption'].values[i])
    Consumption.append(val)

In [36]:
Northeast=[]
for i in range (9):
    val=float(t['Northeast'].values[i])
    Northeast.append(val)
    
Midwest=[]
for i in range (9):
    val=float(t['Midwest'].values[i])
    Midwest.append(val)
    
South=[]
for i in range (9):
    val=float(t['South'].values[i])
    South.append(val)
    
West=[]
for i in range (9):
    val=float(t['West'].values[i])
    West.append(val)

In [113]:
fig = plt.figure(figsize=(10,5))
x = np.array(t.index.tolist())
width = 0.18
opacity = 0.6
plt.bar(x, Consumption, width, alpha=opacity,color="blue",label='Total Consumption')
plt.bar(x+width, Northeast, width, alpha=opacity,color="red",label='Northeast Consumption')
plt.bar(x+2*width, Midwest, width, alpha=opacity,color="green",label='Midwest Consumption')
plt.bar(x+3*width, South, width, alpha=opacity,color="black",label='South Consumption')
plt.bar(x+4*width, West, width, alpha=opacity,color="purple",label='West Consumption')

plt.xticks(x + 2*width, (t.index))

# plt.title('Oil Consumption Census by Region and Division,1980-2001')
plt.xlabel('Year')
plt.ylabel('Oil Consumption (trillion Btu)')
fig.tight_layout()
plt.legend()


Out[113]:
<matplotlib.legend.Legend at 0x118d47e48>

In [39]:
y=Consumption

In [40]:
Year=t.index.tolist()

In [41]:
Households=[]
for i in range (9):
    val=float(t['Total Households'].values[i])
    Households.append(val)

In [42]:
Oil_price=[]
for i in range (9):
    val=float(t['Oil Price'].values[i])
    Oil_price.append(val)

In [43]:
Buildings=[]
for i in range (9):
    val=float(t['Total Residential Buildings'].values[i])
    Buildings.append(val)

In [44]:
Floorspace=[]
for i in range (9):
    val=float(t['Total Floorspace'].values[i])
    Floorspace.append(val)

In [191]:
# regression
X= []
for i in range(len(y)):
    tmp = []
    tmp.append(Year[i])
    tmp.append(Households[i])
    tmp.append(Oil_price[i])
    tmp.append(Buildings[i])
    tmp.append(Floorspace[i])
    X.append(tmp)

In [192]:
# fit a regression tree
clf = tree.DecisionTreeRegressor(max_depth=1)
clf = clf.fit(X,y)

In [70]:
# Using the model above to predict one instance
result=clf.predict([2003,11,41,9,26])[0]
print('The predicted oil consumption for 2003 is '+str(result)+' million btu.')


The predicted oil consumption for 2003 is 984.285 million btu.
/Applications/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)

In [193]:
#  calculate the score
clf.score(X,y)


Out[193]:
0.60252093368499926

In [194]:
clf.feature_importances_


Out[194]:
array([ 1.,  0.,  0.,  0.,  0.])

In [195]:
sfm = SelectFromModel(clf, threshold='median')
sfm.fit(X, y)
n_features = sfm.transform(X).shape[1]
sfm.transform(X)


Out[195]:
array([[ 1980.  ,    15.4 ,   116.72,    11.6 ,    29.7 ],
       [ 1981.  ,    14.6 ,   102.3 ,    11.  ,    28.9 ],
       [ 1982.  ,    15.5 ,    90.65,    12.2 ,    30.  ],
       [ 1984.  ,    17.5 ,    71.36,    13.8 ,    32.  ],
       [ 1987.  ,    17.4 ,    41.37,    14.  ,    33.3 ],
       [ 1990.  ,    16.3 ,    50.43,    13.5 ,    33.2 ],
       [ 1993.  ,    13.8 ,    33.59,    11.6 ,    29.8 ],
       [ 1997.  ,    13.2 ,    31.71,    11.  ,    23.2 ],
       [ 2001.  ,    11.2 ,    35.81,     9.4 ,    26.  ]])

In [199]:
# change the parameter to optimize the regression model
clf2 = tree.DecisionTreeRegressor(max_depth=3)
clf2 = clf2.fit(X,y)
clf2.score(X,y)


Out[199]:
0.99531772059894064

In [200]:
clf2.feature_importances_


Out[200]:
array([ 0.8066682 ,  0.        ,  0.18805645,  0.        ,  0.00527535])

In [201]:
sfm = SelectFromModel(clf2, threshold='median')
sfm.fit(X, y)
n_features = sfm.transform(X).shape[1]
sfm.transform(X)


Out[201]:
array([[ 1980.  ,   116.72,    29.7 ],
       [ 1981.  ,   102.3 ,    28.9 ],
       [ 1982.  ,    90.65,    30.  ],
       [ 1984.  ,    71.36,    32.  ],
       [ 1987.  ,    41.37,    33.3 ],
       [ 1990.  ,    50.43,    33.2 ],
       [ 1993.  ,    33.59,    29.8 ],
       [ 1997.  ,    31.71,    23.2 ],
       [ 2001.  ,    35.81,    26.  ]])

In [202]:
clf4 = tree.DecisionTreeRegressor(max_depth=4)
clf4 = clf4.fit(X,y)
clf4.score(X,y)


Out[202]:
0.99900660366227179

In [74]:
# Using the model above to predict one instance
result=clf4.predict([2003,11,41,9,26])[0]
print('The predicted oil consumption for 2003 is '+str(result)+' million btu.')


The predicted oil consumption for 2003 is 754.47 million btu.
/Applications/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)

In [55]:
#feature_importances_  
# The feature importances. The higher, the more important the feature.

In [203]:
clf4.feature_importances_


Out[203]:
array([ 0.80368953,  0.00369255,  0.18736204,  0.        ,  0.00525587])

In [204]:
sfm = SelectFromModel(clf4, threshold='median')
sfm.fit(X, y)
n_features = sfm.transform(X).shape[1]
sfm.transform(X)


Out[204]:
array([[ 1980.  ,    15.4 ,   116.72],
       [ 1981.  ,    14.6 ,   102.3 ],
       [ 1982.  ,    15.5 ,    90.65],
       [ 1984.  ,    17.5 ,    71.36],
       [ 1987.  ,    17.4 ,    41.37],
       [ 1990.  ,    16.3 ,    50.43],
       [ 1993.  ,    13.8 ,    33.59],
       [ 1997.  ,    13.2 ,    31.71],
       [ 2001.  ,    11.2 ,    35.81]])

In [205]:
# get rid of the Building 
X2= []
for i in range(len(y)):
    tmp = []
    tmp.append(Year[i])
    tmp.append(Households[i])
    tmp.append(Oil_price[i])
#     tmp.append(Buildings[i])
    tmp.append(Floorspace[i])
    X2.append(tmp)

In [206]:
clf2_ = tree.DecisionTreeRegressor(max_depth=3)
clf2_ = clf2_.fit(X2,y)
clf2_.score(X2,y)


Out[206]:
0.99531772059894064

In [207]:
clf2_.feature_importances_


Out[207]:
array([ 0.8066682 ,  0.00527535,  0.18805645,  0.        ])

In [108]:
x1=pd.read_excel('/Users/apple/Desktop/data driven/project/dataset/oil4.xls')
explode = (0.05, 0.1, 0.1)
labels = x1.columns
x1=np.array(x1)
plt.pie(x1[0],explode=explode,labels=labels,autopct='%1.1f%%', shadow=True, startangle=90)
plt.show()



In [182]:
oil=pd.read_excel('/Users/apple/Desktop/data driven/project/dataset/oil.xls')
oil=oil.drop(oil.columns[[1]],axis=1)
oil=oil.drop(oil.columns[[1]],axis=1)
oil=oil.drop(oil.columns[[0]],axis=1)
oil=oil.dropna()
oil=oil.T
oilindex=oil.index
A=np.array(oil.dropna())
kmeans = KMeans(n_clusters=4, random_state=0).fit(A)
c=kmeans.labels_
a=kmeans.cluster_centers_
plt.figure(figsize=(16,9))
k1=0
k2=0
for i in range(len(A)):
    if c[i]==0:
        k1=k1+1
        plt.subplot(2,2,1)
        plt.plot(A[i],linewidth=0.5,color='grey')
        plt.text(6.3,87-k1*4,oilindex[i],fontsize=10)
    if c[i]==1:
        k2=k2+1
        plt.subplot(2,2,2)
        plt.plot(A[i],linewidth=0.5,color='grey')
        plt.text(6.2,190-k2*6,oilindex[i],fontsize=10)
    if c[i]==2:
        k1=k1+1
        plt.subplot(2,2,3)
        plt.plot(A[i],linewidth=0.5,color='grey')
        plt.text(5.8,120-k1*6,oilindex[i],fontsize=10)
    if c[i]==3:
        k2=k2+1
        plt.subplot(2,2,4)
        plt.plot(A[i],linewidth=0.5,color='grey')
        plt.text(5.8,82-k2*6,oilindex[i],fontsize=10)
text=oil.columns
for i in range(4):
    if i==0:
        plt.subplot(2,2,1)
        plt.plot(a[i],linewidth=2,color='black')
        plt.xlabel('Year''\n'
                   '(a)')
        plt.ylabel('Consumption per building (million Btu)')
#         plt.title('Census Region and Division(k=4) i=1')
        for j in range(len(text)):
            plt.text(j, 40, text[j], fontsize=10,rotation=55)
    if i==1:
        plt.subplot(2,2,2)
        plt.plot(a[i],linewidth=2,color='black')
        plt.xlabel('Year''\n'
                   '(b)')
        plt.ylabel('Consumption per building (million Btu)')
#         plt.title('Census Region and Division(k=4) i=2')
        for j in range(len(text)):
            plt.text(j, 140, text[j], fontsize=10,rotation=55)
    if i==2:
        plt.subplot(2,2,3)
        plt.plot(a[i],linewidth=2,color='black')
        plt.xlabel('Year''\n'
                   '(c)')
        plt.ylabel('Consumption per building (million Btu)')
#         plt.title('Census Region and Division(k=4) i=3')
        for j in range(len(text)):
            plt.text(j, 60, text[j], fontsize=10,rotation=55)
    if i==3:
        plt.subplot(2,2,4)
        plt.plot(a[i],linewidth=2,color='black')
        plt.xlabel('Year''\n'
                   '(d)')
        plt.ylabel('Consumption per building (million Btu)')
#         plt.title('Census Region and Division(k=4) i=4')
        for j in range(len(text)):
            plt.text(j, 20, text[j], fontsize=10,rotation=55)



In [216]:
# A1=pd.DataFrame(A)
# # A1=A1.T
# A1.boxplot()
# for j in range(len(text)):
#         plt.text(j, 280, text[j], fontsize=10,rotation=-60)
# plt.ylim([0,300])
# plt.xlabel('Region')
# plt.ylabel('Consumption per building (million Btu)')
# # plt.title('Census Region and Division')


%matplotlib inline
oil=pd.read_excel('/Users/apple/Desktop/data driven/project/dataset/oil.xls')
oil=oil.drop(oil.columns[[1]],axis=1)
oil=oil.drop(oil.columns[[1]],axis=1)
oilindex=oil.index
A=np.array(oil.dropna())
A1=pd.DataFrame(A)
# A1=A1.T
A1.boxplot()
text=oil.columns
for j in range(len(text)):
        plt.text(j, 280, text[j], fontsize=10,rotation=-60)
plt.ylim([0,300])
plt.xlabel('Region')
plt.ylabel('Consumption per building (million Btu)')
# plt.title('Census Region and Division')


/Applications/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:20: FutureWarning: 
The default value for 'return_type' will change to 'axes' in a future release.
 To use the future behavior now, set return_type='axes'.
 To keep the previous behavior and silence this warning, set return_type='dict'.
Out[216]:
<matplotlib.text.Text at 0x11b940ef0>

In [214]:
plt.imshow(A)
plt.colorbar()
for j in range(len(text)):
        plt.text(j, 10.5, text[j], fontsize=10,rotation=-60)
for j in range(len(oilindex)):
        plt.text(-3.2, j, oilindex[j], fontsize=10)
plt.xlabel('Region')
plt.ylabel('year')
# plt.title('Census Region and Division')


Out[214]:
<matplotlib.text.Text at 0x117187668>

In [ ]: