In [2]:
# pandas and numpy are used throughout the notebook (pd.read_pickle at In[12],
# np.arange at In[163], ...) but were never imported in any visible cell --
# import them here so a fresh-kernel "Restart & Run All" works.
import pandas as pd
import numpy as np

import ghgmosh as gm
import datavis as dv
# Pick up edits to the local ghgmosh module without restarting the kernel.
# NOTE: `reload` is a Python 2 builtin; on Python 3 use importlib.reload.
reload(gm)


Out[2]:
<module 'ghgmosh' from 'ghgmosh.pyc'>

In [4]:
# Pull the per-year scope 1 and scope 2 emissions tables (one call per scope,
# evaluated in order 1 then 2).
scope1, scope2 = (gm.getscope_allyears(s) for s in (1, 2))


ghgmosh.py:77: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  ptemp[jyr]["year"] = jyr

combine scope 1 and 2 for companies that report every year

index Organisation, year


In [11]:
# Outer-join so companies that report only one of the two scopes are kept
# (the missing scope shows up as NaN).  Result is indexed by (Organisation, year).
scope2_col = scope2[['scope2']]
scopes12 = scope1.join(scope2_col, how='outer')
scopes12.to_pickle("../CDPdata/2010to2014scopes1or2.pkl")

In [12]:
scopes12 = pd.read_pickle("../CDPdata/2010to2014scopes1or2.pkl")

In [13]:
# Split into companies that reported BOTH scopes vs. rows missing at least one.
# `.notnull()` and `~mask` replace the `isnull()==False` / `isnull()==True`
# comparisons -- same truth table, idiomatic pandas.
s = scopes12
both_reported = s["scope1"].notnull() & s["scope2"].notnull()
scopes12_complete = s[both_reported].reset_index().set_index(["Organisation", "year"])
scopes12_incomplete = s[~both_reported].reset_index().set_index(["Organisation", "year"])

In [21]:
# Sanity check: nearly every company that reports scope 1 also reports scope 2.
s12_cos = scopes12.index.levels[0]
s12_completecos = scopes12_complete.index.levels[0]
len(scopes12)            # 8023 rows overall
len(s12_cos)             # 2727 companies overall
len(scopes12_complete)   # 7787 rows with both scopes
len(s12_completecos)     # 2646 companies with both scopes
# Companies that appear somewhere but never report both scopes at once.
incomplete_cos = set(s12_cos) - set(s12_completecos)

In [22]:
# Some NaN scopes exist because the company stated the emissions are not
# relevant (e.g. Alliant, Duke Energy); others are "relevant but not yet
# calculated".  The stated reason lives on a different sheet -- the 2014
# sheet alone is sufficient here.
excuses = pd.read_pickle("../CDPdata/excuses_s12.pkl")

In [23]:
# Reasons that legitimately explain a missing scope figure.
good_excuses = [
    "Emissions are not relevant",
    "No emissions from this source",
    "No emissions excluded",
    "Emissions excluded due to a recent acquisition",
]
# Inspect the full set of stated reasons (column 1 = scope 1 reason).
excuses[excuses.columns[1]].value_counts().index


Out[23]:
Index([u'Emissions are not relevant', u'Emissions are relevant but not yet calculated', u'Emissions are not evaluated', u'Emissions excluded due to a recent acquisition', u'No emissions from this source', u'Emissions are relevant and calculated, but not disclosed', u'No emissions excluded'], dtype='object')

In [24]:
# Keep one row per company.  pandas deprecated the `cols=` keyword in favour
# of `subset=` (this cell previously emitted a FutureWarning for it).
excuses.drop_duplicates(subset="Organisation", inplace=True)
# Flag each missing scope as "excusably missing" (0) when the stated reason is
# acceptable, otherwise leave it NaN.  Column 1 holds the scope 1 reason,
# column 2 the scope 2 reason.
excuses["scope1 null"] = excuses[excuses.columns[1]].apply(
    lambda x: 0 if str(x) in good_excuses else float("nan"))
excuses["scope2 null"] = excuses[excuses.columns[2]].apply(
    lambda x: 0 if str(x) in good_excuses else float("nan"))


/home/mirthbottle/anaconda/lib/python2.7/site-packages/pandas/util/decorators.py:81: FutureWarning: the 'cols' keyword is deprecated, use 'subset' instead
  warnings.warn(msg, FutureWarning)

In [25]:
# Re-key both frames on Organisation so they can be joined company-to-company.
scopes12_incomplete = scopes12_incomplete.reset_index().set_index("Organisation")
excuses = excuses.set_index("Organisation")

In [26]:
# Attach the excuse flags, then keep only rows where BOTH scopes are either
# reported or excusably missing (flag 0 rather than NaN).
# `.notnull()` replaces the `isnull()==False` comparisons.
incomplete_excuses = scopes12_incomplete.join(excuses[["scope1 null", "scope2 null"]])
complete_reasons = incomplete_excuses[
    incomplete_excuses["scope1 null"].notnull()
    & incomplete_excuses["scope2 null"].notnull()
]

In [27]:
# Drop the helper flag columns (explicit axis=1 instead of a bare positional 1)
# and treat excusably-missing scopes as zero emissions.  `.drop` returns a new
# frame, so the fillna assignments below do not touch `complete_reasons`.
s12_addback = complete_reasons.drop(["scope1 null", "scope2 null"], axis=1)
s12_addback["scope1"] = s12_addback["scope1"].fillna(0)
s12_addback["scope2"] = s12_addback["scope2"].fillna(0)

In [28]:
s12_addback = s12_addback.reset_index().set_index(["Organisation", "year"])

In [29]:
# Stack the excused rows back onto the fully-reported rows, then drop rows
# missing basic company metadata.  `.notnull()` replaces `isnull()==False`.
s = pd.concat([s12_addback, scopes12_complete])
s12_completed = s[s["Country"].notnull() & s["GICS Industry"].notnull()]

In [30]:
# Distinct companies (level 0 of the MultiIndex) surviving the cleanup.
s12_companies = s12_completed.index.levels[0]
len(s12_companies) #2648 yay


Out[30]:
2648

In [2]:
s12_completed = pd.read_pickle("../CDPdata/s12_completed.pkl")

In [31]:
# Persist the cleaned panel and preview the first rows.
s12_completed.to_pickle("../CDPdata/s12_completed.pkl")
s12_completed.head()


Out[31]:
Country Discloser ID GICS Industry GICS Industry Group GICS Sector GICS Sub-Industry Reporting Period From Reporting Period To reporting period to scope1 scope2
Organisation year
Agrium Inc. 2009 Canada d91c5885-e21b-df11-b692-0017a47708d8 Chemicals Materials Materials NaN 1 Jan 2009 31 Dec 2009 2009-12-31 3202620 0
2011 Canada e00e8ed3-e8fe-e011-871c-0017a47708d8 Chemicals Materials Materials NaN 1 Jan 2011 31 Dec 2011 2011-12-31 3441605 0
2012 Canada NaN Chemicals Materials Materials Diversified Chemicals 1 Jan 2012 31 Dec 2012 2012-12-31 3807001 0
Alliant Energy Corporation 2009 USA ed1d5885-e21b-df11-b692-0017a47708d8 Multi-Utilities Utilities Utilities NaN 1 Jan 2009 31 Dec 2009 2009-12-31 17608208 0
2010 USA f0d4061d-18e0-df11-a2b8-0017a47708d8 Multi-Utilities Utilities Utilities NaN 1 Jan 2010 31 Dec 2010 2010-12-31 18719000 0

not used


In [26]:
# Find companies with five reporting years, for a balanced panel.
s = scopes12_complete.reset_index()
# NOTE(review): range(2009, 2014) covers years 2009-2013, but the pickles in
# this section are named "2010to2014" -- confirm which five-year window is
# actually intended.  (This section is marked "not used" below.)
s12_count = s[s["year"].isin(range(2009,2014))].groupby("Organisation").count()
s12_companies = s12_count[s12_count["year"]==5].index.tolist()

In [16]:
scopes12_complete5 = scopes12_complete[scopes12_complete.index.isin(s12_companies,level=0)].reset_index().set_index(["Organisation", "year"])

In [18]:
len(s12_companies)  # 643 balanced-panel companies
# 3308 rows -- more than 643*5 because some companies also report 2009.
len(scopes12_complete5) # 3308 has extra because some companies also have 2009 numbers
scopes12_complete5.to_pickle("../CDPdata/2010to2014scopes12fiveyrs.pkl")
scopes12_complete.to_pickle("../CDPdata/2010to2014scopes12.pkl")

Get company info (tickers and ISINs)


In [32]:
# len(s12_companies)
# NOTE(review): this load is immediately overwritten by the next cell, which
# rebuilds orginfos from sheet35_2014.pkl -- dead unless the cells below are
# skipped on a given run.
orginfos = pd.read_pickle("../CDPdata/orginfos.pkl")

In [33]:
# Build the company-info table from the 2014 sheet: organisation name, the
# GICS columns (positions 2:7), ticker and ISIN; one row per company, keyed
# by Organisation.
p = pd.read_pickle("../CDPdata/sheet35_2014.pkl")
info_cols = ["Organisation"] + p.columns[2:7].tolist() + ["Ticker", "ISIN"]
orginfos = p[info_cols].drop_duplicates("Organisation").set_index("Organisation")
orginfos.to_pickle("../CDPdata/orginfos.pkl")
len(orginfos)  # 1825


Out[33]:
1825

In [34]:
# Restrict to companies in the cleaned panel and keep their identifiers.
s12_company_ids = orginfos[orginfos.index.isin(s12_companies)][["ISIN", "Ticker", "Country"]]
# `lambda(x):` is Python-2-only tuple-parameter syntax (removed in Python 3,
# PEP 3113); plain `lambda x:` behaves identically on both.  Keep just the
# ticker symbol (text before the first space); NaN tickers become the string
# "nan" here, which the USA filter below relies on.
s12_company_ids["Ticker"] = s12_company_ids["Ticker"].apply(lambda x: str(x).split(" ")[0])
len(s12_company_ids) # 1732


Out[34]:
1732

In [35]:
# US companies are matched by ticker.  The literal string "nan" marks a
# missing ticker (a by-product of the str() conversion in the cell above).
us_rows = s12_company_ids["Country"] == "USA"
americans = s12_company_ids[us_rows][["Ticker"]]
americans = americans[americans["Ticker"] != "nan"]
americans.to_excel("../CDPdata/s12_american_ids.xlsx")
len(americans)  # 381


Out[35]:
381

In [17]:
# Non-US companies are matched by ISIN; drop rows without one.
# `.notnull()` replaces the `isnull()==False` comparison.
world = s12_company_ids[s12_company_ids["Country"]!="USA"][["ISIN"]]
world = world[world["ISIN"].notnull()]
world.to_excel("../CDPdata/s12_world_ids.xlsx")
len(world) # 1264


Out[17]:
1264

Scope 3


In [286]:
# Scope 3: sheet 68 holds the CC14.1 scope 3 disclosures; inspect its columns.
p = pd.read_pickle("../CDPdata/sheet68_2014.pkl")
p.columns


Out[286]:
Index([u'Organisation', u'Account No', u'Country', u'GICS Sector', u'GICS Industry Group', u'GICS Industry', u'GICS Sub-Industry', u'Access', u'Primary Expansion', u'Secondary Expansions', u'Complexity', u'Ticker', u'ISIN', u'Complexity.1', u'CC14.1. Please account for your organization’s Scope 3 emissions, disclosing and explaining any exclusions\n\nCC14.1. Sources of Scope 3 emissions', u'CC14.1. Evaluation status', u'CC14.1. metric tonnes CO2e', u'CC14.1. Emissions calculation methodology', u'CC14.1. Percentage of emissions calculated using primary data', u'CC14.1. Explanation'], dtype='object')

example


In [163]:
N = 20
categories = ['y' + str(x) for x in range(10)]
data = {}
data['x'] = np.arange(N)
for cat in categories:
    data[cat] = np.random.randint(10, 100, size=N)
x2 = np.hstack((data['x'][::-1], data['x']))

In [164]:
# Frame the toy data, index by x, and compute the stacked polygon boundaries
# for each series via the datavis helper.
df = pd.DataFrame(data).set_index(["x"])
areas = dv.stacked_cols(df, categories)

In [167]:
# NOTE(review): `figure` (presumably bokeh.plotting.figure, given the Out[]
# repr) and `colors` are not defined in any visible cell -- this relies on
# imports/state from elsewhere; confirm before a fresh run.
p = figure()
# One patch per series: identical x outline for each, stacked y boundaries
# taken from `areas`.
p.patches([x2 for a in areas], list(areas.values()), color=colors, alpha=0.8, line_color=None)


Out[167]:
<bokeh.plotting.Figure at 0x7f09b9db5150>

In [ ]: