In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import targetsmosh as tm
import ghgmosh as gm
import finmosh as fm

In [2]:
import statshelpers as sh

In [3]:
from bokeh.charts import Scatter
from collections import OrderedDict
from bokeh.plotting import output_notebook, show
import datavis as dv


BokehJS successfully loaded.

In [4]:
output_notebook()

In [27]:
reload(tm)


Out[27]:
<module 'targetsmosh' from 'targetsmosh.py'>

In [13]:
ghgfintars = pd.read_pickle("../CDPdata/ghgfintars.pkl")

In [6]:
ghgfintars.columns


Out[6]:
Index([u'Country', u'GICS Industry', u'GICS Sector', u'scope1', u'scope2', u'Revenues', u'COGS', u'Equity', u'PPE', u'Assets', u'Income', u'1and2 total', u'1and2 intensity', u'has absolute', u'has intensity', u'target type'], dtype='object')

In [25]:
gft = ghgfintars.rename(columns ={"GICS Sector": "Sector", "GICS Industry":"Industry",
                                  "has intensity": "hasintensity",
                                  "has target": "hastarget", "has absolute": "hasabsolute",
                                  "Scope 1": "scope1", "Scope 2": "scope2",
                                  "1and2 total": "s1and2total", "1and2 intensity": "s1and2intensity"})

prep other columns


In [26]:
gft["lnintensity"]=np.log(gft["s1and2intensity"])

In [403]:
gft["lnCOGS"] = np.log(gft["COGS"])
gft["lnPPE"] = np.log(gft["PPE"])

In [27]:
gft = fm.compute_finvars(gft)
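
fm.compute_finvars is project code; judging from the columns it adds (pCOGS, pPPE, ROE in the Apple rows below), a minimal sketch of what it computes -- pCOGS and ROE match the values shown, pPPE's denominator (Assets) is an assumption:

In [ ]:
# hypothetical stand-in for fm.compute_finvars, for reference only
def compute_finvars_sketch(df):
    df["pCOGS"] = df["COGS"] / df["Revenues"]  # matches the Apple rows below
    df["pPPE"] = df["PPE"] / df["Assets"]      # denominator is an assumption
    df["ROE"] = df["Income"] / df["Equity"]    # matches the Apple rows below
    return df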

In [89]:
industry_vcs = gft["Industry"].value_counts()

In [86]:
s = gft.reset_index()
gft = s[~s.duplicated(["Organisation", "year"])]

In [87]:
# with outliers and duplicates removed
gft.to_pickle("../CDPdata/gft.pkl")

find outliers, then get variance and extremes


In [448]:
reload(sh)


Out[448]:
<module 'statshelpers' from 'statshelpers.py'>

In [35]:
gft = gft.reset_index().set_index("year")

In [36]:
outliers = gft[abs(gft["lnintensity"])>10]
ocompanies = outliers["Organisation"].values

In [447]:
# more outliers, this time with sharply increased emissions --
# probably reporting mistakes or restructuring
oo = profiles2[abs(profiles2["pcintensity"])>10]

In [444]:
oo[["pcintensity"]]


Out[444]:
pcintensity
1898 12.616568
2997 18.368124
3034 64.888109

In [445]:
oocos = oo["Organisation"].value_counts().index

In [450]:
profiles2 = sh.exclude_cos(profiles2,oocos)

In [41]:
outliers.to_pickle("../CDPdata/lnint_outliers.pkl")
outliers.to_excel("../CDPdata/lnintoutliers.xlsx")

In [42]:
# drop the 10 outlier companies; 1233 companies remain
gfti = gft[~gft["Organisation"].isin(set(ocompanies))]

In [301]:
gfti.reset_index(inplace=True)

In [302]:
gfti=gfti[~gfti.duplicated(["Organisation", "year"])]

In [58]:
gft_year = {}
extremes = {}
for yr in range(2009, 2014):
    gft_year[yr] = gfti.loc[yr]
    gft_year[yr] = sh.getvars(gft_year[yr], "lnintensity")
    extremes[yr] = sh.getextremes(gft_year[yr], "tlnintensity", 4)
extremes_all = pd.concat(extremes.values())
gfts_all = pd.concat(gft_year.values())
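
sh.getvars and sh.getextremes live in statshelpers.py; a minimal sketch of the idea, assuming getvars adds a standardized column (hence the "t" prefix on tlnintensity) and getextremes flags rows beyond a cutoff -- the real implementations may differ:

In [ ]:
# hypothetical stand-ins for sh.getvars / sh.getextremes, for reference only
def getvars_sketch(df, col):
    df = df.copy()
    # standardize within the year's slice; "t" + col mirrors "tlnintensity"
    df["t" + col] = (df[col] - df[col].mean()) / df[col].std()
    return df

def getextremes_sketch(df, tcol, cutoff):
    # keep rows more than `cutoff` standard deviations from the mean
    return df[df[tcol].abs() > cutoff]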

In [59]:
len(extremes_all)
extremes_all.sort("tlnintensity", inplace=True)
extremes_all["Organisation"].value_counts()
ecompanies = set(extremes_all["Organisation"].tolist())
len(ecompanies)


Out[59]:
1

In [60]:
# remove extreme
gfti = gfti[~gfti["Organisation"].isin(set(ecompanies))]

In [103]:
# get rid of companies that only have 1 year
ocounts = gfti["Organisation"].value_counts()
singles = ocounts[ocounts==1].index.tolist()
gfti = gfti[~gfti["Organisation"].isin(singles)]

In [303]:
# with outliers and extreme and duplicates removed
gfti.to_pickle("../CDPdata/gfti.pkl")
len(gfti["Organisation"].value_counts().index) # 1116


Out[303]:
1116

In [114]:
gfti[gfti["Organisation"].apply(lambda x: "Apple" in unicode(x).encode('utf-8'))]


Out[114]:
Organisation Country Industry Sector scope1 scope2 Revenues COGS Equity PPE ... Income s1and2total s1and2intensity hasabsolute hasintensity target type lnintensity pCOGS pPPE ROE
year
2009 Apple Inc. USA Computers & Peripherals Information Technology 24476 141464 46275.036504 26962.583325 34125.210465 3186.026287 ... 8881.830221 165940 3.585951 False False NaN 1.277024 0.582659 0.062188 0.260272
2013 Apple Inc. USA Computers & Peripherals Information Technology 30393 91505 170757.294798 99759.786603 123438.611053 16582.170860 ... 37003.908065 121898 0.713867 False False NaN -0.337059 0.584220 0.080179 0.299776

2 rows × 21 columns

export to R (as CSV, via an R data frame)


In [72]:
import pandas.rpy.common as com

In [426]:
def format_forR(df):
    rdf = df[df["year"].isin(range(2009,2014))][["Organisation", "year", "Revenues", "Country",
                                               "country", "Industry", "Sector", 
                                               "lnintensity", "hasintensity", "hasabsolute",
                                               "pCOGS", "lnCOGS", "pPPE", "lnPPE", "ROE",
                                               "plateau", "steady"]+sector_dummies+cdummies]
    return rdf

In [461]:
# write_rvar(gfti, "gfti.csv")

In [474]:
write_rvar(profiles2, "profiles2.csv")

In [476]:
write_rvar(plateaus, "plateaus.csv")
write_rvar(steadys, "steadys.csv")
write_rvar(notplst, "notplst.csv")

In [464]:
write_rvar(us, "us.csv")
write_rvar(uk, "uk.csv")
write_rvar(japan, "japan.csv")
write_rvar(france, "france.csv")
write_rvar(germany, "germany.csv")
write_rvar(safrica, "safrica.csv")
write_rvar(skorea, "skorea.csv")
write_rvar(notus, "notus.csv")

In [466]:
for s in sector_dummies:
    write_rvar(profiles2[profiles2[s]], s+".csv")

In [333]:
def write_rvar(p, fname):
    p_r = format_forR(p)
    p_r=p_r[~p_r.duplicated(["Organisation", "year"])]
    rp = com.convert_to_r_dataframe(p_r)
    rp.to_csvfile("../CDPdata/" + fname)
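
pandas.rpy was removed in later pandas releases; since the export target is just a CSV that R reads back with read.csv, plain DataFrame.to_csv is an equivalent, dependency-free route (a sketch, not what this notebook ran):

In [ ]:
# equivalent export without the pandas.rpy bridge
def write_rvar_csv(p, fname):
    p_r = format_forR(p)
    p_r = p_r[~p_r.duplicated(["Organisation", "year"])]
    p_r.to_csv("../CDPdata/" + fname, encoding="utf-8")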

calc the annualized % change in intensity, to find plateaus


In [317]:
ggfti = gfti.reset_index().groupby("Organisation")

In [318]:
result = ggfti.apply(get_changes)

In [226]:
def get_changes(group):
    # per company: annualize the intensity change so that changes observed
    # across multi-year reporting gaps are comparable to one-year changes
    try:
        diffs = group["year"].diff()
        raw_pc = group["s1and2intensity"].pct_change()
        group["pcintensity"] = (raw_pc+1)**(1.0/diffs)-1
        return group
    except Exception:
        print group["Organisation"]
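
a worked example of the annualization: a -36% change observed across a 2-year reporting gap is treated as two equal annual steps of -20%, since 0.8 * 0.8 = 0.64:

In [ ]:
# annualizing a change seen over a 2-year gap, as in get_changes
raw_pc, gap = -0.36, 2.0
(raw_pc + 1)**(1.0/gap) - 1   # = -0.2: two equal annual steps of -20%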

In [188]:
gr = ggfti.get_group('ACC')
ga = ggfti.get_group("Apple Inc.")

In [470]:
# for d in diffs[1:]:
#    if d>1:
#        # would need to insert rows for the missing years, which means
#        # pulling in the missing financials; not worth the trouble here

In [233]:
result.set_index(["Organisation", "year"], inplace=True)

In [469]:
# len(result.index.levels[0])
len(profiles2["Organisation"].value_counts().index)


Out[469]:
1108

check profiles for plateaus and steady companies


In [320]:
gresult= result.reset_index().groupby("Organisation")

In [321]:
gr = gresult.get_group('ACC')
ga = gresult.get_group("Apple Inc.")

In [323]:
profiles2 = gresult.apply(check_profiles)

In [322]:
def check_profiles(group):
    # scan the three possible 3-year windows of annual changes
    # (position 0 of pcintensity is NaN from pct_change, so start at 1)
    group["plateau"] = False
    group["steady"] = False
    for i in range(1,4):
        pci = group["pcintensity"][i:i+3].tolist()
        if len(pci)==3:
            if is_plateau(pci, -.15, 0.1):
                group["plateau"] = True
            if is_steady(pci, -.06):
                group["steady"] = True
    return group

In [264]:
def is_plateau(a, low_thresh, nochange):
    # a plateau: one sharp drop followed by two years of roughly no change
    is_p = (a[0] < low_thresh) & (abs(a[1])<nochange) & (abs(a[2])<nochange)
    return is_p

In [269]:
def is_steady(a, threshold):
    # steady: three consecutive annual declines past the threshold
    is_s = (a[0]<threshold) & (a[1]<threshold) & (a[2]<threshold)
    return is_s
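
a quick sanity check of the two tests on made-up 3-year windows, using the thresholds from check_profiles:

In [ ]:
drop_then_flat = [-0.30, 0.02, -0.05]    # sharp drop, then ~no change
gradual_decline = [-0.08, -0.10, -0.07]  # sustained moderate decline
is_plateau(drop_then_flat, -.15, 0.1), is_steady(gradual_decline, -.06)
# (True, True); swapping the windows gives (False, False)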

In [295]:
# thresholds tried, and the number of companies each combination flags
# plateaus
# [-.2, 0.05, 10]
# ps = [-.2, 0.1, 27]
# p1p = [-.15, 0.1, 50]
# p2p = [-.15,0.1, 50]
# [-.15, 0.07, 28]

# steady
# ps = [-.1, 24]
# p1s = [-.05, 75]
# p2s = [-.06, 60]
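
those counts came from editing the constants in check_profiles and rerunning; a parameterized sweep would automate it (a sketch -- check_profiles_params is a hypothetical variant of check_profiles that takes the thresholds as arguments):

In [ ]:
# sketch of a plateau-threshold sweep; check_profiles_params is a
# hypothetical variant of check_profiles with the thresholds as arguments
def count_plateaus(low_thresh, nochange):
    flagged = gresult.apply(lambda g: check_profiles_params(g, low_thresh, nochange))
    return len(flagged[flagged["plateau"]]["Organisation"].value_counts().index)

for low, flat in [(-.2, 0.05), (-.2, 0.1), (-.15, 0.1), (-.15, 0.07)]:
    print low, flat, count_plateaus(low, flat)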

In [472]:
len(profiles2[profiles2["plateau"]==True]["Organisation"].value_counts().index)
# len(profiles2[profiles2["steady"]==True]["Organisation"].value_counts().index)


Out[472]:
50

In [475]:
notplst = profiles2[(profiles2["plateau"]==False) & (profiles2["steady"]==False)]

In [453]:
profiles2.to_pickle("../CDPdata/profiles2.pkl")

In [454]:
plateaus = profiles2[profiles2["plateau"]]
steadys = profiles2[profiles2["steady"]]

compare sectors and countries


In [455]:
steadys["Sector"].value_counts()
plateaus["Sector"].value_counts()


Out[455]:
Industrials               55
Materials                 32
Consumer Discretionary    32
Information Technology    30
Energy                    29
Health Care               19
Utilities                 19
Consumer Staples          16
Financials                 5
dtype: int64

In [377]:
sectors = ["Industrials", "Consumer Discretionary", "Materials", "Information Technology",
                  "Consumer Staples", "Utilities", "Energy"]
sector_dummies = ["Industrials", "ConsumerDiscretionary", "Materials", "InformationTechnology",
                  "ConsumerStaples", "Utilities", "Energy"]
profiles2["Sector"].value_counts()


Out[377]:
Industrials                   1076
Consumer Discretionary         656
Materials                      589
Information Technology         580
Consumer Staples               461
Utilities                      292
Health Care                    278
Energy                         236
Financials                     195
Telecommunication Services     156
dtype: int64

In [488]:
profiles2[profiles2["Sector"]== "Consumer Staples"][["Organisation", "COGS", "PPE"]]


Out[488]:
Organisation COGS PPE
138 Aeon Co., Ltd. 3490632.863380 1804268.768539
139 Aeon Co., Ltd. 4112856.945421 2180858.696981
182 Ajinomoto Co.Inc. 797886.032632 444187.501664
183 Ajinomoto Co.Inc. 812300.639979 411809.853421
184 Ajinomoto Co.Inc. 783819.861090 400758.555053
185 Ajinomoto Co.Inc. 766834.601919 356450.874448
186 Ajinomoto Co.Inc. 598999.325591 359048.908967
234 Altria Group, Inc. 8136.554606 2894.818739
235 Altria Group, Inc. 7801.093242 2525.724652
236 Altria Group, Inc. 7671.144994 2284.846412
237 Altria Group, Inc. 7837.932782 2133.281718
238 Altria Group, Inc. 7007.733109 2026.188016
276 AmorePacific Corporation 549249.424543 1013456.639472
277 AmorePacific Corporation 578741.042543 1226724.342495
278 AmorePacific Corporation 758242.441784 1706907.432475
279 AmorePacific Corporation 811055.244388 1793100.919322
280 AmorePacific Corporation 855109.815550 1934313.891601
281 AmorePacific Group 938949.197889 2013480.172456
282 AmorePacific Group 965182.412790 2133860.120406
283 AmorePacific Group 1123517.323505 2286324.844487
300 Anheuser Busch InBev 15654.023536 17753.953523
301 Anheuser Busch InBev 14377.528396 16866.110038
302 Anheuser Busch InBev 14378.241524 16519.769501
303 Anheuser Busch InBev 13966.804473 16705.970675
304 Anheuser Busch InBev 14679.872052 20870.336031
346 Archer Daniels Midland 77896.146926 9795.144817
347 Archer Daniels Midland 85835.665012 9958.021035
348 Archer Daniels Midland 84989.994964 10127.942762
358 Asahi Group Holdings, Ltd. 890317.830883 552895.713286
359 Asahi Group Holdings, Ltd. 946889.871179 592080.060735
... ... ... ...
4119 Treasury Wine Estates 1049.286379 1143.467417
4120 Treasury Wine Estates 1230.499586 1250.381806
4175 Uni-Charm Corporation 192530.252682 102845.877658
4176 Uni-Charm Corporation 198860.489636 104413.244861
4177 Uni-Charm Corporation 225024.377420 120450.380284
4178 Uni-Charm Corporation 253456.601442 165556.666680
4179 Uni-Charm Corporation 302769.238872 204234.356859
4180 Unilever plc 27736.756919 7591.752767
4181 Unilever plc 29943.075074 8031.775221
4182 Unilever plc 28068.898485 9335.651294
4316 Wal-Mart Stores, Inc. 320867.576274 110342.854204
4317 Wal-Mart Stores, Inc. 326482.809343 114483.245374
4318 Wal-Mart Stores, Inc. 337156.102085 115813.668048
4319 Wal-Mart Stores, Inc. 349106.174262 118417.432982
4320 Wal-Mart Stores, Inc. 348886.996584 117801.652085
4354 Wesfarmers 39400.196726 8559.925502
4355 Wesfarmers 40812.460040 9603.827258
4356 Wesfarmers 41325.043751 10154.918637
4381 Whole Foods Market, Inc. 5416.471182 2046.922663
4382 Whole Foods Market, Inc. 5948.498618 2001.615562
4383 Whole Foods Market, Inc. 6492.467379 2059.261134
4384 Whole Foods Market, Inc. 7354.091904 2225.314252
4385 Whole Foods Market, Inc. 7951.888768 2425.830623
4421 Woolworths Holdings Ltd 15207.181732 2089.036667
4422 Woolworths Holdings Ltd 16145.535652 2112.906631
4426 Woolworths Limited 39091.916982 7176.540389
4427 Woolworths Limited 39895.306596 8106.833272
4428 Woolworths Limited 40550.249835 8888.114407
4429 Woolworths Limited 40502.616601 9731.702375
4430 Woolworths Limited 41909.620974 9237.838766

461 rows × 3 columns


In [375]:
for s in sectors:
    sname = s.translate(None, " ")  # Python 2 str.translate: delete spaces
    profiles2[sname] = profiles2["Sector"]==s

In [382]:
countries = ["USA", "United Kingdom", "Japan", "South Africa", "France", "South Korea", "Germany"]
profiles2["Country"].value_counts()[0:10]


Out[382]:
USA               1359
United Kingdom     555
Japan              493
South Africa       200
France             190
South Korea        188
Germany            178
Sweden             137
Switzerland        107
Spain               99
dtype: int64

In [383]:
for c in countries:
    cname = c.translate(None, " ")
    profiles2[cname] = profiles2["Country"]==c

In [384]:
cdummies = ["USA", "UnitedKingdom", "Japan", "SouthAfrica", "France", "SouthKorea", "Germany"]
cfactors = profiles2["Country"].value_counts().index[0:10]

In [397]:
profiles2["country"] = profiles2["Country"].apply(lambda x: x if x in cfactors else "Other")

In [456]:
us = profiles2[profiles2["Country"] == "USA"]
uk = profiles2[profiles2["Country"]== "United Kingdom"]
japan = profiles2[profiles2["Country"]== "Japan"]
safrica= profiles2[profiles2["Country"]== "South Africa"]
france = profiles2[profiles2["Country"]== "France"]
skorea = profiles2[profiles2["Country"]=="South Korea"]
germany = profiles2[profiles2["Country"]== "Germany"]
notus = profiles2[profiles2["Country"] != "USA"]

cluster


In [207]:
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
from scipy import stats

In [98]:
# generate sample data (from the sklearn DBSCAN example;
# X is overwritten with the real features below)
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)

X = StandardScaler().fit_transform(X)

In [154]:
tos = tos[tos['pc1and2intensity']!=0]
tos = tos[tos['pc1and2intensity']>-10]
# stats.zscore(tos['pc1and2intensity'])

In [165]:
toscountries = pd.get_dummies(tos["Country"])
toscountries = gm.drop_dups(toscountries)
tcs = tos[['revt', 'pc1and2intensity', 'year','hasintensity']].join(toscountries)

In [168]:
tcs[['revt', 'pc1and2intensity', 'year','hasintensity']].head()


Out[168]:
revt pc1and2intensity year hasintensity
ISIN
AN8068571086 39540.0 0.210166 2011 False
AN8068571086 42149.0 -0.016996 2012 False
AN8068571086 45266.0 -0.217520 2013 True
AN8068571086 48580.0 0.247797 2014 True
AT0000746409 3778.4 -0.160836 2011 True

In [169]:
X = tcs.fillna(0).values
# X = X[(np.abs(stats.zscore(X)) < 3).all(axis=1)]
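
DBSCAN is distance-based and revt dwarfs every other column here, so distances in the raw feature matrix are dominated by revenue; scaling first (as the imported StandardScaler suggests) is the usual fix, though it changes the eps that works:

In [ ]:
# optional: put the features on a common scale before clustering
Xs = StandardScaler().fit_transform(tcs.fillna(0).values)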

In [199]:
# Compute DBSCAN
db = DBSCAN(eps=10.68, min_samples=3).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
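
metrics was imported above but never used; the silhouette score is a quick cluster-quality check and a way to compare eps settings (only meaningful when more than one cluster survives):

In [ ]:
# rough cluster-quality check; requires at least two distinct labels
if n_clusters_ > 1:
    print "silhouette: %.3f" % metrics.silhouette_score(X, labels)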

In [220]:
tcs['labels'] = pd.Series(labels, index = tcs.index)
tcs.head()


Out[220]:
revt pc1and2intensity year hasintensity Australia Austria Belgium Bermuda Brazil Canada ... Portugal South Africa South Korea Spain Sweden Switzerland Taiwan USA United Kingdom labels
ISIN
AN8068571086 39540.0 0.210166 2011 False 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 -1
AN8068571086 42149.0 -0.016996 2012 False 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 -1
AN8068571086 45266.0 -0.217520 2013 True 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 -1
AN8068571086 48580.0 0.247797 2014 True 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 -1
AT0000746409 3778.4 -0.160836 2011 True 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 -1

5 rows × 31 columns


In [21]:
# Plot result
import matplotlib.pyplot as plt

In [201]:
# color each cluster; noise points (label -1) are drawn in black
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'

    class_member_mask = (labels == k)

    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)

    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)

In [202]:
plt.show()

In [225]:
tcsplot = tcs[['revt', 'pc1and2intensity', 'labels']]

In [226]:
g = tcsplot.groupby("labels")
xyvalues = dv.prep_groups(g)

In [229]:
tcscatter = dv.scatter_groups(xyvalues.values(), "targetintensity.html", "tcs", "revt", "change intensity")
# tcscatter.show()

In [232]:
show(tcscatter)



In [208]:
kmeans = KMeans(init='k-means++', n_clusters=4, n_init=10)

In [209]:
kmeans.fit(X)


Out[209]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=4, n_init=10,
    n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,
    verbose=0)
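
n_clusters=4 is a guess; the inertia "elbow" is one quick way to sanity-check it (a sketch):

In [ ]:
# elbow heuristic: within-cluster sum of squares for a range of k;
# look for the point where adding clusters stops helping much
for k in range(2, 9):
    km = KMeans(init='k-means++', n_clusters=k, n_init=10).fit(X)
    print k, km.inertia_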