In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import targetsmosh as tm
import ghgmosh as gm
import finmosh as fm
In [2]:
import statshelpers as sh
In [3]:
from bokeh.charts import Scatter
from collections import OrderedDict
from bokeh.plotting import output_notebook, show
import datavis as dv
In [4]:
output_notebook()
In [27]:
reload(tm)
Out[27]:
In [13]:
ghgfintars = pd.read_pickle("../CDPdata/ghgfintars.pkl")
In [6]:
ghgfintars.columns
Out[6]:
In [25]:
gft = ghgfintars.rename(columns={"GICS Sector": "Sector", "GICS Industry": "Industry",
                                 "has intensity": "hasintensity",
                                 "has target": "hastarget", "has absolute": "hasabsolute",
                                 "Scope 1": "scope1", "Scope 2": "scope2",
                                 "1and2 total": "s1and2total", "1and2 intensity": "s1and2intensity"})
In [26]:
gft["lnintensity"]=np.log(gft["s1and2intensity"])
In [403]:
gft["lnCOGS"] = np.log(gft["COGS"])
gft["lnPPE"] = np.log(gft["PPE"])
In [27]:
gft = fm.compute_finvars(gft)
In [89]:
industry_vcs = gft["Industry"].value_counts()
In [86]:
s = gft.reset_index()
gft = s[~s.duplicated(["Organisation", "year"])]
In [87]:
# with outliers and duplicates removed
gft.to_pickle("../CDPdata/gft.pkl")
In [448]:
reload(sh)
Out[448]:
In [35]:
gft = gft.reset_index().set_index("year")
In [36]:
# |lnintensity| > 10 means intensity above e^10 (~22,000) or below e^-10; implausible values
outliers = gft[abs(gft["lnintensity"]) > 10]
ocompanies = outliers["Organisation"].values
In [447]:
# more outliers, these with increased emissions;
# probably reporting mistakes and restructuring
oo = profiles2[abs(profiles2["pcintensity"]) > 10]
In [444]:
oo[["pcintensity"]]
Out[444]:
In [445]:
oocos = oo["Organisation"].value_counts().index
In [450]:
profiles2 = sh.exclude_cos(profiles2,oocos)
In [41]:
outliers.to_pickle("../CDPdata/lnint_outliers.pkl")
outliers.to_excel("../CDPdata/lnintoutliers.xlsx")
In [42]:
# throw out the outliers: 10 companies removed, 1233 companies left
gfti = gft[~gft["Organisation"].isin(set(ocompanies))]
In [301]:
gfti.reset_index(inplace=True)
In [302]:
gfti = gfti[~gfti.duplicated(["Organisation", "year"])]
In [58]:
gft_year = {}
extremes = {}
for yr in range(2009, 2014):
    gft_year[yr] = gfti.loc[yr]
    gft_year[yr] = sh.getvars(gft_year[yr], "lnintensity")
    extremes[yr] = sh.getextremes(gft_year[yr], "tlnintensity", 4)
extremes_all = pd.concat(extremes.values())
gfts_all = pd.concat(gft_year.values())
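In [ ]:
# sh.getvars and sh.getextremes come from the local statshelpers module,
# which isn't shown here. A minimal sketch of what getextremes might do,
# assuming the "t" prefix marks a transformed column and the third argument
# is a z-score cutoff (both guesses, not statshelpers' actual API):
def getextremes_sketch(df, col, zcut):
    z = (df[col] - df[col].mean()) / df[col].std()
    return df[abs(z) > zcut]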
In [59]:
len(extremes_all)
extremes_all.sort("tlnintensity", inplace=True)
extremes_all["Organisation"].value_counts()
ecompanies = set(extremes_all["Organisation"].tolist())
len(ecompanies)
Out[59]:
In [60]:
# remove companies with extreme values
gfti = gfti[~gfti["Organisation"].isin(ecompanies)]
In [103]:
# drop companies that only have 1 year of data
ocounts = gfti["Organisation"].value_counts()
singles = ocounts[ocounts == 1].index.tolist()
gfti = gfti[~gfti["Organisation"].isin(singles)]
In [303]:
# with outliers, extremes, and duplicates removed
gfti.to_pickle("../CDPdata/gfti.pkl")
len(gfti["Organisation"].value_counts().index)  # 1116
Out[303]:
In [114]:
gfti[gfti["Organisation"].apply(lambda x: "Apple" in unicode(x).encode('utf-8'))]
Out[114]:
In [72]:
import pandas.rpy.common as com
In [426]:
def format_forR(df):
    # select the model variables; sector_dummies and cdummies are
    # module-level lists defined further down in the notebook
    rdf = df[df["year"].isin(range(2009, 2014))][["Organisation", "year", "Revenues", "Country",
                                                  "country", "Industry", "Sector",
                                                  "lnintensity", "hasintensity", "hasabsolute",
                                                  "pCOGS", "lnCOGS", "pPPE", "lnPPE", "ROE",
                                                  "plateau", "steady"] + sector_dummies + cdummies]
    return rdf
In [461]:
# write_rvar(gfti, "gfti.csv")
In [474]:
write_rvar(profiles2, "profiles2.csv")
In [476]:
write_rvar(plateaus, "plateaus.csv")
write_rvar(steadys, "steadys.csv")
write_rvar(notplst, "notplst.csv")
In [464]:
write_rvar(us, "us.csv")
write_rvar(uk, "uk.csv")
write_rvar(japan, "japan.csv")
write_rvar(france, "france.csv")
write_rvar(germany, "germany.csv")
write_rvar(safrica, "safrica.csv")
write_rvar(skorea, "skorea.csv")
write_rvar(notus, "notus.csv")
In [466]:
for s in sector_dummies:
    write_rvar(profiles2[profiles2[s]], s + ".csv")
In [333]:
def write_rvar(p, fname):
    # format for R, drop duplicate company-years, and write out via rpy2
    p_r = format_forR(p)
    p_r = p_r[~p_r.duplicated(["Organisation", "year"])]
    rp = com.convert_to_r_dataframe(p_r)
    rp.to_csvfile("../CDPdata/" + fname)
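In [ ]:
# pandas.rpy was removed in later pandas releases. If rpy2 isn't available,
# writing a plain CSV that R reads with read.csv works just as well
# (a sketch, not what was run here):
def write_rvar_plain(p, fname):
    p_r = format_forR(p)
    p_r = p_r[~p_r.duplicated(["Organisation", "year"])]
    p_r.to_csv("../CDPdata/" + fname, index=False, encoding="utf-8")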
In [317]:
ggfti = gfti.reset_index().groupby("Organisation")
In [318]:
result = ggfti.apply(get_changes)
In [226]:
def get_changes(group):
    try:
        # pct_change is between consecutive reports; raising to 1/diffs
        # annualizes it when a company skips reporting years
        diffs = group["year"].diff()
        raw_pc = group["s1and2intensity"].pct_change()
        group["pcintensity"] = (raw_pc + 1)**(1.0/diffs) - 1
        return group
    except Exception:
        print group["Organisation"]
In [188]:
gr = ggfti.get_group('ACC')
ga = ggfti.get_group("Apple Inc.")
In [470]:
# for d in diffs[1:]:
#     if d > 1:
#         # would need to make new rows for the missing years,
#         # which means pulling in the missing financials...
#         # too involved to be worth it
In [233]:
result.set_index(["Organisation", "year"], inplace=True)
In [469]:
# len(result.index.levels[0])
len(profiles2["Organisation"].value_counts().index)
Out[469]:
In [320]:
gresult= result.reset_index().groupby("Organisation")
In [321]:
gr = gresult.get_group('ACC')
ga = gresult.get_group("Apple Inc.")
In [323]:
profiles2 = gresult.apply(check_profiles)
In [322]:
def check_profiles(group):
    group["plateau"] = False
    group["steady"] = False
    # scan 3-year windows of annualized change (index 0 is NaN from pct_change)
    for i in range(1, 4):
        pci = group["pcintensity"][i:i+3].tolist()
        if len(pci) == 3:
            if is_plateau(pci, -.15, 0.1):
                group["plateau"] = True
            if is_steady(pci, -.06):
                group["steady"] = True
    return group
In [264]:
def is_plateau(a, low_thresh, nochange):
    # a sharp drop followed by two years of little change
    is_p = (a[0] < low_thresh) & (abs(a[1]) < nochange) & (abs(a[2]) < nochange)
    return is_p
In [269]:
def is_steady(a, threshold):
    # three consecutive years of declines steeper than the threshold
    is_s = (a[0] < threshold) & (a[1] < threshold) & (a[2] < threshold)
    return is_s
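In [ ]:
# Quick check of the two profile tests on made-up change sequences:
is_plateau([-0.20, 0.02, -0.05], -.15, 0.1)  # True: sharp drop, then flat
is_plateau([-0.10, 0.02, -0.05], -.15, 0.1)  # False: initial drop too small
is_steady([-0.08, -0.07, -0.09], -.06)       # True: three steady declines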
In [295]:
# thresholds tried and the number of companies they flag
# plateau [low_thresh, nochange, companies]:
#        [-.2,  0.05, 10]
#   ps = [-.2,  0.1,  27]
#  p1p = [-.15, 0.1,  50]
#  p2p = [-.15, 0.1,  50]
#        [-.15, 0.07, 28]
# steady [threshold, companies]:
#   ps = [-.1,  24]
#  p1s = [-.05, 75]
#  p2s = [-.06, 60]
In [472]:
len(profiles2[profiles2["plateau"]==True]["Organisation"].value_counts().index)
# len(profiles2[profiles2["steady"]==True]["Organisation"].value_counts().index)
Out[472]:
In [475]:
notplst = profiles2[~profiles2["plateau"] & ~profiles2["steady"]]
In [453]:
profiles2.to_pickle("../CDPdata/profiles2.pkl")
In [454]:
plateaus = profiles2[profiles2["plateau"]]
steadys = profiles2[profiles2["steady"]]
In [455]:
steadys["Sector"].value_counts()
plateaus["Sector"].value_counts()
Out[455]:
In [377]:
sectors = ["Industrials", "Consumer Discretionary", "Materials", "Information Technology",
"Consumer Staples", "Utilities", "Energy"]
sector_dummies = ["Industrials", "ConsumerDiscretionary", "Materials", "InformationTechnology",
"ConsumerStaples", "Utilities", "Energy"]
profiles2["Sector"].value_counts()
Out[377]:
In [488]:
profiles2[profiles2["Sector"]== "Consumer Staples"][["Organisation", "COGS", "PPE"]]
Out[488]:
In [375]:
for s in sectors:
sname = s.translate(None, " ")
profiles2[sname] = profiles2["Sector"]==s
In [382]:
countries = ["USA", "United Kingdom", "Japan", "South Africa", "France", "South Korea", "Germany"]
profiles2["Country"].value_counts()[0:10]
Out[382]:
In [383]:
for c in countries:
cname = c.translate(None, " ")
profiles2[cname] = profiles2["Country"]==c
In [384]:
cdummies = ["USA", "UnitedKingdom", "Japan", "SouthAfrica", "France", "SouthKorea", "Germany"]
cfactors = profiles2["Country"].value_counts().index[0:10]
In [397]:
profiles2["country"] = profiles2["Country"].apply(lambda x: x if x in cfactors else "Other")
In [456]:
us = profiles2[profiles2["Country"] == "USA"]
uk = profiles2[profiles2["Country"] == "United Kingdom"]
japan = profiles2[profiles2["Country"] == "Japan"]
safrica = profiles2[profiles2["Country"] == "South Africa"]
france = profiles2[profiles2["Country"] == "France"]
skorea = profiles2[profiles2["Country"] == "South Korea"]
germany = profiles2[profiles2["Country"] == "Germany"]
notus = profiles2[profiles2["Country"] != "USA"]
In [207]:
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
from scipy import stats
In [98]:
# Generate sample data (from the sklearn DBSCAN example;
# X is redefined from tcs further down, before clustering)
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)
X = StandardScaler().fit_transform(X)
In [154]:
tos = tos[tos['pc1and2intensity']!=0]
tos = tos[tos['pc1and2intensity']>-10]
# stats.zscore(tos['pc1and2intensity'])
In [165]:
toscountries = pd.get_dummies(tos["Country"])
toscountries = gm.drop_dups(toscountries)
tcs = tos[['revt', 'pc1and2intensity', 'year','hasintensity']].join(toscountries)
In [168]:
tcs[['revt', 'pc1and2intensity', 'year','hasintensity']].head()
Out[168]:
In [169]:
X = tcs.fillna(0).values
# X = X[(np.abs(stats.zscore(X)) < 3).all(axis=1)]
In [199]:
# Compute DBSCAN
db = DBSCAN(eps=10.68, min_samples=3).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
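In [ ]:
# The columns of X sit on very different scales (revenues vs. fractional
# intensity changes vs. 0/1 country dummies), which makes a single eps hard
# to choose. A sketch of one alternative, standardizing before clustering
# (eps=0.5 is just a starting guess, not a tuned value):
Xs = StandardScaler().fit_transform(tcs.fillna(0).values)
db_s = DBSCAN(eps=0.5, min_samples=3).fit(Xs)
n_noise = list(db_s.labels_).count(-1)  # points DBSCAN leaves as noise (-1)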
In [220]:
tcs['labels'] = pd.Series(labels, index=tcs.index)
tcs.head()
Out[220]:
In [21]:
# Plot result
import matplotlib.pyplot as plt
In [201]:
# Black is reserved for noise points.
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'
    class_member_mask = (labels == k)
    # core samples plotted large, border samples small
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)
In [202]:
plt.show()
In [225]:
tcsplot = tcs[['revt', 'pc1and2intensity', 'labels']]
In [226]:
g = tcsplot.groupby("labels")
xyvalues = dv.prep_groups(g)
In [229]:
tcscatter = dv.scatter_groups(xyvalues.values(), "targetintensity.html",
                              "tcs", "revt", "change intensity")
# tcscatter.show()
In [232]:
show(tcscatter)
In [208]:
kmeans = KMeans(init='k-means++', n_clusters=4, n_init=10)
In [209]:
kmeans.fit(X)
Out[209]:
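In [ ]:
# With a fitted KMeans, labels and centers come straight off the estimator;
# a quick follow-up (not run above) to compare cluster sizes:
pd.Series(kmeans.labels_).value_counts()  # points per cluster
kmeans.cluster_centers_  # one row per cluster, in feature space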