In [ ]:
%%HTML

<style type="text/css">
.input_prompt, .input_area, .output_prompt {
    display:none !important;
}
</style>

In [1]:
%%HTML
<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }

  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>



In [1]:
#Always Pyspark first!
ErhvervsPath = "/home/svanhmic/workspace/DABAI"
parquetPath = "/home/svanhmic/workspace/data/DABAI/sparkdata/parquet"

from pyspark.sql import functions as F, Window, WindowSpec
from pyspark.sql import Row
from pyspark.sql.types import StringType,ArrayType,IntegerType,DoubleType,StructField,StructType,BooleanType
sc.addPyFile(ErhvervsPath+"/RegnSkabData/ImportRegnskabData.py")
sc.addPyFile(ErhvervsPath+'/RegnSkabData/RegnskabsClass.py')
sc.addPyFile(ErhvervsPath+'/ReadData/Fstat.py')
sc.addPyFile(ErhvervsPath+'/ReadData/GetNextJsonLayer.py')

import sys
import re
import os
import ImportRegnskabData
import GetNextJsonLayer
import itertools
import functools

%matplotlib inline
import seaborn as sb
import matplotlib.pyplot as plt
import numpy as np
import Fstat
import scipy as sp

import IPython
from IPython.display import display, Markdown, Latex
from pandas.tools.plotting import scatter_matrix

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:65% !important; }</style>"))
sb.set_context("talk")



In [2]:
def isText(col):
    
    #if the value contains letters
    
    try:
        matches = re.search(col,r'\w+ ^\d+',re.I)
        print(matches.group(0))
        return "string"
    except:
        #print(col)
        return "double"
    
textUdf = F.udf(lambda x: isText(x),StringType())

In [3]:
#input debt data
regnDf = (sqlContext
          .read
          .parquet(parquetPath+"/regnskaberDebt.parquet")
          .withColumn(colName="Dec",col=F.col("Dec").cast("integer"))
          .withColumn(colName="Prec",col=F.col("Prec").cast("integer"))
          .withColumn(colName="type",col=textUdf(F.col("Value")))
         )
#regnDf.printSchema()
regnDf.show(20,False)


+--------------------------------------------+---+----+----+-------+----------+----------------+----------+-----------+----------+-------------------------------------------------------+--------------+-----------------------------+------+
|Name                                        |Dec|Prec|Lang|unitRef|contextRef|EntityIdentifier|Start     |End_Instant|Value     |Dimensions                                             |originalLength|DebtNames                    |type  |
+--------------------------------------------+---+----+----+-------+----------+----------------+----------+-----------+----------+-------------------------------------------------------+--------------+-----------------------------+------+
|fsa:OtherLongtermDebtRaisedByIssuanceOfBonds|-3 |null|null|DKK    |16993085  |16993085        |null      |2014-09-30 |21,196,000|[cmn:ConsolidatedSoloDimension, cmn:ConsolidatedMember]|10            |DebtRaisedByIssuanceOfBonds  |double|
|fsa:ShorttermDebtToBanks                    |-3 |null|null|DKK    |16993085  |16993085        |null      |2014-09-30 |0         |[cmn:ConsolidatedSoloDimension, cmn:ConsolidatedMember]|1             |DebtToBanks                  |double|
|fsa:OtherLongtermDebtRaisedByIssuanceOfBonds|-3 |null|null|DKK    |16993085  |16993085        |null      |2013-09-30 |32,874,000|[cmn:ConsolidatedSoloDimension, cmn:ConsolidatedMember]|10            |DebtRaisedByIssuanceOfBonds  |double|
|fsa:ShorttermDebtToBanks                    |-3 |null|null|DKK    |16993085  |16993085        |null      |2013-09-30 |2,000     |[cmn:ConsolidatedSoloDimension, cmn:ConsolidatedMember]|5             |DebtToBanks                  |double|
|fsa:OtherLongtermDebtRaisedByIssuanceOfBonds|-3 |null|null|DKK    |16993085  |16993085        |null      |2014-09-30 |21,196,000|[]                                                     |10            |DebtRaisedByIssuanceOfBonds  |double|
|fsa:ShorttermDebtToBanks                    |-3 |null|null|DKK    |16993085  |16993085        |null      |2014-09-30 |0         |[]                                                     |1             |DebtToBanks                  |double|
|fsa:OtherLongtermDebtRaisedByIssuanceOfBonds|-3 |null|null|DKK    |16993085  |16993085        |null      |2013-09-30 |32,874,000|[]                                                     |10            |DebtRaisedByIssuanceOfBonds  |double|
|fsa:ShorttermDebtToBanks                    |-3 |null|null|DKK    |16993085  |16993085        |null      |2013-09-30 |2,000     |[]                                                     |5             |DebtToBanks                  |double|
|fsa:LongtermDebtToOtherCreditInstitutions   |0  |null|null|DKK    |31156270  |31156270        |null      |2014-12-31 |16,000,000|[cmn:ConsolidatedSoloDimension, cmn:ConsolidatedMember]|10            |DebtToOtherCreditInstitutions|double|
|fsa:LongtermDebtToOtherCreditInstitutions   |0  |null|null|DKK    |31156270  |31156270        |null      |2014-12-31 |0         |[]                                                     |1             |DebtToOtherCreditInstitutions|double|
|fsa:LongtermDebtToOtherCreditInstitutions   |0  |null|null|DKK    |31156270  |31156270        |null      |2015-12-31 |12,000,000|[cmn:ConsolidatedSoloDimension, cmn:ConsolidatedMember]|10            |DebtToOtherCreditInstitutions|double|
|fsa:LongtermDebtToOtherCreditInstitutions   |0  |null|null|DKK    |31156270  |31156270        |null      |2015-12-31 |0         |[]                                                     |1             |DebtToOtherCreditInstitutions|double|
|fsa:LongtermMortgageDebt                    |0  |null|null|DKK    |31156270  |31156270        |null      |2014-12-31 |5,597,785 |[cmn:ConsolidatedSoloDimension, cmn:ConsolidatedMember]|9             |Debt                         |double|
|fsa:LongtermMortgageDebt                    |0  |null|null|DKK    |31156270  |31156270        |null      |2014-12-31 |0         |[]                                                     |1             |Debt                         |double|
|fsa:LongtermMortgageDebt                    |0  |null|null|DKK    |31156270  |31156270        |null      |2015-12-31 |5,239,929 |[cmn:ConsolidatedSoloDimension, cmn:ConsolidatedMember]|9             |Debt                         |double|
|fsa:LongtermMortgageDebt                    |0  |null|null|DKK    |31156270  |31156270        |null      |2015-12-31 |0         |[]                                                     |1             |Debt                         |double|
|fsa:RepaymentOfDebtToCreditInstitutions     |0  |null|null|DKK    |31156270  |31156270        |2014-01-01|2014-12-31 |0         |[cmn:ConsolidatedSoloDimension, cmn:ConsolidatedMember]|1             |DebtToCreditInstitutions     |double|
|fsa:RepaymentOfDebtToCreditInstitutions     |0  |null|null|DKK    |31156270  |31156270        |2014-01-01|2014-12-31 |0         |[]                                                     |1             |DebtToCreditInstitutions     |double|
|fsa:RepaymentOfDebtToCreditInstitutions     |0  |null|null|DKK    |31156270  |31156270        |2015-01-01|2015-12-31 |4,339,990 |[cmn:ConsolidatedSoloDimension, cmn:ConsolidatedMember]|9             |DebtToCreditInstitutions     |double|
|fsa:RepaymentOfDebtToCreditInstitutions     |0  |null|null|DKK    |31156270  |31156270        |2015-01-01|2015-12-31 |0         |[]                                                     |1             |DebtToCreditInstitutions     |double|
+--------------------------------------------+---+----+----+-------+----------+----------------+----------+-----------+----------+-------------------------------------------------------+--------------+-----------------------------+------+
only showing top 20 rows

Hypoteser

  1. I hvor høj grad korrelerer gældsforpligtelser med vækst i virksomhederne? Der skal i den sammenhæng tages højde for forskellen i kortfristede og langfristede gældsforpligtelser.

In [4]:
display(Markdown("### Hvilke variable har vi at gøre godt med gæld at gøre i de ca. 600000 regnskaber, fra 2014-2016?"))
x = regnDf.select(F.regexp_replace("Name","\w+:|\w+-","").alias("Name")).groupBy("Name").count()
print("Der er i alt "+str(x.count()))
x.orderBy(F.col("count").desc()).limit(8).toPandas()#.show(8,False)


Hvilke variable har vi at gøre godt med gæld at gøre i de ca. 600000 regnskaber, fra 2014-2016?

Der er i alt 53
Out[4]:
Name count
0 ShorttermDebtToBanks 259774
1 LongtermMortgageDebt 184818
2 ShorttermDebtToOtherCreditInstitutions 138486
3 LongtermDebtToBanks 90381
4 LongtermDebtToOtherCreditInstitutions 57944
5 ShorttermMortgageDebt 49012
6 GainsLossesFromCurrentValueAdjustmentsOfDebtLi... 14204
7 ConvertibleProfitYieldingOrDividendYieldingLon... 6721

De 6 øverste bruges i denne analyse.


In [5]:
display(Markdown("Data fordelt på de 6 felter og år"))
filtstr = ""
cols = ["ShorttermDebtToBanks"
        ,"LongtermMortgageDebt"
        ,"ShorttermDebtToOtherCreditInstitutions"
        ,"LongtermDebtToBanks"
        ,"LongtermDebtToOtherCreditInstitutions"
        ,"ShorttermMortgageDebt"]

for i in cols[:-1]:
    filtstr += "( Name = '"+i + "') or "
print()
#filtstr+"(Name == '"+cols[-1]+"')"
(regnDf
 .select("EntityIdentifier",F.regexp_replace("Name","\w+:|\w+-","").alias("Name"),F.year("End_Instant").alias("year"))
 .filter(filtstr+"(Name = '"+cols[-1]+"')")
 .groupBy("Name","year")
 .count()
 .groupBy("Name")
 .pivot("year")
 .agg(F.max("count"))
 .toPandas()
 #.show(truncate=True)
)


Data fordelt på de 6 felter og år


Out[5]:
Name 0 12 13 1899 1900 2000 2004 2006 2008 2009 2010 2011 2012 2013 2014 2015 2016 2019
0 LongtermDebtToOtherCreditInstitutions NaN 1.0 NaN 31 NaN 2.0 NaN 3.0 NaN 1.0 2.0 45 7281 17115 19204 11912 2347 NaN
1 LongtermDebtToBanks 9.0 NaN NaN 125 1.0 NaN 2.0 15.0 3.0 3.0 1.0 39 10510 26229 30379 19380 3684 1.0
2 LongtermMortgageDebt NaN NaN 1.0 118 NaN 6.0 2.0 20.0 5.0 11.0 1.0 109 21826 52613 61523 40326 8257 NaN
3 ShorttermDebtToBanks NaN NaN 1.0 397 1.0 8.0 6.0 42.0 7.0 7.0 NaN 146 31018 75664 86632 54836 11008 1.0
4 ShorttermDebtToOtherCreditInstitutions NaN 1.0 NaN 34 NaN NaN NaN NaN NaN NaN 1.0 144 18452 41874 44809 27620 5551 NaN
5 ShorttermMortgageDebt NaN NaN NaN 11 NaN NaN NaN NaN NaN NaN 1.0 23 6372 14548 15601 10327 2129 NaN

In [6]:
def containsConsolidated(arr):
    
    for a in arr:
        pattern = re.search(pattern=r":Consolidated",string=a,flags=re.IGNORECASE)
        if pattern != "":
            return True
    return False

consolidatedUdf = F.udf(lambda x: containsConsolidated(x),BooleanType())

(regnDf
 .select("EntityIdentifier",F.regexp_replace("Name","\w+:|\w+-","").alias("Name"),F.year("End_Instant").alias("year"),consolidatedUdf("Dimensions").alias("containsDim"))
 .filter(F.col("containsDim") == False)
 .select("Name")
 .groupBy("Name")
 .count()
 .orderBy(F.col("count").desc())
 .show(100,truncate=False)
)


+-----------------------------------------------------------------------------------------------------------------------------------------------------------+------+
|Name                                                                                                                                                       |count |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+------+
|ShorttermDebtToBanks                                                                                                                                       |248281|
|LongtermMortgageDebt                                                                                                                                       |174794|
|ShorttermDebtToOtherCreditInstitutions                                                                                                                     |132367|
|LongtermDebtToBanks                                                                                                                                        |86640 |
|LongtermDebtToOtherCreditInstitutions                                                                                                                      |53183 |
|ShorttermMortgageDebt                                                                                                                                      |46897 |
|GainsLossesFromCurrentValueAdjustmentsOfDebtLiabilitiesConcerningInvestmentProperty                                                                        |13893 |
|ConvertibleProfitYieldingOrDividendYieldingLongtermDebtInstruments                                                                                         |6361  |
|RepaymentOfDebtToGroupEnterprises                                                                                                                          |4886  |
|MortgageDebt                                                                                                                                               |4455  |
|RepaymentOfDebtToCreditInstitutions                                                                                                                        |4129  |
|OtherLongtermDebtRaisedByIssuanceOfBonds                                                                                                                   |3326  |
|RaisingOfDebtToGroupEnterprises                                                                                                                            |2881  |
|ShorttermDebtToBanksCashFlowsStatement                                                                                                                     |2794  |
|InformationOnOutstandingDebtFromManagementCategory                                                                                                         |2420  |
|DebtToOtherCreditInstitutions                                                                                                                              |2342  |
|RaisingOfDebtToCreditInstitutions                                                                                                                          |2226  |
|ConvertibleProfitYieldingOrDividendYieldingShorttermDebtInstruments                                                                                        |1873  |
|OtherShorttermDebtRaisedByIssuanceOfBonds                                                                                                                  |1663  |
|DebtToBanks                                                                                                                                                |1573  |
|RaisingOfLongtermDebt                                                                                                                                      |1176  |
|GainsLossesFromCurrentValueAdjustmentsOfDebtLiabilitiesConcerningOtherInvestmentAssets                                                                     |988   |
|RepaymentOfDebtToAssociates                                                                                                                                |743   |
|DebtReliefOrSimilarFinanceIncome                                                                                                                           |611   |
|GainsLossesFromCurrentValueAdjustmentsOfDebtLiabilities                                                                                                    |501   |
|RaisingOfDebtToAssociates                                                                                                                                  |443   |
|OutstandingDebtFromManagementCategory                                                                                                                      |437   |
|InformationOnDebtAgainstManagementCategoryRaisedAndRepaidDuringReportingPeriod                                                                             |363   |
|RepaymentOfMortgageDebt                                                                                                                                    |343   |
|DebtReliefOrSimilarFinanceExpenses                                                                                                                         |296   |
|InformationOnDebtAgainstGroupEnterprises                                                                                                                   |291   |
|DisclosureOfGainsLossesFromCurrentValueAdjustmentsOfDebtLiabilitiesConcerningInvestmentProperty                                                            |222   |
|DebtRepaidByManagementCategory                                                                                                                             |218   |
|RaisingOfMortgageDebt                                                                                                                                      |185   |
|DebtRaisedAndRepaidByManagementCategoryDuringReportingPeriod                                                                                               |174   |
|OtherDebtRaisedByIssuanceOfBonds                                                                                                                           |81    |
|InformationOnLoansRaisedAgainstIssueOfConvertibleDebtInstruments                                                                                           |76    |
|DisclosureOfLoansRaisedAgainstIssueOfConvertibleDebtInstrumentsBondsOrOtherInterestBearingInstrumentsAndWithdrawnClaimsOfCreditorForBenefitOfOtherCreditors|69    |
|ConvertibleProfitYieldingOrDividendYieldingDebtInstruments                                                                                                 |45    |
|InformationOnDebtAgainstAssociates                                                                                                                         |34    |
|DisclosureOfGainsLossesFromCurrentValueAdjustmentsOfDebtLiabilitiesConcerningOtherInvestmentAssets                                                         |25    |
|InterestRateRelatedToOutstandingDebtFromManagementCategory                                                                                                 |24    |
|OtherCashPaymentsToAcquireEquityOrDebtInstrumentsOfOtherEntitiesClassifiedAsInvestingActivities                                                            |18    |
|OutstandingDebtAgainstCreditor                                                                                                                             |10    |
|NoncurrentMortgageDebt                                                                                                                                     |8     |
|OtherShorttermDebts                                                                                                                                        |8     |
|MortgageDebtFromAcquisitionOfCompaniesAndActivitiesCashflow                                                                                                |8     |
|CurrentMortgageDebt                                                                                                                                        |8     |
|OtherCashReceiptsFromSalesOfEquityOrDebtInstrumentsOfOtherEntitiesClassifiedAsInvestingActivities                                                          |8     |
|DebtToBanksFromAcquisitionOfCompaniesAndActivitiesCashflow                                                                                                 |3     |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+------+

Det ses at data er mest koncenteret omkring 2012-2016.


In [7]:
#OK how many are acctual value columns?
valDf = regnDf.filter((F.col("type") == "double"))
cols = [i for i in regnDf.columns if i not in ("Name","Value")]
funcsCols = [F.regexp_replace("Name","\w+:|\w+-","").alias("Name") , F.regexp_replace("Value",",","").alias("Value")]+cols

valDf = (valDf
         .select(funcsCols)
         .filter((F.col("type")=="double"))
         .withColumn(col=F.col("Value").cast("double"),colName="Value")
         .withColumn(col=F.unix_timestamp(F.col("End_Instant")),colName="End_Instant_ts")
         #.drop("contextRef")
         .drop("Dimensions")
         .drop("Lang")
         .drop("DebtNames")
         .drop("originalLength")
         .distinct()
        )
#print(valDf.groupBy("Name").count().count())
#valDf.show()
#valDf.groupBy("Name").count().orderBy(F.col("Name").desc()).show(53,False)

In [8]:
#how many companies are the in this dataset?
nameCols = ["EntityIdentifier","Name",F.year("End_Instant").alias("End_Instant")]
originalNamesCols = ["EntityIdentifier","Name","End_Instant"]
namesDf = regnDf.select(*nameCols).groupBy(*originalNamesCols).count()
#namesDf.orderBy(nameCols[0],F.col("count").desc()).show()

In [9]:
#Load the kapitals in.
mainKapitalDf = (sqlContext
                 .read
                 .parquet(parquetPath+"/KaptialDataFrame.parquet")
                 .drop("KAPITALKLASSER_vaerdi")
                 .drop("KAPITAL_DELVIST_vaerdi")
                 .withColumn(col=F.coalesce(F.col("gyldigTil"),F.lit(F.current_date())),colName="gyldigTil")
                 .withColumn(col=F.datediff(F.col("GyldigTil"),F.col("gyldigFra")),colName="datediff")
                 .withColumn(col=F.col("KAPITAL_vaerdi").cast("double"),colName="KAPITAL_vaerdi")
                 .filter(F.year("gyldigFra") >= 2007)
                )
display(Markdown("### Hvordan ser kapital data ud? "))
display(Markdown("Kapital data fra 2007 og frem"))
mainKapitalDf.limit(2).toPandas()#.show(2)
#mainKapitalDf.printSchema()


---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
/usr/local/share/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:

/usr/local/share/spark/python/lib/py4j-0.10.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    311                     "An error occurred while calling {0}{1}{2}.\n".
--> 312                     format(target_id, ".", name), value)
    313             else:

Py4JJavaError: An error occurred while calling o200.parquet.
: org.apache.spark.sql.AnalysisException: Path does not exist: file:/home/svanhmic/workspace/data/DABAI/sparkdata/parquet/KaptialDataFrame.parquet;
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$12.apply(DataSource.scala:361)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$12.apply(DataSource.scala:350)
	at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
	at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
	at scala.collection.immutable.List.foreach(List.scala:381)
	at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
	at scala.collection.immutable.List.flatMap(List.scala:344)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:350)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:149)
	at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:427)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:211)
	at java.lang.Thread.run(Thread.java:745)


During handling of the above exception, another exception occurred:

AnalysisException                         Traceback (most recent call last)
<ipython-input-9-b1011a34f006> in <module>()
      2 mainKapitalDf = (sqlContext
      3                  .read
----> 4                  .parquet(parquetPath+"/KaptialDataFrame.parquet")
      5                  .drop("KAPITALKLASSER_vaerdi")
      6                  .drop("KAPITAL_DELVIST_vaerdi")

/usr/local/share/spark/python/pyspark/sql/readwriter.py in parquet(self, *paths)
    260         [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
    261         """
--> 262         return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths)))
    263 
    264     @ignore_unicode_prefix

/usr/local/share/spark/python/lib/py4j-0.10.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
    931         answer = self.gateway_client.send_command(command)
    932         return_value = get_return_value(
--> 933             answer, self.gateway_client, self.target_id, self.name)
    934 
    935         for temp_arg in temp_args:

/usr/local/share/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
     67                                              e.java_exception.getStackTrace()))
     68             if s.startswith('org.apache.spark.sql.AnalysisException: '):
---> 69                 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
     70             if s.startswith('org.apache.spark.sql.catalyst.analysis'):
     71                 raise AnalysisException(s.split(': ', 1)[1], stackTrace)

AnalysisException: 'Path does not exist: file:/home/svanhmic/workspace/data/DABAI/sparkdata/parquet/KaptialDataFrame.parquet;'
  • Kapital postninger er ikke angivet regelmæssigt. Derfor sættes alle måneder og år til samme kapital værdi hvis uændret.

In [ ]:
joinsOn = ( (mainKapitalDf["cvrNummer"] == valDf["EntityIdentifier"] ) 
           & (valDf["End_Instant_ts"].between(mainKapitalDf["timeStampFra"],mainKapitalDf["timeStampTil"])) 
           & (F.year(valDf["End_Instant"]) == mainKapitalDf["aar"])
          )

NamesToUse = ["ShorttermDebtToBanks"
              ,"LongtermMortgageDebt"
              ,"ShorttermDebtToOtherCreditInstitutions"
              ,"LongtermDebtToBanks"
              ,"LongtermDebtToOtherCreditInstitutions"
              ,"ShorttermMortgageDebt"
              ,"ShorttermDebtToBanksCashFlowsStatement"
              ,"OtherLongtermDebtRaisedByIssuanceOfBonds"
             ]


notPivotCols = ["aar","cvrNummer","End_Instant","End_Instant_ts","KAPITAL_vaerdi"]
selectedCols = ["aar","cvrNummer","Name","KAPITAL_vaerdi","KAPITALVALUTA_vaerdi","Value","End_Instant","End_Instant_ts"]
filterdCols = ["Name == '"+str(i)+"'" for i in NamesToUse]
combinedCols = notPivotCols+[F.col(col+".Value").alias(col) for i,col in enumerate(NamesToUse)]


filtersStr = "( "
for i in filterdCols[:-1]:
    filtersStr += i+" | "
    
filtersStr += filterdCols[-1]+")"
#print(filtersStr)
joinedDf = (mainKapitalDf
            .drop("ts")
            .drop("maaned") # this gives all years for all companies that has a registered kapital
            .distinct()
            .join(other=valDf,on=joinsOn,how="left")
            #.filter(filtersStr)
            .select(*selectedCols)
            .groupBy(*selectedCols)
            .count()
            .groupBy(*notPivotCols)
            .pivot("Name",NamesToUse)
            .agg(F.max(F.struct([F.col("count"),F.col("Value")])))
            .select(*combinedCols)
            .cache() #nice to have, when plotting stuff.
           )

In [ ]:
joinedDf.orderBy("cvrNummer").show(10,False)

In [ ]:
#not scaled so depricated
scatterKapDf = joinedDf.select(["cvrNummer","aar","KAPITAL_vaerdi"]+NamesToUse[:4])

scatterDescribeDf = scatterKapDf.drop("cvrNummer").drop("aar").describe()
#scatterDescribeDf.show()

#scatterKapDf.count()
#axes = scatter_matrix(scatterKapDf.drop("cvrNummer").drop("aar").toPandas(),alpha=0.5,figsize=[30,30])

#[plt.setp(item.yaxis.get_majorticklabels(), 'size', 15) for item in axes.ravel()]
#x ticklabels
#[plt.setp(item.xaxis.get_majorticklabels(), 'size', 15) for item in axes.ravel()]
#[plt.setp(item.yaxis.get_label(), 'size', 20) for item in axes.ravel()]
#x labels
#[plt.setp(item.xaxis.get_label(), 'size', 20) for item in axes.ravel()]


#print(scatterKapDf.count())
#plt.show()

In [ ]:
#OKAY lets scale with mean and std and take log1p of the scaled stuff

description = (scatterDescribeDf
               .filter( (F.col("summary") == "mean")|(F.col("summary") == "stddev") )
               .rdd
               .map(lambda x: (x["summary"],x.asDict())).collectAsMap())
#print(description)
describBroadCast = sc.broadcast(description)

cols = [F.log1p((F.col(i)-F.lit(describBroadCast.value["mean"][i]))/F.lit(describBroadCast.value["stddev"][i])).alias(i) for i in ["KAPITAL_vaerdi"]+NamesToUse[:4]]

scaledScatterKapDf = scatterKapDf.select(*cols,"aar")
scatterDescribeDf = scaledScatterKapDf.describe()


scaledScatterKapDf.count()
axes = scatter_matrix(scaledScatterKapDf.drop("aar").toPandas(),alpha=0.5,figsize=[9,9])
#axes = scatter_matrix(scaledScatterKapDf.filter(F.col("aar") == 2012).drop("aar").toPandas(),alpha=0.5,figsize=[9,9])

[plt.setp(item.yaxis.get_majorticklabels(), 'size', 7) for item in axes.ravel()]
#x ticklabels
[plt.setp(item.xaxis.get_majorticklabels(), 'size', 7) for item in axes.ravel()]
[plt.setp(item.yaxis.get_label(), 'size', 7) for item in axes.ravel()]
#x labels
[plt.setp(item.xaxis.get_label(), 'size', 7) for item in axes.ravel()]

display(Markdown("### Hvordan er sammenhængen mellem kapitalforhøjelser og de 6 former for gæld?"))
#plt.title("Kapitalvaerdier mod de 6 former for gæld i regnskabsdata")
plt.show()

In [ ]:
#OK what's the correlation plot for 

corr = scaledScatterKapDf.drop("aar").toPandas().corr()
#corr = scaledScatterKapDf.filter(F.col("aar") == 2012).drop("aar").toPandas().corr()


cmap = sb.diverging_palette(220, 10, as_cmap=True)
sb.heatmap(corr, cmap=cmap,annot=True)
display(Markdown("### Korrelation i mellem Kapital og gældsformer"))
plt.title("Den egentlige korrelation imellem kapital og diverse former for gæld")
plt.show()
display(Markdown("OK der er ikke den store sammenhæng mellem kapital og gæld, men hvad nu hvis vi forskyder gæld. "
))

#so in initial conclusion, we can't see that much for when debt is in the same year as kapital. But what about when we shift kapital years bac

In [ ]:
def skewDebt(df,cols,years=1):
    "this is still too messy!"
    
    skeewWindow = (Window.partitionBy("cvrNummer").orderBy("aar"))

    skeewedDf =  (df
            .withColumn(col=F.lag(F.struct(*cols[:4]),count=years).over(skeewWindow),colName="oneYearLag")
            .select(["cvrNummer","aar","KAPITAL_vaerdi"]+[F.col("oneYearLag."+str(i)) for i in cols[:4]])
           )
    
    
    descr = (skeewedDf.describe()
               .filter( (F.col("summary") == "mean")|(F.col("summary") == "stddev") )
               .rdd
               .map(lambda x: (x["summary"],x.asDict())).collectAsMap())
    
    scaleCols = [F.log1p((F.col(i)-F.lit(descr["mean"][i]))/F.lit(descr["stddev"][i])).alias(i) for i in ["KAPITAL_vaerdi"]+cols[:4]]

    return (scatterKapDf.select(*scaleCols))
#skeewedDf.printSchema()

In [ ]:
skeewedDf1 = skewDebt(scatterKapDf,NamesToUse,1)
skeewedDf2 = skewDebt(scatterKapDf,NamesToUse,2)
skeewedDf3  = skewDebt(scatterKapDf,NamesToUse,3)
skeewedDf4  = skewDebt(scatterKapDf,NamesToUse,4)

In [ ]:
#skeewedDf1.printSchema()

In [ ]:
skeewArr = [skeewedDf1,skeewedDf2,skeewedDf3,skeewedDf4]
#skeewArr = [i.filter(F.col("aar")==2012).drop("aar") for i in [skeewedDf1,skeewedDf2,skeewedDf3,skeewedDf4]]

In [ ]:
# a heatmap of the correlation between the different variables:

# Compute the correlation matrix

plotLen = len(skeewArr)
fig, axes = plt.subplots(1,2,figsize=(25,10))
cmap = sb.diverging_palette(220, 10, as_cmap=True)
years = [1,2,3,4]
sb.set(font_scale=1.0)

ax0 = axes[0]
ax1 = axes[1]
ax0.set_title("Gældstyper sammenlignet med kapital fra 1 år siden.")
ax1.set_title("Gældstyper sammenlignet med kapital fra 2 år siden.")

sb.heatmap(skeewArr[0].na.drop("all").toPandas().corr(), cmap=cmap,annot=True,ax=ax0)
sb.heatmap(skeewArr[1].na.drop("all").toPandas().corr(), cmap=cmap,annot=True,ax=ax1)
    #scatter_matrix(data,alpha=0.5,ax=axes[x,y])
#plt.title("correlation between kapital and various forms of debt short and long term")
display(Markdown("### Sammenligning mellem gæld og forskudt kapital"))
#[plt.setp(item, 'rotation', 15) for item in axes.ravel()]

plt.show()

In [ ]:
plotLen = len(skeewArr)
fig, axes = plt.subplots(1,2,figsize=(25,10))
cmap = sb.diverging_palette(220, 10, as_cmap=True)
axes[0].set_title("Gældstyper sammenlignet med kapital fra 3 år siden.")
axes[1].set_title("Gældstyper sammenlignet med kapital fra 4 år siden.")
sb.heatmap(skeewArr[2].na.drop("all").toPandas().corr(), cmap=cmap,annot=True,ax=axes[0])
sb.heatmap(skeewArr[3].na.drop("all").toPandas().corr(), cmap=cmap,annot=True,ax=axes[1])
    #scatter_matrix(data,alpha=0.5,ax=axes[x,y])
#plt.title("correlation between kapital and various forms of debt short and long term")
display(Markdown("### Sammenligning mellem gæld og forskudt kapital"))
plt.show()

Opsumering

*


In [ ]: