In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import functions as F
from functools import reduce
from pyspark.sql.window import Window
#Python imports
import sys
In [4]:
#spark = SparkSession(sc).builder.master("local[*]").appName("TestingCvr").getOrCreate()
conf = sc.getConf()
conf.setAppName("TestingCvr")
print(conf.getAll())
sqlContext = SQLContext(sc)
[('hive.metastore.warehouse.dir', 'file:/home/svanhmic/workspace/Python/Erhvervs/src/notebooks/cvr/spark-warehouse'), ('spark.sql.catalogImplementation', 'hive'), ('spark.rdd.compress', 'True'), ('spark.app.name', 'TestingCvr'), ('spark.driver.host', '10.52.1.5'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.executor.id', 'driver'), ('spark.submit.deployMode', 'client'), ('spark.driver.port', '35356'), ('spark.driver.memory', '6G'), ('spark.app.id', 'local-1483957703077')]
In [5]:
if __name__ == '__main__':
cvrData = "/home/svanhmic/workspace/Python/Erhvervs/data/cdata/virksomhedersMetadata.json/"
allCVRData = "/home/svanhmic/workspace/Python/Erhvervs/data/cdata/AlleDatavirksomheder.json/"
cvrDf = sqlContext.read.json(allCVRData)
#
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-5-b36f50d0690d> in <module>()
2 cvrData = "/home/svanhmic/workspace/Python/Erhvervs/data/cdata/virksomhedersMetadata.json/"
3 allCVRData = "/home/svanhmic/workspace/Python/Erhvervs/data/cdata/AlleDatavirksomheder.json/"
----> 4 cvrDf = sqlContext.read.json(allCVRData)
5 #
/usr/local/share/spark/python/pyspark/sql/readwriter.py in json(self, path, schema, primitivesAsString, prefersDecimal, allowComments, allowUnquotedFieldNames, allowSingleQuotes, allowNumericLeadingZero, allowBackslashEscapingAnyCharacter, mode, columnNameOfCorruptRecord)
218 path = [path]
219 if type(path) == list:
--> 220 return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))
221 elif isinstance(path, RDD):
222 def func(iterator):
/usr/local/share/spark/python/lib/py4j-0.10.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
931 answer = self.gateway_client.send_command(command)
932 return_value = get_return_value(
--> 933 answer, self.gateway_client, self.target_id, self.name)
934
935 for temp_arg in temp_args:
/usr/local/share/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
/usr/local/share/spark/python/lib/py4j-0.10.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
310 raise Py4JJavaError(
311 "An error occurred while calling {0}{1}{2}.\n".
--> 312 format(target_id, ".", name), value)
313 else:
314 raise Py4JError(
Py4JJavaError: An error occurred while calling o151.json.
: java.lang.RuntimeException: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient
at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522)
at org.apache.spark.sql.hive.client.HiveClientImpl.<init>(HiveClientImpl.scala:171)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:258)
at org.apache.spark.sql.hive.HiveUtils$.newClientForMetadata(HiveUtils.scala:359)
at org.apache.spark.sql.hive.HiveUtils$.newClientForMetadata(HiveUtils.scala:263)
at org.apache.spark.sql.hive.HiveSharedState.metadataHive$lzycompute(HiveSharedState.scala:39)
at org.apache.spark.sql.hive.HiveSharedState.metadataHive(HiveSharedState.scala:38)
at org.apache.spark.sql.hive.HiveSharedState.externalCatalog$lzycompute(HiveSharedState.scala:46)
at org.apache.spark.sql.hive.HiveSharedState.externalCatalog(HiveSharedState.scala:45)
at org.apache.spark.sql.hive.HiveSessionState.catalog$lzycompute(HiveSessionState.scala:50)
at org.apache.spark.sql.hive.HiveSessionState.catalog(HiveSessionState.scala:48)
at org.apache.spark.sql.hive.HiveSessionState$$anon$1.<init>(HiveSessionState.scala:63)
at org.apache.spark.sql.hive.HiveSessionState.analyzer$lzycompute(HiveSessionState.scala:63)
at org.apache.spark.sql.hive.HiveSessionState.analyzer(HiveSessionState.scala:62)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:49)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:64)
at org.apache.spark.sql.SparkSession.baseRelationToDataFrame(SparkSession.scala:382)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:143)
at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:287)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:211)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient
at org.apache.hadoop.hive.metastore.MetaStoreUtils.newInstance(MetaStoreUtils.java:1523)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.<init>(RetryingMetaStoreClient.java:86)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:132)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:104)
at org.apache.hadoop.hive.ql.metadata.Hive.createMetaStoreClient(Hive.java:3005)
at org.apache.hadoop.hive.ql.metadata.Hive.getMSC(Hive.java:3024)
at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:503)
... 33 more
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.hadoop.hive.metastore.MetaStoreUtils.newInstance(MetaStoreUtils.java:1521)
... 39 more
Caused by: javax.jdo.JDOFatalDataStoreException: Unable to open a test connection to the given database. JDBC url = jdbc:derby:;databaseName=metastore_db;create=true, username = APP. Terminating connection pool (set lazyInit to true if you expect to start your database after your app). Original Exception: ------
java.sql.SQLException: Failed to start database 'metastore_db' with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@7a1ece89, see the next exception for details.
at org.apache.derby.impl.jdbc.SQLExceptionFactory.getSQLException(Unknown Source)
at org.apache.derby.impl.jdbc.SQLExceptionFactory.getSQLException(Unknown Source)
at org.apache.derby.impl.jdbc.Util.seeNextException(Unknown Source)
at org.apache.derby.impl.jdbc.EmbedConnection.bootDatabase(Unknown Source)
at org.apache.derby.impl.jdbc.EmbedConnection.<init>(Unknown Source)
at org.apache.derby.jdbc.InternalDriver.getNewEmbedConnection(Unknown Source)
at org.apache.derby.jdbc.InternalDriver.connect(Unknown Source)
at org.apache.derby.jdbc.InternalDriver.connect(Unknown Source)
at org.apache.derby.jdbc.AutoloadedDriver.connect(Unknown Source)
at java.sql.DriverManager.getConnection(DriverManager.java:664)
at java.sql.DriverManager.getConnection(DriverManager.java:208)
at com.jolbox.bonecp.BoneCP.obtainRawInternalConnection(BoneCP.java:361)
at com.jolbox.bonecp.BoneCP.<init>(BoneCP.java:416)
at com.jolbox.bonecp.BoneCPDataSource.getConnection(BoneCPDataSource.java:120)
at org.datanucleus.store.rdbms.ConnectionFactoryImpl$ManagedConnectionImpl.getConnection(ConnectionFactoryImpl.java:501)
at org.datanucleus.store.rdbms.RDBMSStoreManager.<init>(RDBMSStoreManager.java:298)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.datanucleus.plugin.NonManagedPluginRegistry.createExecutableExtension(NonManagedPluginRegistry.java:631)
at org.datanucleus.plugin.PluginManager.createExecutableExtension(PluginManager.java:301)
at org.datanucleus.NucleusContext.createStoreManagerForProperties(NucleusContext.java:1187)
at org.datanucleus.NucleusContext.initialise(NucleusContext.java:356)
at org.datanucleus.api.jdo.JDOPersistenceManagerFactory.freezeConfiguration(JDOPersistenceManagerFactory.java:775)
at org.datanucleus.api.jdo.JDOPersistenceManagerFactory.createPersistenceManagerFactory(JDOPersistenceManagerFactory.java:333)
at org.datanucleus.api.jdo.JDOPersistenceManagerFactory.getPersistenceManagerFactory(JDOPersistenceManagerFactory.java:202)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at javax.jdo.JDOHelper$16.run(JDOHelper.java:1965)
at java.security.AccessController.doPrivileged(Native Method)
at javax.jdo.JDOHelper.invoke(JDOHelper.java:1960)
at javax.jdo.JDOHelper.invokeGetPersistenceManagerFactoryOnImplementation(JDOHelper.java:1166)
at javax.jdo.JDOHelper.getPersistenceManagerFactory(JDOHelper.java:808)
at javax.jdo.JDOHelper.getPersistenceManagerFactory(JDOHelper.java:701)
at org.apache.hadoop.hive.metastore.ObjectStore.getPMF(ObjectStore.java:365)
at org.apache.hadoop.hive.metastore.ObjectStore.getPersistenceManager(ObjectStore.java:394)
at org.apache.hadoop.hive.metastore.ObjectStore.initialize(ObjectStore.java:291)
at org.apache.hadoop.hive.metastore.ObjectStore.setConf(ObjectStore.java:258)
at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:76)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:136)
at org.apache.hadoop.hive.metastore.RawStoreProxy.<init>(RawStoreProxy.java:57)
at org.apache.hadoop.hive.metastore.RawStoreProxy.getProxy(RawStoreProxy.java:66)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.newRawStore(HiveMetaStore.java:593)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.getMS(HiveMetaStore.java:571)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.createDefaultDB(HiveMetaStore.java:624)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.init(HiveMetaStore.java:461)
at org.apache.hadoop.hive.metastore.RetryingHMSHandler.<init>(RetryingHMSHandler.java:66)
at org.apache.hadoop.hive.metastore.RetryingHMSHandler.getProxy(RetryingHMSHandler.java:72)
at org.apache.hadoop.hive.metastore.HiveMetaStore.newRetryingHMSHandler(HiveMetaStore.java:5762)
at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.<init>(HiveMetaStoreClient.java:199)
at org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient.<init>(SessionHiveMetaStoreClient.java:74)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.hadoop.hive.metastore.MetaStoreUtils.newInstance(MetaStoreUtils.java:1521)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.<init>(RetryingMetaStoreClient.java:86)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:132)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:104)
at org.apache.hadoop.hive.ql.metadata.Hive.createMetaStoreClient(Hive.java:3005)
at org.apache.hadoop.hive.ql.metadata.Hive.getMSC(Hive.java:3024)
at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:503)
at org.apache.spark.sql.hive.client.HiveClientImpl.<init>(HiveClientImpl.scala:171)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:258)
at org.apache.spark.sql.hive.HiveUtils$.newClientForMetadata(HiveUtils.scala:359)
at org.apache.spark.sql.hive.HiveUtils$.newClientForMetadata(HiveUtils.scala:263)
at org.apache.spark.sql.hive.HiveSharedState.metadataHive$lzycompute(HiveSharedState.scala:39)
at org.apache.spark.sql.hive.HiveSharedState.metadataHive(HiveSharedState.scala:38)
at org.apache.spark.sql.hive.HiveSharedState.externalCatalog$lzycompute(HiveSharedState.scala:46)
at org.apache.spark.sql.hive.HiveSharedState.externalCatalog(HiveSharedState.scala:45)
at org.apache.spark.sql.hive.HiveSessionState.catalog$lzycompute(HiveSessionState.scala:50)
at org.apache.spark.sql.hive.HiveSessionState.catalog(HiveSessionState.scala:48)
at org.apache.spark.sql.hive.HiveSessionState$$anon$1.<init>(HiveSessionState.scala:63)
at org.apache.spark.sql.hive.HiveSessionState.analyzer$lzycompute(HiveSessionState.scala:63)
at org.apache.spark.sql.hive.HiveSessionState.analyzer(HiveSessionState.scala:62)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:49)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:64)
at org.apache.spark.sql.SparkSession.baseRelationToDataFrame(SparkSession.scala:382)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:143)
at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:287)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:211)
at java.lang.Thread.run(Thread.java:745)
Caused by: ERROR XJ040: Failed to start database 'metastore_db' with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@7a1ece89, see the next exception for details.
at org.apache.derby.iapi.error.StandardException.newException(Unknown Source)
at org.apache.derby.impl.jdbc.SQLExceptionFactory.wrapArgsForTransportAcrossDRDA(Unknown Source)
... 98 more
Caused by: ERROR XSDB6: Another instance of Derby may have already booted the database /home/svanhmic/workspace/Python/Erhvervs/src/notebooks/cvr/metastore_db.
at org.apache.derby.iapi.error.StandardException.newException(Unknown Source)
at org.apache.derby.iapi.error.StandardException.newException(Unknown Source)
at org.apache.derby.impl.store.raw.data.BaseDataFileFactory.privGetJBMSLockOnDB(Unknown Source)
at org.apache.derby.impl.store.raw.data.BaseDataFileFactory.run(Unknown Source)
at java.security.AccessController.doPrivileged(Native Method)
at org.apache.derby.impl.store.raw.data.BaseDataFileFactory.getJBMSLockOnDB(Unknown Source)
at org.apache.derby.impl.store.raw.data.BaseDataFileFactory.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.TopService.bootModule(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startModule(Unknown Source)
at org.apache.derby.impl.services.monitor.FileMonitor.startModule(Unknown Source)
at org.apache.derby.iapi.services.monitor.Monitor.bootServiceModule(Unknown Source)
at org.apache.derby.impl.store.raw.RawStore.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.TopService.bootModule(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startModule(Unknown Source)
at org.apache.derby.impl.services.monitor.FileMonitor.startModule(Unknown Source)
at org.apache.derby.iapi.services.monitor.Monitor.bootServiceModule(Unknown Source)
at org.apache.derby.impl.store.access.RAMAccessManager.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.TopService.bootModule(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startModule(Unknown Source)
at org.apache.derby.impl.services.monitor.FileMonitor.startModule(Unknown Source)
at org.apache.derby.iapi.services.monitor.Monitor.bootServiceModule(Unknown Source)
at org.apache.derby.impl.db.BasicDatabase.bootStore(Unknown Source)
at org.apache.derby.impl.db.BasicDatabase.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.TopService.bootModule(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.bootService(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startProviderService(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.findProviderAndStartService(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startPersistentService(Unknown Source)
at org.apache.derby.iapi.services.monitor.Monitor.startPersistentService(Unknown Source)
... 95 more
------
NestedThrowables:
java.sql.SQLException: Unable to open a test connection to the given database. JDBC url = jdbc:derby:;databaseName=metastore_db;create=true, username = APP. Terminating connection pool (set lazyInit to true if you expect to start your database after your app). Original Exception: ------
java.sql.SQLException: Failed to start database 'metastore_db' with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@7a1ece89, see the next exception for details.
at org.apache.derby.impl.jdbc.SQLExceptionFactory.getSQLException(Unknown Source)
at org.apache.derby.impl.jdbc.SQLExceptionFactory.getSQLException(Unknown Source)
at org.apache.derby.impl.jdbc.Util.seeNextException(Unknown Source)
at org.apache.derby.impl.jdbc.EmbedConnection.bootDatabase(Unknown Source)
at org.apache.derby.impl.jdbc.EmbedConnection.<init>(Unknown Source)
at org.apache.derby.jdbc.InternalDriver.getNewEmbedConnection(Unknown Source)
at org.apache.derby.jdbc.InternalDriver.connect(Unknown Source)
at org.apache.derby.jdbc.InternalDriver.connect(Unknown Source)
at org.apache.derby.jdbc.AutoloadedDriver.connect(Unknown Source)
at java.sql.DriverManager.getConnection(DriverManager.java:664)
at java.sql.DriverManager.getConnection(DriverManager.java:208)
at com.jolbox.bonecp.BoneCP.obtainRawInternalConnection(BoneCP.java:361)
at com.jolbox.bonecp.BoneCP.<init>(BoneCP.java:416)
at com.jolbox.bonecp.BoneCPDataSource.getConnection(BoneCPDataSource.java:120)
at org.datanucleus.store.rdbms.ConnectionFactoryImpl$ManagedConnectionImpl.getConnection(ConnectionFactoryImpl.java:501)
at org.datanucleus.store.rdbms.RDBMSStoreManager.<init>(RDBMSStoreManager.java:298)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.datanucleus.plugin.NonManagedPluginRegistry.createExecutableExtension(NonManagedPluginRegistry.java:631)
at org.datanucleus.plugin.PluginManager.createExecutableExtension(PluginManager.java:301)
at org.datanucleus.NucleusContext.createStoreManagerForProperties(NucleusContext.java:1187)
at org.datanucleus.NucleusContext.initialise(NucleusContext.java:356)
at org.datanucleus.api.jdo.JDOPersistenceManagerFactory.freezeConfiguration(JDOPersistenceManagerFactory.java:775)
at org.datanucleus.api.jdo.JDOPersistenceManagerFactory.createPersistenceManagerFactory(JDOPersistenceManagerFactory.java:333)
at org.datanucleus.api.jdo.JDOPersistenceManagerFactory.getPersistenceManagerFactory(JDOPersistenceManagerFactory.java:202)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at javax.jdo.JDOHelper$16.run(JDOHelper.java:1965)
at java.security.AccessController.doPrivileged(Native Method)
at javax.jdo.JDOHelper.invoke(JDOHelper.java:1960)
at javax.jdo.JDOHelper.invokeGetPersistenceManagerFactoryOnImplementation(JDOHelper.java:1166)
at javax.jdo.JDOHelper.getPersistenceManagerFactory(JDOHelper.java:808)
at javax.jdo.JDOHelper.getPersistenceManagerFactory(JDOHelper.java:701)
at org.apache.hadoop.hive.metastore.ObjectStore.getPMF(ObjectStore.java:365)
at org.apache.hadoop.hive.metastore.ObjectStore.getPersistenceManager(ObjectStore.java:394)
at org.apache.hadoop.hive.metastore.ObjectStore.initialize(ObjectStore.java:291)
at org.apache.hadoop.hive.metastore.ObjectStore.setConf(ObjectStore.java:258)
at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:76)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:136)
at org.apache.hadoop.hive.metastore.RawStoreProxy.<init>(RawStoreProxy.java:57)
at org.apache.hadoop.hive.metastore.RawStoreProxy.getProxy(RawStoreProxy.java:66)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.newRawStore(HiveMetaStore.java:593)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.getMS(HiveMetaStore.java:571)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.createDefaultDB(HiveMetaStore.java:624)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.init(HiveMetaStore.java:461)
at org.apache.hadoop.hive.metastore.RetryingHMSHandler.<init>(RetryingHMSHandler.java:66)
at org.apache.hadoop.hive.metastore.RetryingHMSHandler.getProxy(RetryingHMSHandler.java:72)
at org.apache.hadoop.hive.metastore.HiveMetaStore.newRetryingHMSHandler(HiveMetaStore.java:5762)
at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.<init>(HiveMetaStoreClient.java:199)
at org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient.<init>(SessionHiveMetaStoreClient.java:74)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.hadoop.hive.metastore.MetaStoreUtils.newInstance(MetaStoreUtils.java:1521)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.<init>(RetryingMetaStoreClient.java:86)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:132)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:104)
at org.apache.hadoop.hive.ql.metadata.Hive.createMetaStoreClient(Hive.java:3005)
at org.apache.hadoop.hive.ql.metadata.Hive.getMSC(Hive.java:3024)
at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:503)
at org.apache.spark.sql.hive.client.HiveClientImpl.<init>(HiveClientImpl.scala:171)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:258)
at org.apache.spark.sql.hive.HiveUtils$.newClientForMetadata(HiveUtils.scala:359)
at org.apache.spark.sql.hive.HiveUtils$.newClientForMetadata(HiveUtils.scala:263)
at org.apache.spark.sql.hive.HiveSharedState.metadataHive$lzycompute(HiveSharedState.scala:39)
at org.apache.spark.sql.hive.HiveSharedState.metadataHive(HiveSharedState.scala:38)
at org.apache.spark.sql.hive.HiveSharedState.externalCatalog$lzycompute(HiveSharedState.scala:46)
at org.apache.spark.sql.hive.HiveSharedState.externalCatalog(HiveSharedState.scala:45)
at org.apache.spark.sql.hive.HiveSessionState.catalog$lzycompute(HiveSessionState.scala:50)
at org.apache.spark.sql.hive.HiveSessionState.catalog(HiveSessionState.scala:48)
at org.apache.spark.sql.hive.HiveSessionState$$anon$1.<init>(HiveSessionState.scala:63)
at org.apache.spark.sql.hive.HiveSessionState.analyzer$lzycompute(HiveSessionState.scala:63)
at org.apache.spark.sql.hive.HiveSessionState.analyzer(HiveSessionState.scala:62)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:49)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:64)
at org.apache.spark.sql.SparkSession.baseRelationToDataFrame(SparkSession.scala:382)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:143)
at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:287)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:211)
at java.lang.Thread.run(Thread.java:745)
Caused by: ERROR XJ040: Failed to start database 'metastore_db' with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@7a1ece89, see the next exception for details.
at org.apache.derby.iapi.error.StandardException.newException(Unknown Source)
at org.apache.derby.impl.jdbc.SQLExceptionFactory.wrapArgsForTransportAcrossDRDA(Unknown Source)
... 98 more
Caused by: ERROR XSDB6: Another instance of Derby may have already booted the database /home/svanhmic/workspace/Python/Erhvervs/src/notebooks/cvr/metastore_db.
at org.apache.derby.iapi.error.StandardException.newException(Unknown Source)
at org.apache.derby.iapi.error.StandardException.newException(Unknown Source)
at org.apache.derby.impl.store.raw.data.BaseDataFileFactory.privGetJBMSLockOnDB(Unknown Source)
at org.apache.derby.impl.store.raw.data.BaseDataFileFactory.run(Unknown Source)
at java.security.AccessController.doPrivileged(Native Method)
at org.apache.derby.impl.store.raw.data.BaseDataFileFactory.getJBMSLockOnDB(Unknown Source)
at org.apache.derby.impl.store.raw.data.BaseDataFileFactory.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.TopService.bootModule(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startModule(Unknown Source)
at org.apache.derby.impl.services.monitor.FileMonitor.startModule(Unknown Source)
at org.apache.derby.iapi.services.monitor.Monitor.bootServiceModule(Unknown Source)
at org.apache.derby.impl.store.raw.RawStore.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.TopService.bootModule(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startModule(Unknown Source)
at org.apache.derby.impl.services.monitor.FileMonitor.startModule(Unknown Source)
at org.apache.derby.iapi.services.monitor.Monitor.bootServiceModule(Unknown Source)
at org.apache.derby.impl.store.access.RAMAccessManager.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.TopService.bootModule(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startModule(Unknown Source)
at org.apache.derby.impl.services.monitor.FileMonitor.startModule(Unknown Source)
at org.apache.derby.iapi.services.monitor.Monitor.bootServiceModule(Unknown Source)
at org.apache.derby.impl.db.BasicDatabase.bootStore(Unknown Source)
at org.apache.derby.impl.db.BasicDatabase.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.TopService.bootModule(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.bootService(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startProviderService(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.findProviderAndStartService(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startPersistentService(Unknown Source)
at org.apache.derby.iapi.services.monitor.Monitor.startPersistentService(Unknown Source)
... 95 more
------
at org.datanucleus.api.jdo.NucleusJDOHelper.getJDOExceptionForNucleusException(NucleusJDOHelper.java:436)
at org.datanucleus.api.jdo.JDOPersistenceManagerFactory.freezeConfiguration(JDOPersistenceManagerFactory.java:788)
at org.datanucleus.api.jdo.JDOPersistenceManagerFactory.createPersistenceManagerFactory(JDOPersistenceManagerFactory.java:333)
at org.datanucleus.api.jdo.JDOPersistenceManagerFactory.getPersistenceManagerFactory(JDOPersistenceManagerFactory.java:202)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at javax.jdo.JDOHelper$16.run(JDOHelper.java:1965)
at java.security.AccessController.doPrivileged(Native Method)
at javax.jdo.JDOHelper.invoke(JDOHelper.java:1960)
at javax.jdo.JDOHelper.invokeGetPersistenceManagerFactoryOnImplementation(JDOHelper.java:1166)
at javax.jdo.JDOHelper.getPersistenceManagerFactory(JDOHelper.java:808)
at javax.jdo.JDOHelper.getPersistenceManagerFactory(JDOHelper.java:701)
at org.apache.hadoop.hive.metastore.ObjectStore.getPMF(ObjectStore.java:365)
at org.apache.hadoop.hive.metastore.ObjectStore.getPersistenceManager(ObjectStore.java:394)
at org.apache.hadoop.hive.metastore.ObjectStore.initialize(ObjectStore.java:291)
at org.apache.hadoop.hive.metastore.ObjectStore.setConf(ObjectStore.java:258)
at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:76)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:136)
at org.apache.hadoop.hive.metastore.RawStoreProxy.<init>(RawStoreProxy.java:57)
at org.apache.hadoop.hive.metastore.RawStoreProxy.getProxy(RawStoreProxy.java:66)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.newRawStore(HiveMetaStore.java:593)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.getMS(HiveMetaStore.java:571)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.createDefaultDB(HiveMetaStore.java:624)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.init(HiveMetaStore.java:461)
at org.apache.hadoop.hive.metastore.RetryingHMSHandler.<init>(RetryingHMSHandler.java:66)
at org.apache.hadoop.hive.metastore.RetryingHMSHandler.getProxy(RetryingHMSHandler.java:72)
at org.apache.hadoop.hive.metastore.HiveMetaStore.newRetryingHMSHandler(HiveMetaStore.java:5762)
at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.<init>(HiveMetaStoreClient.java:199)
at org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient.<init>(SessionHiveMetaStoreClient.java:74)
... 44 more
Caused by: java.sql.SQLException: Unable to open a test connection to the given database. JDBC url = jdbc:derby:;databaseName=metastore_db;create=true, username = APP. Terminating connection pool (set lazyInit to true if you expect to start your database after your app). Original Exception: ------
java.sql.SQLException: Failed to start database 'metastore_db' with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@7a1ece89, see the next exception for details.
at org.apache.derby.impl.jdbc.SQLExceptionFactory.getSQLException(Unknown Source)
at org.apache.derby.impl.jdbc.SQLExceptionFactory.getSQLException(Unknown Source)
at org.apache.derby.impl.jdbc.Util.seeNextException(Unknown Source)
at org.apache.derby.impl.jdbc.EmbedConnection.bootDatabase(Unknown Source)
at org.apache.derby.impl.jdbc.EmbedConnection.<init>(Unknown Source)
at org.apache.derby.jdbc.InternalDriver.getNewEmbedConnection(Unknown Source)
at org.apache.derby.jdbc.InternalDriver.connect(Unknown Source)
at org.apache.derby.jdbc.InternalDriver.connect(Unknown Source)
at org.apache.derby.jdbc.AutoloadedDriver.connect(Unknown Source)
at java.sql.DriverManager.getConnection(DriverManager.java:664)
at java.sql.DriverManager.getConnection(DriverManager.java:208)
at com.jolbox.bonecp.BoneCP.obtainRawInternalConnection(BoneCP.java:361)
at com.jolbox.bonecp.BoneCP.<init>(BoneCP.java:416)
at com.jolbox.bonecp.BoneCPDataSource.getConnection(BoneCPDataSource.java:120)
at org.datanucleus.store.rdbms.ConnectionFactoryImpl$ManagedConnectionImpl.getConnection(ConnectionFactoryImpl.java:501)
at org.datanucleus.store.rdbms.RDBMSStoreManager.<init>(RDBMSStoreManager.java:298)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.datanucleus.plugin.NonManagedPluginRegistry.createExecutableExtension(NonManagedPluginRegistry.java:631)
at org.datanucleus.plugin.PluginManager.createExecutableExtension(PluginManager.java:301)
at org.datanucleus.NucleusContext.createStoreManagerForProperties(NucleusContext.java:1187)
at org.datanucleus.NucleusContext.initialise(NucleusContext.java:356)
at org.datanucleus.api.jdo.JDOPersistenceManagerFactory.freezeConfiguration(JDOPersistenceManagerFactory.java:775)
at org.datanucleus.api.jdo.JDOPersistenceManagerFactory.createPersistenceManagerFactory(JDOPersistenceManagerFactory.java:333)
at org.datanucleus.api.jdo.JDOPersistenceManagerFactory.getPersistenceManagerFactory(JDOPersistenceManagerFactory.java:202)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at javax.jdo.JDOHelper$16.run(JDOHelper.java:1965)
at java.security.AccessController.doPrivileged(Native Method)
at javax.jdo.JDOHelper.invoke(JDOHelper.java:1960)
at javax.jdo.JDOHelper.invokeGetPersistenceManagerFactoryOnImplementation(JDOHelper.java:1166)
at javax.jdo.JDOHelper.getPersistenceManagerFactory(JDOHelper.java:808)
at javax.jdo.JDOHelper.getPersistenceManagerFactory(JDOHelper.java:701)
at org.apache.hadoop.hive.metastore.ObjectStore.getPMF(ObjectStore.java:365)
at org.apache.hadoop.hive.metastore.ObjectStore.getPersistenceManager(ObjectStore.java:394)
at org.apache.hadoop.hive.metastore.ObjectStore.initialize(ObjectStore.java:291)
at org.apache.hadoop.hive.metastore.ObjectStore.setConf(ObjectStore.java:258)
at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:76)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:136)
at org.apache.hadoop.hive.metastore.RawStoreProxy.<init>(RawStoreProxy.java:57)
at org.apache.hadoop.hive.metastore.RawStoreProxy.getProxy(RawStoreProxy.java:66)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.newRawStore(HiveMetaStore.java:593)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.getMS(HiveMetaStore.java:571)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.createDefaultDB(HiveMetaStore.java:624)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.init(HiveMetaStore.java:461)
at org.apache.hadoop.hive.metastore.RetryingHMSHandler.<init>(RetryingHMSHandler.java:66)
at org.apache.hadoop.hive.metastore.RetryingHMSHandler.getProxy(RetryingHMSHandler.java:72)
at org.apache.hadoop.hive.metastore.HiveMetaStore.newRetryingHMSHandler(HiveMetaStore.java:5762)
at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.<init>(HiveMetaStoreClient.java:199)
at org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient.<init>(SessionHiveMetaStoreClient.java:74)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.hadoop.hive.metastore.MetaStoreUtils.newInstance(MetaStoreUtils.java:1521)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.<init>(RetryingMetaStoreClient.java:86)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:132)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:104)
at org.apache.hadoop.hive.ql.metadata.Hive.createMetaStoreClient(Hive.java:3005)
at org.apache.hadoop.hive.ql.metadata.Hive.getMSC(Hive.java:3024)
at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:503)
at org.apache.spark.sql.hive.client.HiveClientImpl.<init>(HiveClientImpl.scala:171)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:258)
at org.apache.spark.sql.hive.HiveUtils$.newClientForMetadata(HiveUtils.scala:359)
at org.apache.spark.sql.hive.HiveUtils$.newClientForMetadata(HiveUtils.scala:263)
at org.apache.spark.sql.hive.HiveSharedState.metadataHive$lzycompute(HiveSharedState.scala:39)
at org.apache.spark.sql.hive.HiveSharedState.metadataHive(HiveSharedState.scala:38)
at org.apache.spark.sql.hive.HiveSharedState.externalCatalog$lzycompute(HiveSharedState.scala:46)
at org.apache.spark.sql.hive.HiveSharedState.externalCatalog(HiveSharedState.scala:45)
at org.apache.spark.sql.hive.HiveSessionState.catalog$lzycompute(HiveSessionState.scala:50)
at org.apache.spark.sql.hive.HiveSessionState.catalog(HiveSessionState.scala:48)
at org.apache.spark.sql.hive.HiveSessionState$$anon$1.<init>(HiveSessionState.scala:63)
at org.apache.spark.sql.hive.HiveSessionState.analyzer$lzycompute(HiveSessionState.scala:63)
at org.apache.spark.sql.hive.HiveSessionState.analyzer(HiveSessionState.scala:62)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:49)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:64)
at org.apache.spark.sql.SparkSession.baseRelationToDataFrame(SparkSession.scala:382)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:143)
at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:287)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:211)
at java.lang.Thread.run(Thread.java:745)
Caused by: ERROR XJ040: Failed to start database 'metastore_db' with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@7a1ece89, see the next exception for details.
at org.apache.derby.iapi.error.StandardException.newException(Unknown Source)
at org.apache.derby.impl.jdbc.SQLExceptionFactory.wrapArgsForTransportAcrossDRDA(Unknown Source)
... 98 more
Caused by: ERROR XSDB6: Another instance of Derby may have already booted the database /home/svanhmic/workspace/Python/Erhvervs/src/notebooks/cvr/metastore_db.
at org.apache.derby.iapi.error.StandardException.newException(Unknown Source)
at org.apache.derby.iapi.error.StandardException.newException(Unknown Source)
at org.apache.derby.impl.store.raw.data.BaseDataFileFactory.privGetJBMSLockOnDB(Unknown Source)
at org.apache.derby.impl.store.raw.data.BaseDataFileFactory.run(Unknown Source)
at java.security.AccessController.doPrivileged(Native Method)
at org.apache.derby.impl.store.raw.data.BaseDataFileFactory.getJBMSLockOnDB(Unknown Source)
at org.apache.derby.impl.store.raw.data.BaseDataFileFactory.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.TopService.bootModule(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startModule(Unknown Source)
at org.apache.derby.impl.services.monitor.FileMonitor.startModule(Unknown Source)
at org.apache.derby.iapi.services.monitor.Monitor.bootServiceModule(Unknown Source)
at org.apache.derby.impl.store.raw.RawStore.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.TopService.bootModule(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startModule(Unknown Source)
at org.apache.derby.impl.services.monitor.FileMonitor.startModule(Unknown Source)
at org.apache.derby.iapi.services.monitor.Monitor.bootServiceModule(Unknown Source)
at org.apache.derby.impl.store.access.RAMAccessManager.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.TopService.bootModule(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startModule(Unknown Source)
at org.apache.derby.impl.services.monitor.FileMonitor.startModule(Unknown Source)
at org.apache.derby.iapi.services.monitor.Monitor.bootServiceModule(Unknown Source)
at org.apache.derby.impl.db.BasicDatabase.bootStore(Unknown Source)
at org.apache.derby.impl.db.BasicDatabase.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.TopService.bootModule(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.bootService(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startProviderService(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.findProviderAndStartService(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startPersistentService(Unknown Source)
at org.apache.derby.iapi.services.monitor.Monitor.startPersistentService(Unknown Source)
... 95 more
------
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at com.jolbox.bonecp.PoolUtil.generateSQLException(PoolUtil.java:192)
at com.jolbox.bonecp.BoneCP.<init>(BoneCP.java:422)
at com.jolbox.bonecp.BoneCPDataSource.getConnection(BoneCPDataSource.java:120)
at org.datanucleus.store.rdbms.ConnectionFactoryImpl$ManagedConnectionImpl.getConnection(ConnectionFactoryImpl.java:501)
at org.datanucleus.store.rdbms.RDBMSStoreManager.<init>(RDBMSStoreManager.java:298)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.datanucleus.plugin.NonManagedPluginRegistry.createExecutableExtension(NonManagedPluginRegistry.java:631)
at org.datanucleus.plugin.PluginManager.createExecutableExtension(PluginManager.java:301)
at org.datanucleus.NucleusContext.createStoreManagerForProperties(NucleusContext.java:1187)
at org.datanucleus.NucleusContext.initialise(NucleusContext.java:356)
at org.datanucleus.api.jdo.JDOPersistenceManagerFactory.freezeConfiguration(JDOPersistenceManagerFactory.java:775)
... 73 more
Caused by: java.sql.SQLException: Failed to start database 'metastore_db' with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@7a1ece89, see the next exception for details.
at org.apache.derby.impl.jdbc.SQLExceptionFactory.getSQLException(Unknown Source)
at org.apache.derby.impl.jdbc.SQLExceptionFactory.getSQLException(Unknown Source)
at org.apache.derby.impl.jdbc.Util.seeNextException(Unknown Source)
at org.apache.derby.impl.jdbc.EmbedConnection.bootDatabase(Unknown Source)
at org.apache.derby.impl.jdbc.EmbedConnection.<init>(Unknown Source)
at org.apache.derby.jdbc.InternalDriver.getNewEmbedConnection(Unknown Source)
at org.apache.derby.jdbc.InternalDriver.connect(Unknown Source)
at org.apache.derby.jdbc.InternalDriver.connect(Unknown Source)
at org.apache.derby.jdbc.AutoloadedDriver.connect(Unknown Source)
at java.sql.DriverManager.getConnection(DriverManager.java:664)
at java.sql.DriverManager.getConnection(DriverManager.java:208)
at com.jolbox.bonecp.BoneCP.obtainRawInternalConnection(BoneCP.java:361)
at com.jolbox.bonecp.BoneCP.<init>(BoneCP.java:416)
... 85 more
Caused by: ERROR XJ040: Failed to start database 'metastore_db' with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@7a1ece89, see the next exception for details.
at org.apache.derby.iapi.error.StandardException.newException(Unknown Source)
at org.apache.derby.impl.jdbc.SQLExceptionFactory.wrapArgsForTransportAcrossDRDA(Unknown Source)
... 98 more
Caused by: ERROR XSDB6: Another instance of Derby may have already booted the database /home/svanhmic/workspace/Python/Erhvervs/src/notebooks/cvr/metastore_db.
at org.apache.derby.iapi.error.StandardException.newException(Unknown Source)
at org.apache.derby.iapi.error.StandardException.newException(Unknown Source)
at org.apache.derby.impl.store.raw.data.BaseDataFileFactory.privGetJBMSLockOnDB(Unknown Source)
at org.apache.derby.impl.store.raw.data.BaseDataFileFactory.run(Unknown Source)
at java.security.AccessController.doPrivileged(Native Method)
at org.apache.derby.impl.store.raw.data.BaseDataFileFactory.getJBMSLockOnDB(Unknown Source)
at org.apache.derby.impl.store.raw.data.BaseDataFileFactory.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.TopService.bootModule(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startModule(Unknown Source)
at org.apache.derby.impl.services.monitor.FileMonitor.startModule(Unknown Source)
at org.apache.derby.iapi.services.monitor.Monitor.bootServiceModule(Unknown Source)
at org.apache.derby.impl.store.raw.RawStore.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.TopService.bootModule(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startModule(Unknown Source)
at org.apache.derby.impl.services.monitor.FileMonitor.startModule(Unknown Source)
at org.apache.derby.iapi.services.monitor.Monitor.bootServiceModule(Unknown Source)
at org.apache.derby.impl.store.access.RAMAccessManager.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.TopService.bootModule(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startModule(Unknown Source)
at org.apache.derby.impl.services.monitor.FileMonitor.startModule(Unknown Source)
at org.apache.derby.iapi.services.monitor.Monitor.bootServiceModule(Unknown Source)
at org.apache.derby.impl.db.BasicDatabase.bootStore(Unknown Source)
at org.apache.derby.impl.db.BasicDatabase.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.boot(Unknown Source)
at org.apache.derby.impl.services.monitor.TopService.bootModule(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.bootService(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startProviderService(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.findProviderAndStartService(Unknown Source)
at org.apache.derby.impl.services.monitor.BaseMonitor.startPersistentService(Unknown Source)
at org.apache.derby.iapi.services.monitor.Monitor.startPersistentService(Unknown Source)
... 95 more
In [4]:
cvrDf.printSchema()
root
|-- virksomhed: struct (nullable = true)
| |-- aarsbeskaeftigelse: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- aar: long (nullable = true)
| | | |-- intervalKodeAntalAarsvaerk: string (nullable = true)
| | | |-- intervalKodeAntalAnsatte: string (nullable = true)
| | | |-- intervalKodeAntalInklusivEjere: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| |-- attributter: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- sekvensnr: long (nullable = true)
| | | |-- type: string (nullable = true)
| | | |-- vaerdier: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- periode: struct (nullable = true)
| | | | | | |-- gyldigFra: string (nullable = true)
| | | | | | |-- gyldigTil: string (nullable = true)
| | | | | |-- sidstOpdateret: string (nullable = true)
| | | | | |-- vaerdi: string (nullable = true)
| | | |-- vaerditype: string (nullable = true)
| |-- beliggenhedsadresse: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- bogstavFra: string (nullable = true)
| | | |-- bogstavTil: string (nullable = true)
| | | |-- bynavn: string (nullable = true)
| | | |-- conavn: string (nullable = true)
| | | |-- etage: string (nullable = true)
| | | |-- fritekst: string (nullable = true)
| | | |-- husnummerFra: long (nullable = true)
| | | |-- husnummerTil: long (nullable = true)
| | | |-- kommune: struct (nullable = true)
| | | | |-- kommuneKode: long (nullable = true)
| | | | |-- kommuneNavn: string (nullable = true)
| | | | |-- periode: struct (nullable = true)
| | | | | |-- gyldigFra: string (nullable = true)
| | | | | |-- gyldigTil: string (nullable = true)
| | | | |-- sidstOpdateret: string (nullable = true)
| | | |-- landekode: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- postboks: string (nullable = true)
| | | |-- postdistrikt: string (nullable = true)
| | | |-- postnummer: long (nullable = true)
| | | |-- sidedoer: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| | | |-- vejkode: long (nullable = true)
| | | |-- vejnavn: string (nullable = true)
| |-- bibranche1: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- branchekode: string (nullable = true)
| | | |-- branchetekst: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| |-- bibranche2: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- branchekode: string (nullable = true)
| | | |-- branchetekst: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| |-- bibranche3: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- branchekode: string (nullable = true)
| | | |-- branchetekst: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| |-- binavne: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- navn: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| |-- brancheAnsvarskode: long (nullable = true)
| |-- cvrNummer: long (nullable = true)
| |-- dataAdgang: long (nullable = true)
| |-- deltagerRelation: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- deltager: struct (nullable = true)
| | | | |-- beliggenhedsadresse: array (nullable = true)
| | | | | |-- element: struct (containsNull = true)
| | | | | | |-- bogstavFra: string (nullable = true)
| | | | | | |-- bogstavTil: string (nullable = true)
| | | | | | |-- bynavn: string (nullable = true)
| | | | | | |-- conavn: string (nullable = true)
| | | | | | |-- etage: string (nullable = true)
| | | | | | |-- fritekst: string (nullable = true)
| | | | | | |-- husnummerFra: long (nullable = true)
| | | | | | |-- husnummerTil: long (nullable = true)
| | | | | | |-- kommune: struct (nullable = true)
| | | | | | | |-- kommuneKode: long (nullable = true)
| | | | | | | |-- kommuneNavn: string (nullable = true)
| | | | | | | |-- periode: struct (nullable = true)
| | | | | | | | |-- gyldigFra: string (nullable = true)
| | | | | | | | |-- gyldigTil: string (nullable = true)
| | | | | | | |-- sidstOpdateret: string (nullable = true)
| | | | | | |-- landekode: string (nullable = true)
| | | | | | |-- periode: struct (nullable = true)
| | | | | | | |-- gyldigFra: string (nullable = true)
| | | | | | | |-- gyldigTil: string (nullable = true)
| | | | | | |-- postboks: string (nullable = true)
| | | | | | |-- postdistrikt: string (nullable = true)
| | | | | | |-- postnummer: long (nullable = true)
| | | | | | |-- sidedoer: string (nullable = true)
| | | | | | |-- sidstOpdateret: string (nullable = true)
| | | | | | |-- vejkode: long (nullable = true)
| | | | | | |-- vejnavn: string (nullable = true)
| | | | |-- enhedsNummer: long (nullable = true)
| | | | |-- enhedstype: string (nullable = true)
| | | | |-- forretningsnoegle: long (nullable = true)
| | | | |-- navne: array (nullable = true)
| | | | | |-- element: struct (containsNull = true)
| | | | | | |-- navn: string (nullable = true)
| | | | | | |-- periode: struct (nullable = true)
| | | | | | | |-- gyldigFra: string (nullable = true)
| | | | | | | |-- gyldigTil: string (nullable = true)
| | | | | | |-- sidstOpdateret: string (nullable = true)
| | | | |-- sidstIndlaest: string (nullable = true)
| | | |-- kontorsteder: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- attributter: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- sekvensnr: long (nullable = true)
| | | | | | | |-- type: string (nullable = true)
| | | | | | | |-- vaerdier: array (nullable = true)
| | | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | | |-- periode: struct (nullable = true)
| | | | | | | | | | |-- gyldigFra: string (nullable = true)
| | | | | | | | | | |-- gyldigTil: string (nullable = true)
| | | | | | | | | |-- sidstOpdateret: string (nullable = true)
| | | | | | | | | |-- vaerdi: string (nullable = true)
| | | | | | | |-- vaerditype: string (nullable = true)
| | | | | |-- penhed: struct (nullable = true)
| | | | | | |-- beliggenhedsadresse: array (nullable = true)
| | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | |-- bogstavFra: string (nullable = true)
| | | | | | | | |-- bogstavTil: string (nullable = true)
| | | | | | | | |-- bynavn: string (nullable = true)
| | | | | | | | |-- conavn: string (nullable = true)
| | | | | | | | |-- etage: string (nullable = true)
| | | | | | | | |-- husnummerFra: long (nullable = true)
| | | | | | | | |-- husnummerTil: long (nullable = true)
| | | | | | | | |-- kommune: struct (nullable = true)
| | | | | | | | | |-- kommuneKode: long (nullable = true)
| | | | | | | | | |-- kommuneNavn: string (nullable = true)
| | | | | | | | | |-- periode: struct (nullable = true)
| | | | | | | | | | |-- gyldigFra: string (nullable = true)
| | | | | | | | | | |-- gyldigTil: string (nullable = true)
| | | | | | | | | |-- sidstOpdateret: string (nullable = true)
| | | | | | | | |-- landekode: string (nullable = true)
| | | | | | | | |-- periode: struct (nullable = true)
| | | | | | | | | |-- gyldigFra: string (nullable = true)
| | | | | | | | | |-- gyldigTil: string (nullable = true)
| | | | | | | | |-- postboks: string (nullable = true)
| | | | | | | | |-- postdistrikt: string (nullable = true)
| | | | | | | | |-- postnummer: long (nullable = true)
| | | | | | | | |-- sidedoer: string (nullable = true)
| | | | | | | | |-- sidstOpdateret: string (nullable = true)
| | | | | | | | |-- vejkode: long (nullable = true)
| | | | | | | | |-- vejnavn: string (nullable = true)
| | | | | | |-- enhedsNummer: long (nullable = true)
| | | | | | |-- enhedstype: string (nullable = true)
| | | | | | |-- forretningsnoegle: long (nullable = true)
| | | | | | |-- navne: array (nullable = true)
| | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | |-- navn: string (nullable = true)
| | | | | | | | |-- periode: struct (nullable = true)
| | | | | | | | | |-- gyldigFra: string (nullable = true)
| | | | | | | | | |-- gyldigTil: string (nullable = true)
| | | | | | | | |-- sidstOpdateret: string (nullable = true)
| | | | | | |-- sidstIndlaest: string (nullable = true)
| | | |-- organisationer: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- attributter: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- sekvensnr: long (nullable = true)
| | | | | | | |-- type: string (nullable = true)
| | | | | | | |-- vaerdier: array (nullable = true)
| | | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | | |-- periode: struct (nullable = true)
| | | | | | | | | | |-- gyldigFra: string (nullable = true)
| | | | | | | | | | |-- gyldigTil: string (nullable = true)
| | | | | | | | | |-- sidstOpdateret: string (nullable = true)
| | | | | | | | | |-- vaerdi: string (nullable = true)
| | | | | | | |-- vaerditype: string (nullable = true)
| | | | | |-- enhedsNummerOrganisation: long (nullable = true)
| | | | | |-- hovedtype: string (nullable = true)
| | | | | |-- medlemsData: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- attributter: array (nullable = true)
| | | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | | |-- sekvensnr: long (nullable = true)
| | | | | | | | | |-- type: string (nullable = true)
| | | | | | | | | |-- vaerdier: array (nullable = true)
| | | | | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | | | | |-- periode: struct (nullable = true)
| | | | | | | | | | | | |-- gyldigFra: string (nullable = true)
| | | | | | | | | | | | |-- gyldigTil: string (nullable = true)
| | | | | | | | | | | |-- sidstOpdateret: string (nullable = true)
| | | | | | | | | | | |-- vaerdi: string (nullable = true)
| | | | | | | | | |-- vaerditype: string (nullable = true)
| | | | | |-- organisationsNavn: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- navn: string (nullable = true)
| | | | | | | |-- periode: struct (nullable = true)
| | | | | | | | |-- gyldigFra: string (nullable = true)
| | | | | | | | |-- gyldigTil: string (nullable = true)
| | | | | | | |-- sidstOpdateret: string (nullable = true)
| |-- elektroniskPost: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- hemmelig: boolean (nullable = true)
| | | |-- kontaktoplysning: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| |-- enhedsNummer: long (nullable = true)
| |-- enhedstype: string (nullable = true)
| |-- fejlBeskrivelse: string (nullable = true)
| |-- fejlRegistreret: boolean (nullable = true)
| |-- fejlVedIndlaesning: boolean (nullable = true)
| |-- fusioner: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- enhedsNummerOrganisation: long (nullable = true)
| | | |-- indgaaende: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- sekvensnr: long (nullable = true)
| | | | | |-- type: string (nullable = true)
| | | | | |-- vaerdier: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- periode: struct (nullable = true)
| | | | | | | | |-- gyldigFra: string (nullable = true)
| | | | | | | | |-- gyldigTil: string (nullable = true)
| | | | | | | |-- sidstOpdateret: string (nullable = true)
| | | | | | | |-- vaerdi: string (nullable = true)
| | | | | |-- vaerditype: string (nullable = true)
| | | |-- organisationsNavn: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- navn: string (nullable = true)
| | | | | |-- periode: struct (nullable = true)
| | | | | | |-- gyldigFra: string (nullable = true)
| | | | | | |-- gyldigTil: string (nullable = true)
| | | | | |-- sidstOpdateret: string (nullable = true)
| | | |-- udgaaende: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- sekvensnr: long (nullable = true)
| | | | | |-- type: string (nullable = true)
| | | | | |-- vaerdier: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- periode: struct (nullable = true)
| | | | | | | | |-- gyldigFra: string (nullable = true)
| | | | | | | | |-- gyldigTil: string (nullable = true)
| | | | | | | |-- sidstOpdateret: string (nullable = true)
| | | | | | | |-- vaerdi: string (nullable = true)
| | | | | |-- vaerditype: string (nullable = true)
| |-- hjemmeside: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- hemmelig: boolean (nullable = true)
| | | |-- kontaktoplysning: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| |-- hovedbranche: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- branchekode: string (nullable = true)
| | | |-- branchetekst: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| |-- kvartalsbeskaeftigelse: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- aar: long (nullable = true)
| | | |-- intervalKodeAntalAarsvaerk: string (nullable = true)
| | | |-- intervalKodeAntalAnsatte: string (nullable = true)
| | | |-- kvartal: long (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| |-- livsforloeb: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| |-- maanedsbeskaeftigelse: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- aar: long (nullable = true)
| | | |-- intervalKodeAntalAarsvaerk: string (nullable = true)
| | | |-- intervalKodeAntalAnsatte: string (nullable = true)
| | | |-- maaned: long (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| |-- naermesteFremtidigeDato: string (nullable = true)
| |-- navne: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- navn: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| |-- obligatoriskEmail: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- hemmelig: boolean (nullable = true)
| | | |-- kontaktoplysning: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| |-- penheder: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- pNummer: long (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| |-- postadresse: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- bogstavFra: string (nullable = true)
| | | |-- bynavn: string (nullable = true)
| | | |-- conavn: string (nullable = true)
| | | |-- etage: string (nullable = true)
| | | |-- fritekst: string (nullable = true)
| | | |-- husnummerFra: long (nullable = true)
| | | |-- husnummerTil: long (nullable = true)
| | | |-- kommune: struct (nullable = true)
| | | | |-- kommuneKode: long (nullable = true)
| | | | |-- kommuneNavn: string (nullable = true)
| | | | |-- periode: struct (nullable = true)
| | | | | |-- gyldigFra: string (nullable = true)
| | | | | |-- gyldigTil: string (nullable = true)
| | | | |-- sidstOpdateret: string (nullable = true)
| | | |-- landekode: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- postboks: string (nullable = true)
| | | |-- postdistrikt: string (nullable = true)
| | | |-- postnummer: long (nullable = true)
| | | |-- sidedoer: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| | | |-- vejkode: long (nullable = true)
| | | |-- vejnavn: string (nullable = true)
| |-- regNummer: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- regnummer: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| |-- reklamebeskyttet: boolean (nullable = true)
| |-- samtId: long (nullable = true)
| |-- sidstIndlaest: string (nullable = true)
| |-- sidstOpdateret: string (nullable = true)
| |-- spaltninger: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- enhedsNummerOrganisation: long (nullable = true)
| | | |-- indgaaende: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- sekvensnr: long (nullable = true)
| | | | | |-- type: string (nullable = true)
| | | | | |-- vaerdier: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- periode: struct (nullable = true)
| | | | | | | | |-- gyldigFra: string (nullable = true)
| | | | | | | | |-- gyldigTil: string (nullable = true)
| | | | | | | |-- sidstOpdateret: string (nullable = true)
| | | | | | | |-- vaerdi: string (nullable = true)
| | | | | |-- vaerditype: string (nullable = true)
| | | |-- organisationsNavn: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- navn: string (nullable = true)
| | | | | |-- periode: struct (nullable = true)
| | | | | | |-- gyldigFra: string (nullable = true)
| | | | | | |-- gyldigTil: string (nullable = true)
| | | | | |-- sidstOpdateret: string (nullable = true)
| | | |-- udgaaende: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- sekvensnr: long (nullable = true)
| | | | | |-- type: string (nullable = true)
| | | | | |-- vaerdier: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- periode: struct (nullable = true)
| | | | | | | | |-- gyldigFra: string (nullable = true)
| | | | | | | | |-- gyldigTil: string (nullable = true)
| | | | | | | |-- sidstOpdateret: string (nullable = true)
| | | | | | | |-- vaerdi: string (nullable = true)
| | | | | |-- vaerditype: string (nullable = true)
| |-- status: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- kreditoplysningkode: long (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| | | |-- statuskode: long (nullable = true)
| |-- telefaxNummer: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- hemmelig: boolean (nullable = true)
| | | |-- kontaktoplysning: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| |-- telefonNummer: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- hemmelig: boolean (nullable = true)
| | | |-- kontaktoplysning: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| |-- virkningsAktoer: string (nullable = true)
| |-- virksomhedMetadata: struct (nullable = true)
| | |-- antalPenheder: long (nullable = true)
| | |-- nyesteAarsbeskaeftigelse: struct (nullable = true)
| | | |-- aar: long (nullable = true)
| | | |-- intervalKodeAntalAarsvaerk: string (nullable = true)
| | | |-- intervalKodeAntalAnsatte: string (nullable = true)
| | | |-- intervalKodeAntalInklusivEjere: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| | |-- nyesteBeliggenhedsadresse: struct (nullable = true)
| | | |-- bogstavFra: string (nullable = true)
| | | |-- bogstavTil: string (nullable = true)
| | | |-- bynavn: string (nullable = true)
| | | |-- conavn: string (nullable = true)
| | | |-- etage: string (nullable = true)
| | | |-- fritekst: string (nullable = true)
| | | |-- husnummerFra: long (nullable = true)
| | | |-- husnummerTil: long (nullable = true)
| | | |-- kommune: struct (nullable = true)
| | | | |-- kommuneKode: long (nullable = true)
| | | | |-- kommuneNavn: string (nullable = true)
| | | | |-- periode: struct (nullable = true)
| | | | | |-- gyldigFra: string (nullable = true)
| | | | | |-- gyldigTil: string (nullable = true)
| | | | |-- sidstOpdateret: string (nullable = true)
| | | |-- landekode: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- postboks: string (nullable = true)
| | | |-- postdistrikt: string (nullable = true)
| | | |-- postnummer: long (nullable = true)
| | | |-- sidedoer: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| | | |-- vejkode: long (nullable = true)
| | | |-- vejnavn: string (nullable = true)
| | |-- nyesteBibranche1: struct (nullable = true)
| | | |-- branchekode: string (nullable = true)
| | | |-- branchetekst: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| | |-- nyesteBibranche2: struct (nullable = true)
| | | |-- branchekode: string (nullable = true)
| | | |-- branchetekst: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| | |-- nyesteBibranche3: struct (nullable = true)
| | | |-- branchekode: string (nullable = true)
| | | |-- branchetekst: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| | |-- nyesteFadCprnumre: array (nullable = true)
| | | |-- element: string (containsNull = true)
| | |-- nyesteHovedbranche: struct (nullable = true)
| | | |-- branchekode: string (nullable = true)
| | | |-- branchetekst: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| | |-- nyesteKontaktoplysninger: array (nullable = true)
| | | |-- element: string (containsNull = true)
| | |-- nyesteKvartalsbeskaeftigelse: struct (nullable = true)
| | | |-- aar: long (nullable = true)
| | | |-- intervalKodeAntalAarsvaerk: string (nullable = true)
| | | |-- intervalKodeAntalAnsatte: string (nullable = true)
| | | |-- kvartal: long (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| | |-- nyesteMaanedsbeskaeftigelse: struct (nullable = true)
| | | |-- aar: long (nullable = true)
| | | |-- intervalKodeAntalAarsvaerk: string (nullable = true)
| | | |-- intervalKodeAntalAnsatte: string (nullable = true)
| | | |-- maaned: long (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| | |-- nyesteNavn: struct (nullable = true)
| | | |-- navn: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| | |-- nyesteStatus: struct (nullable = true)
| | | |-- kreditoplysningkode: long (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| | | |-- statuskode: long (nullable = true)
| | |-- nyesteVirksomhedsform: struct (nullable = true)
| | | |-- ansvarligDataleverandoer: string (nullable = true)
| | | |-- kortBeskrivelse: string (nullable = true)
| | | |-- langBeskrivelse: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| | | |-- virksomhedsformkode: long (nullable = true)
| | |-- sammensatStatus: string (nullable = true)
| | |-- stiftelsesDato: string (nullable = true)
| | |-- virkningsDato: string (nullable = true)
| |-- virksomhedsform: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- ansvarligDataleverandoer: string (nullable = true)
| | | |-- kortBeskrivelse: string (nullable = true)
| | | |-- langBeskrivelse: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| | | |-- virksomhedsformkode: long (nullable = true)
| |-- virksomhedsstatus: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| | | |-- status: string (nullable = true)
In [5]:
def getNextSchemaLayer(schema,idx,name="name"):
schemaDict = schema[idx].jsonValue()
return list([i[name] for i in schemaDict["type"]["fields"][:]])
In [6]:
def createNextLayerTable(df,nonExplodedColumns,explodedColumn,*nonExplodedPrefix):
'''
The method takes a dataframe and explodes a column of choice such that the contence is more accessible
input
- df: data frame with information
- nonExplodedColumns: List with columns that are "static" the columns are not altered e.g. cvr
- explodedColumn: String with the name of the column that gets exploded, note that column must be of an arraytype
- nonExplodedPrefix: String if values are nested inside a
output
- output: data frame where nonExplodedColumns are unchanged, except for that the values are dupplicated with
for the explodedColum
'''
if len(nonExplodedPrefix) == 1:
prefixedStr = nonExplodedPrefix[0]+"."
else:
prefixedStr = ".".join(nonExplodedPrefix)
if nonExplodedPrefix is None:
relationsDf = df.select([df[v].alias(v) for v in nonExplodedColumns]+
[F.explode(df[explodedColumn]).alias(explodedColumn)])
else:
relationsDf = df.select([df[prefixedStr+v].alias(v) for v in nonExplodedColumns]+
[F.explode(df[prefixedStr+explodedColumn]).alias(explodedColumn)])
dfSchema = getNextSchemaLayer(relationsDf.schema,explodedColumn)
return (relationsDf
.select([relationsDf[u] for u in nonExplodedColumns]
+[relationsDf[explodedColumn][v].alias(v) for v in dfSchema]))
In [7]:
def expandSubCols(df,mainColumn):
'''
The method expands all subcolumns in the next layer of mainColumn
input:
df - data frame with data
mainColumn - the column that contains the subcolumns that should be flattent
'''
dfCols = df.columns
dfCols.remove(mainColumn)
return df.select([df[v] for v in dfCols]+
[df[mainColumn][u].alias(u) for u in getNextSchemaLayer(schema=df.schema,idx=mainColumn)])
In [ ]:
In [8]:
secondLayerSchema = getNextSchemaLayer(cvrDf.schema,"virksomhed")
#cvrDf.select(cvrDf["virksomhed.attributter.sekvensnr"]).show()
print(secondLayerSchema)
['aarsbeskaeftigelse', 'attributter', 'beliggenhedsadresse', 'bibranche1', 'bibranche2', 'bibranche3', 'binavne', 'brancheAnsvarskode', 'cvrNummer', 'dataAdgang', 'deltagerRelation', 'elektroniskPost', 'enhedsNummer', 'enhedstype', 'fejlBeskrivelse', 'fejlRegistreret', 'fejlVedIndlaesning', 'fusioner', 'hjemmeside', 'hovedbranche', 'kvartalsbeskaeftigelse', 'livsforloeb', 'maanedsbeskaeftigelse', 'naermesteFremtidigeDato', 'navne', 'obligatoriskEmail', 'penheder', 'postadresse', 'regNummer', 'reklamebeskyttet', 'samtId', 'sidstIndlaest', 'sidstOpdateret', 'spaltninger', 'status', 'telefaxNummer', 'telefonNummer', 'virkningsAktoer', 'virksomhedMetadata', 'virksomhedsform', 'virksomhedsstatus']
In [9]:
#Aarsvaerk as a table
vaekstDf = createNextLayerTable(cvrDf,["cvrNummer"],"aarsbeskaeftigelse","virksomhed")
#vaekstDf.show()
regexedVaekstDf = (vaekstDf.select(vaekstDf["cvrNummer"],vaekstDf["aar"]
,F.split(F.regexp_extract("intervalKodeAntalAarsvaerk",r'(\d{1,4}_\d{1,4})',0),r"\_").alias("intervalKodeAntalAarsvaerk")
,F.split(F.regexp_extract("intervalKodeAntalAnsatte",r'(\d{1,4}_\d{1,4})',0),r"\_").alias("intervalKodeAntalAnsatte")
,F.split(F.regexp_extract("intervalKodeAntalInklusivEjere",r'(\d{1,4}_\d{1,4})',0),r"\_").alias("intervalKodeAntalInklusivEjere")
,vaekstDf["sidstOpdateret"]))
regexedVaekstDf.orderBy(["cvrNummer","aar"]).show(50)
+---------+----+--------------------------+------------------------+------------------------------+--------------------+
|cvrNummer| aar|intervalKodeAntalAarsvaerk|intervalKodeAntalAnsatte|intervalKodeAntalInklusivEjere| sidstOpdateret|
+---------+----+--------------------------+------------------------+------------------------------+--------------------+
| 10000009|1999| [1, 1]| [1, 1]| [1, 1]|2001-07-24T15:50:...|
| 10000025|2000| [1, 1]| [0, 0]| [0, 0]|2002-05-24T17:47:...|
| 10000025|2001| [1, 1]| [1, 1]| [1, 1]|2003-05-12T17:30:...|
| 10000025|2002| [1, 1]| [1, 1]| [1, 1]|2004-06-25T17:50:...|
| 10000025|2003| [1, 1]| [1, 1]| [1, 1]|2005-12-02T17:06:...|
| 10000025|2004| [1, 1]| [1, 1]| [1, 1]|2006-05-18T17:33:...|
| 10000025|2005| [1, 1]| [2, 4]| [2, 4]|2007-07-18T13:50:...|
| 10000025|2006| [1, 1]| [2, 4]| [2, 4]|2008-10-03T12:44:...|
| 10000025|2007| [1, 1]| [2, 4]| [2, 4]|2009-09-08T12:46:...|
| 10000025|2008| [1, 1]| [1, 1]| [1, 1]|2010-10-14T12:43:...|
| 10000025|2009| [1, 1]| [0, 0]| [0, 0]|2011-11-30T11:43:...|
| 10000106|1997| [1, 1]| [2, 4]| [2, 4]|1999-10-17T02:51:...|
| 10000106|1998| [1, 1]| [2, 4]| [2, 4]|2000-06-29T15:32:...|
| 10000106|1999| [2, 4]| [2, 4]| [2, 4]|2001-07-24T15:53:...|
| 10000106|2000| [1, 1]| [2, 4]| [2, 4]|2002-05-24T15:47:...|
| 10000106|2001| [2, 4]| [2, 4]| [2, 4]|2003-05-12T15:30:...|
| 10000106|2002| [2, 4]| [2, 4]| [2, 4]|2004-06-25T15:50:...|
| 10000106|2003| [2, 4]| [2, 4]| [2, 4]|2005-12-02T16:07:...|
| 10000106|2004| [2, 4]| [2, 4]| [2, 4]|2006-05-18T15:30:...|
| 10000106|2005| [2, 4]| [5, 9]| [5, 9]|2007-12-03T10:43:...|
| 10000106|2006| [2, 4]| [5, 9]| [5, 9]|2008-10-03T10:45:...|
| 10000122|1999| [1, 1]| [1, 1]| [1, 1]|2001-07-24T17:37:...|
| 10000122|2000| [2, 4]| [5, 9]| [5, 9]|2002-05-24T17:47:...|
| 10000122|2001| [1, 1]| [0, 0]| [0, 0]|2003-05-12T17:30:...|
| 10000173|2011| [0, 0]| [0, 0]| [1, 1]|2013-07-18T11:00:...|
| 10000211|1999| [1, 1]| [2, 4]| [2, 4]|2001-07-24T17:37:...|
| 10000211|2000| [1, 1]| [0, 0]| [0, 0]|2002-05-24T17:47:...|
| 10000254|1999| [1, 1]| [2, 4]| [2, 4]|2001-07-24T15:50:...|
| 10000254|2000| [20, 49]| [20, 49]| [20, 49]|2002-05-24T15:47:...|
| 10000254|2001| [20, 49]| [20, 49]| [20, 49]|2003-05-12T15:30:...|
| 10000254|2002| [20, 49]| [20, 49]| [20, 49]|2004-06-25T15:50:...|
| 10000254|2003| [20, 49]| [20, 49]| [20, 49]|2005-12-02T16:06:...|
| 10000254|2004| [20, 49]| [50, 99]| [50, 99]|2006-05-18T15:33:...|
| 10000254|2005| [20, 49]| [50, 99]| [50, 99]|2007-07-18T11:50:...|
| 10000254|2006| [20, 49]| [50, 99]| [50, 99]|2008-10-03T10:44:...|
| 10000254|2007| [20, 49]| [20, 49]| [20, 49]|2009-09-08T10:46:...|
| 10000254|2008| [50, 99]| [50, 99]| [50, 99]|2010-10-14T10:43:...|
| 10000254|2009| [20, 49]| [20, 49]| [20, 49]|2011-11-30T10:43:...|
| 10000262|2009| [1, 1]| [2, 4]| [2, 4]|2011-11-30T11:43:...|
| 10000270|2002| [0, 0]| [0, 0]| [1, 1]|2004-06-25T15:50:...|
| 10000270|2004| [0, 0]| [0, 0]| [1, 1]|2006-05-18T15:30:...|
| 10000297|1998| [0, 0]| [0, 0]| [1, 1]|2000-06-29T15:32:...|
| 10000297|1999| [0, 0]| [0, 0]| [1, 1]|2001-07-24T15:59:...|
| 10000297|2000| [0, 0]| [0, 0]| [1, 1]|2002-05-24T15:47:...|
| 10000297|2001| [0, 0]| [0, 0]| [1, 1]|2003-05-12T15:32:...|
| 10000297|2002| [0, 0]| [0, 0]| [1, 1]|2004-06-25T15:50:...|
| 10000297|2003| [0, 0]| [0, 0]| [1, 1]|2005-12-02T16:07:...|
| 10000297|2004| [0, 0]| [0, 0]| [1, 1]|2006-05-18T15:30:...|
| 10000297|2005| [0, 0]| [0, 0]| [1, 1]|2007-07-18T11:45:...|
| 10000297|2006| [0, 0]| [0, 0]| [1, 1]|2008-10-03T10:45:...|
+---------+----+--------------------------+------------------------+------------------------------+--------------------+
only showing top 50 rows
In [30]:
#Kvartalsverk as a tabel
maanedsVerkDf = createNextLayerTable(cvrDf,["cvrNummer"],"kvartalsbeskaeftigelse","virksomhed")
#maanedsVerkDf.show()
reMaanedsVerkDf = (maanedsVerkDf
.select(maanedsVerkDf["cvrNummer"],maanedsVerkDf["aar"],maanedsVerkDf["kvartal"]
,F.split(F.regexp_extract("intervalKodeAntalAarsvaerk",r'(\d{1,4}_\d{1,4})',0),r"\_").alias("intervalKodeAntalAarsvaerk")
,F.split(F.regexp_extract("intervalKodeAntalAnsatte",r'(\d{1,4}_\d{1,4})',0),r"\_").alias("intervalKodeAntalAnsatte")
,maanedsVerkDf["sidstOpdateret"]))
reMaanedsVerkDf.show()
print(reMaanedsVerkDf.dtypes)
+---------+----+-------+--------------------------+------------------------+--------------------+
|cvrNummer| aar|kvartal|intervalKodeAntalAarsvaerk|intervalKodeAntalAnsatte| sidstOpdateret|
+---------+----+-------+--------------------------+------------------------+--------------------+
| 27850367|2004| 3| null| [1, 1]|2005-04-19T17:38:...|
| 27850367|2004| 4| null| [1, 1]|2005-10-04T12:50:...|
| 27850367|2005| 1| null| [1, 1]|2005-10-04T17:32:...|
| 27850367|2005| 2| null| [1, 1]|2005-12-02T16:56:...|
| 27850367|2005| 3| null| [0, 0]|2006-04-11T17:32:...|
| 27850367|2005| 4| null| [2, 4]|2006-09-01T12:45:...|
| 27850367|2006| 1| null| [1, 1]|2007-01-11T16:33:...|
| 27850367|2006| 2| null| [1, 1]|2007-01-12T10:46:...|
| 27850367|2006| 3| null| [1, 1]|2007-07-18T14:12:...|
| 27850367|2006| 4| null| [0, 0]|2007-09-03T10:04:...|
| 27850367|2007| 1| null| [0, 0]|2007-09-03T11:14:...|
| 26452635|2002| 1| null| [0, 0]|2002-09-24T17:46:...|
| 26452635|2002| 2| null| [1, 1]|2002-12-05T16:46:...|
| 26452635|2002| 3| null| [1, 1]|2003-03-13T16:50:...|
| 26452635|2002| 4| null| [1, 1]|2003-05-21T18:19:...|
| 26452635|2003| 1| null| [1, 1]|2003-09-11T19:52:...|
| 26452635|2003| 2| null| [1, 1]|2003-12-15T16:43:...|
| 26452635|2004| 3| null| [1, 1]|2005-04-19T17:38:...|
| 26452635|2004| 4| null| [1, 1]|2005-10-04T12:49:...|
| 26452635|2005| 1| null| [1, 1]|2005-10-04T17:31:...|
+---------+----+-------+--------------------------+------------------------+--------------------+
only showing top 20 rows
[('cvrNummer', 'bigint'), ('aar', 'bigint'), ('kvartal', 'bigint'), ('intervalKodeAntalAarsvaerk', 'array<string>'), ('intervalKodeAntalAnsatte', 'array<string>'), ('sidstOpdateret', 'string')]
In [10]:
#Get the attributes with values and periods as a table
attributesDf = createNextLayerTable(cvrDf,["cvrNummer"],"attributter","virksomhed")
#attributesDf.show(10)
#print(attributesDf.schema["cvrNummer"])
#get column names
attributesCols = attributesDf.columns
attributesCols.remove("vaerdier")
attributesWithValueDf = createNextLayerTable(attributesDf,attributesCols,"vaerdier")
#attributesWithValueDf.show(10)
attributesWithValueAndPeriodDf = expandSubCols(attributesWithValueDf,"periode")
attributesWithValueAndPeriodDf.orderBy(["cvrNummer","type"],ascending=[1, 1]).show(100)
+---------+---------+--------------------+----------+--------------------+--------------------+----------+----------+
|cvrNummer|sekvensnr| type|vaerditype| sidstOpdateret| vaerdi| gyldigFra| gyldigTil|
+---------+---------+--------------------+----------+--------------------+--------------------+----------+----------+
| 10000009| 0|ARKIV_REGISTRERIN...| string|2015-02-09T20:00:...| ApS261654|1999-10-12|2001-12-11|
| 10000009| 0| FORMÅL| string|2015-02-09T20:00:...|Selskabets formål...|1999-10-12|2001-12-11|
| 10000009| 0|FØRSTE_REGNSKABSP...| date|2015-02-09T20:00:...| 2000-09-30|1999-10-12|2001-12-11|
| 10000009| 0|FØRSTE_REGNSKABSP...| date|2015-02-09T20:00:...| 1999-08-01|1999-10-12|2001-12-11|
| 10000009| 0| KAPITAL| decimal|2015-02-09T20:00:...| 125000.00|1999-10-12|2001-12-11|
| 10000009| 0| KAPITALVALUTA| string|2015-02-09T20:00:...| DKK|1999-10-12|2001-12-11|
| 10000009| 0| NAVN_IDENTITET| string|2015-02-09T20:00:...| YELLOW|1999-10-12|2001-12-11|
| 10000009| 0| PSEUDOCVRNR| boolean|2015-02-09T20:00:...| false|1999-10-12|2001-12-11|
| 10000009| 0| REGNSKABSÅR_SLUT| gMonthDay|2015-02-09T20:00:...| --09-30|1999-10-12|2001-12-11|
| 10000009| 0| REGNSKABSÅR_START| gMonthDay|2015-02-09T20:00:...| --10-01|1999-10-12|2001-12-11|
| 10000009| 0| TEGNINGSREGEL| string|2015-02-09T20:00:...|Selskabet tegnes ...|1999-10-12|2001-12-11|
| 10000009| 0| VEDTÆGT_SENESTE| date|2015-02-09T20:00:...| 1999-10-12|1999-10-12|2001-12-11|
| 10000025| 0|ARKIV_REGISTRERIN...| string|2015-02-10T00:00:...| ApS261655|1999-10-13| null|
| 10000025| 0|EJERREGISTRERING_...| boolean|2015-11-16T15:12:...| false|2015-11-16| null|
| 10000025| 0| FORMÅL| string|2015-02-10T00:00:...|Selskabets formål...|2013-05-31| null|
| 10000025| 0|FØRSTE_REGNSKABSP...| date|2015-02-10T00:00:...| 2000-12-31|1999-10-13| null|
| 10000025| 0|FØRSTE_REGNSKABSP...| date|2015-02-10T00:00:...| 1999-10-13|1999-10-13| null|
| 10000025| 0|GENOPTAGELSE_TVAN...| date|2015-02-10T00:00:...| 2012-05-31|2012-05-31| null|
| 10000025| 0| KAPITAL| decimal|2015-02-10T00:00:...| 125000.00|1999-10-13| null|
| 10000025| 0| KAPITALVALUTA| string|2015-02-10T00:00:...| DKK|1999-10-13| null|
| 10000025| 0| NAVN_IDENTITET| string|2015-02-10T00:00:...|WATERFRONT CONNEC...|1999-10-13| null|
| 10000025| 1| NAVN_IDENTITET| string|2015-02-10T00:00:...| GUITAR LOUNGE|1999-10-13| null|
| 10000025| 0| PSEUDOCVRNR| boolean|2015-02-10T00:00:...| false|1999-10-13| null|
| 10000025| 0| REGNSKABSÅR_SLUT| gMonthDay|2015-02-10T00:00:...| --12-31|1999-10-13| null|
| 10000025| 0| REGNSKABSÅR_START| gMonthDay|2015-02-10T00:00:...| --01-01|1999-10-13| null|
| 10000025| 0| REVISION_FRAVALGT| boolean|2015-02-10T00:00:...| true|1999-10-13| null|
| 10000025| 0| TEGNINGSREGEL| string|2015-02-10T00:00:...|Selskabets tegnes...|2013-05-31| null|
| 10000025| 0| VEDTÆGT_SENESTE| date|2015-02-10T00:00:...| 2013-05-31|2013-05-31| null|
| 10000068| 0|ARKIV_REGISTRERIN...| string|2015-02-09T21:00:...| FAS263254|1999-08-25|2001-12-20|
| 10000068| 0| FORMÅL| string|2015-02-09T21:00:...|Filialens formål ...|1999-08-25|2001-12-20|
| 10000068| 0| NAVN_IDENTITET| string|2015-02-09T21:00:...|STUDENTCONSULTING...|1999-10-18|2001-12-20|
| 10000068| 0| PSEUDOCVRNR| boolean|2015-02-09T21:00:...| false|1999-08-25|2001-12-20|
| 10000068| 0| TEGNINGSREGEL| string|2015-02-09T21:00:...|Filialen tegnes a...|1999-08-25|2001-12-20|
| 10000122| 0|ARKIV_REGISTRERIN...| string|2015-02-09T21:00:...| ApS261686|1999-10-14|2002-08-15|
| 10000122| 0| FORMÅL| string|2015-02-09T21:00:...|Selskabets formål...|2000-02-01|2002-08-15|
| 10000122| 0|FØRSTE_REGNSKABSP...| date|2015-02-09T21:00:...| 2000-09-30|1999-10-14|2002-08-15|
| 10000122| 0|FØRSTE_REGNSKABSP...| date|2015-02-09T21:00:...| 1999-10-14|1999-10-14|2002-08-15|
| 10000122| 0| KAPITAL| decimal|2015-02-09T21:00:...| 125000.00|1999-10-14|2002-08-15|
| 10000122| 0| KAPITALVALUTA| string|2015-02-09T21:00:...| DKK|1999-10-14|2002-08-15|
| 10000122| 0| NAVN_IDENTITET| string|2015-02-09T21:00:...| DIGITAL CENTER FYN|1999-10-14|2002-08-15|
| 10000122| 0| PSEUDOCVRNR| boolean|2015-02-09T21:00:...| false|1999-10-14|2002-08-15|
| 10000122| 0| REGNSKABSÅR_SLUT| gMonthDay|2015-02-09T21:00:...| --09-30|1999-10-14|2002-08-15|
| 10000122| 0| REGNSKABSÅR_START| gMonthDay|2015-02-09T21:00:...| --10-01|1999-10-14|2002-08-15|
| 10000122| 0| TEGNINGSREGEL| string|2015-02-09T21:00:...|Selskabet tegnes ...|2000-02-01|2002-08-15|
| 10000122| 0| VEDTÆGT_SENESTE| date|2015-02-09T21:00:...| 2000-02-01|2000-02-01|2002-08-15|
| 10000157| 0|ARKIV_REGISTRERIN...| string|2015-02-10T00:00:...| ApS262280|1999-10-14| null|
| 10000157| 0| FORMÅL| string|2015-02-10T00:00:...|Selskabets formål...|1999-11-04| null|
| 10000157| 0|FØRSTE_REGNSKABSP...| date|2015-02-10T00:00:...| 2000-12-31|1999-10-14| null|
| 10000157| 0|FØRSTE_REGNSKABSP...| date|2015-02-10T00:00:...| 1999-10-14|1999-10-14| null|
| 10000157| 0| KAPITAL| decimal|2015-02-10T00:00:...| 125000.00|1999-11-04| null|
| 10000157| 0| KAPITALVALUTA| string|2015-02-10T00:00:...| DKK|1999-11-04| null|
| 10000157| 0| NAVN_IDENTITET| string|2015-02-10T00:00:...| WITT INVEST|1999-11-04| null|
| 10000157| 0| NAVN_IDENTITET| string|2015-02-10T00:00:...| WITT HOLDING|1999-10-14|1999-11-03|
| 10000157| 0| PSEUDOCVRNR| boolean|2015-02-10T00:00:...| false|1999-10-14| null|
| 10000157| 0| REGNSKABSÅR_SLUT| gMonthDay|2015-02-10T00:00:...| --12-31|1999-10-14| null|
| 10000157| 0| REGNSKABSÅR_START| gMonthDay|2015-02-10T00:00:...| --01-01|1999-10-14| null|
| 10000157| 0| REVISION_FRAVALGT| boolean|2015-03-12T00:00:...| false|1999-10-14| null|
| 10000157| 0| TEGNINGSREGEL| string|2015-02-10T00:00:...|Selskabet tegnes ...|1999-11-04| null|
| 10000157| 0| VEDTÆGT_SENESTE| date|2015-02-10T00:00:...| 1999-11-04|1999-11-04| null|
| 10000165| 0|ARKIV_REGISTRERIN...| string|2015-02-09T21:00:...| A/S261921|1999-10-13|2006-05-18|
| 10000165| 0| FORMÅL| string|2015-02-09T21:00:...|Selskabets formål...|2000-03-24|2006-05-18|
| 10000165| 0|FØRSTE_REGNSKABSP...| date|2015-02-09T21:00:...| 2001-03-31|1999-10-13|2006-05-18|
| 10000165| 0|FØRSTE_REGNSKABSP...| date|2015-02-09T21:00:...| 1999-10-13|1999-10-13|2006-05-18|
| 10000165| 0| KAPITAL| decimal|2015-02-09T21:00:...| 500000.00|1999-10-13|2006-05-18|
| 10000165| 0| KAPITALVALUTA| string|2015-02-09T21:00:...| DKK|1999-10-13|2006-05-18|
| 10000165| 0| NAVN_IDENTITET| string|2015-02-09T21:00:...| E.K. JORD|1999-10-13|2006-05-18|
| 10000165| 0| PSEUDOCVRNR| boolean|2015-02-09T21:00:...| false|1999-10-13|2006-05-18|
| 10000165| 0| REGNSKABSÅR_SLUT| gMonthDay|2015-02-09T21:00:...| --03-31|1999-10-13|2006-05-18|
| 10000165| 0| REGNSKABSÅR_START| gMonthDay|2015-02-09T21:00:...| --04-01|1999-10-13|2006-05-18|
| 10000165| 0| TEGNINGSREGEL| string|2015-02-09T21:00:...|Selskabet tegnes ...|2000-03-24|2006-05-18|
| 10000165| 0| VEDTÆGT_SENESTE| date|2015-02-09T21:00:...| 2000-03-24|2000-03-24|2006-05-18|
| 10000211| 0|ARKIV_REGISTRERIN...| string|2015-02-10T00:00:...| A/S261735|1999-09-29| null|
| 10000211| 0| FORMÅL| string|2015-02-10T00:00:...|Selskabets formål...|2010-05-28| null|
| 10000211| 0|FØRSTE_REGNSKABSP...| date|2015-02-10T00:00:...| 2000-12-31|1999-09-29| null|
| 10000211| 0|FØRSTE_REGNSKABSP...| date|2015-02-10T00:00:...| 1999-09-29|1999-09-29| null|
| 10000211| 0| KAPITAL| decimal|2015-02-10T00:00:...| 500000.00|1999-09-29| null|
| 10000211| 0| KAPITALVALUTA| string|2015-02-10T00:00:...| DKK|1999-09-29| null|
| 10000211| 0| NAVN_IDENTITET| string|2015-02-10T00:00:...|UGEAVISEN FOR BIL...|1999-09-29|2001-08-08|
| 10000211| 0| NAVN_IDENTITET| string|2015-02-10T00:00:...|KOMPLEMENTARSELSK...|2001-08-09| null|
| 10000211| 0| PSEUDOCVRNR| boolean|2015-02-10T00:00:...| false|1999-09-29| null|
| 10000211| 0| REGNSKABSÅR_SLUT| gMonthDay|2015-02-10T00:00:...| --12-31|1999-09-29| null|
| 10000211| 0| REGNSKABSÅR_START| gMonthDay|2015-02-10T00:00:...| --01-01|1999-09-29| null|
| 10000211| 0| REVISION_FRAVALGT| boolean|2015-03-12T00:00:...| false|1999-09-29| null|
| 10000211| 0| TEGNINGSREGEL| string|2015-02-10T00:00:...|Selskabet tegnes ...|2010-05-28| null|
| 10000211| 0| VEDTÆGT_SENESTE| date|2015-02-10T00:00:...| 2010-05-28|2010-05-28| null|
| 10000238| 0|ARKIV_REGISTRERIN...| string|2015-02-09T21:00:...| A/S110444|1985-11-15|1989-07-05|
| 10000238| 0| FORMÅL| string|2015-02-09T21:00:...|Selskabets formål...|1985-11-15|1989-07-05|
| 10000238| 0|FØRSTE_REGNSKABSP...| date|2015-02-09T21:00:...| 1986-04-30|1985-11-15|1989-07-05|
| 10000238| 0|FØRSTE_REGNSKABSP...| date|2015-02-09T21:00:...| 1985-11-15|1985-11-15|1989-07-05|
| 10000238| 0| KAPITAL| decimal|2015-02-09T21:00:...| 300000.00|1987-08-30|1989-07-05|
| 10000238| 0| KAPITALVALUTA| string|2015-02-09T21:00:...| DKK|1987-08-30|1989-07-05|
| 10000238| 0| NAVN_IDENTITET| string|2015-02-09T21:00:...|SANDERS KØKKENGÅR...|1985-11-15|1989-07-11|
| 10000238| 0| PSEUDOCVRNR| boolean|2015-02-09T21:00:...| true|1985-11-15|1989-07-05|
| 10000238| 0| REGNSKABSÅR_SLUT| gMonthDay|2015-02-09T21:00:...| --04-30|1985-11-15|1989-07-05|
| 10000238| 0| REGNSKABSÅR_START| gMonthDay|2015-02-09T21:00:...| --05-01|1985-11-15|1989-07-05|
| 10000238| 0| TEGNINGSREGEL| string|2015-02-09T21:00:...|Selskabet tegnes ...|1985-11-15|1989-07-05|
| 10000238| 0| VEDTÆGT_SENESTE| date|2015-02-09T21:00:...| 1985-11-15|1985-11-15|1989-07-05|
| 10000254| 0|ARKIV_REGISTRERIN...| string|2015-02-10T00:00:...| A/S261668|1999-10-12| null|
| 10000254| 0| FORMÅL| string|2015-02-10T00:00:...|Selskabets formål...|2010-06-02| null|
| 10000254| 0|FØRSTE_REGNSKABSP...| date|2015-02-10T00:00:...| 2000-12-31|1999-10-12| null|
+---------+---------+--------------------+----------+--------------------+--------------------+----------+----------+
only showing top 100 rows
In [11]:
showAttributes = attributesWithValueAndPeriodDf.groupBy(["type"]).count().orderBy("count").collect()
for v in showAttributes:
print("type: {} count: {}".format(v[0],v[1]))
type: SØV_IEF_TILSYN_TEKST count: 2
type: SØV_IEF_TILSYN count: 13
type: REGNSKABSÅR_FRITEKST count: 55
type: MYNDIGHED_ANDEN count: 99
type: STIFTET_FØR_1900 count: 144
type: SOCIAL_ØKONOMISK_VIRKSOMHED count: 193
type: OFFENTLIG_EJERBOG count: 275
type: KONCESSIONSDATO count: 351
type: BØRSNOTERET count: 355
type: STADFÆSTELSESDATO count: 501
type: STADFÆSTET_AF count: 501
type: FINANSIEL_DELTYPE count: 755
type: TILSYN_KATEGORI count: 785
type: TILLADELSESDATO_FONDSMYNDIGHED count: 1963
type: FINANSIELT_FORMÅL count: 1976
type: REGISTRERING_LIKVIDATION count: 3264
type: OPLØSNINGSTRUSSEL_SENESTE count: 4265
type: OMFATTET_AF_LOV_OM_HVIDVASK_OG_TERRORFINANSIERING count: 8648
type: GENOPTAGELSE_TVANGSOPLØSNING count: 13496
type: STATSLIG_VIRK count: 39185
type: KAPITAL_DELVIST count: 39393
type: OMLÆGNINGSPERIODE_START count: 47807
type: OMLÆGNINGSPERIODE_SLUT count: 47810
type: KAPITALKLASSER count: 62855
type: EJERREGISTRERING_UNDER_5_PROCENT count: 110211
type: REVISION_FRAVALGT count: 307594
type: FØRSTE_REGNSKABSPERIODE_START count: 488616
type: FØRSTE_REGNSKABSPERIODE_SLUT count: 488820
type: ARKIV_REGISTRERINGSNUMMER count: 504468
type: REGNSKABSÅR_START count: 516886
type: REGNSKABSÅR_SLUT count: 516888
type: PSEUDOCVRNR count: 521839
type: FORMÅL count: 524569
type: TEGNINGSREGEL count: 531251
type: VEDTÆGT_SENESTE count: 542108
type: KAPITALVALUTA count: 688207
type: KAPITAL count: 696457
type: NAVN_IDENTITET count: 990311
In [11]:
#udtrækning af kapital, gyldighedsdatoer og cvr
valuesFromAttributesDf = (attributesWithValueAndPeriodDf
.filter(F.col("type") == "KAPITAL")
.select("cvrNummer","vaerdi","gyldigFra",F.coalesce("gyldigTil",F.lit(F.current_date())).alias("gyldigTil"))
.withColumn(colName="varighed",col=F.datediff(end="gyldigTil",start="gyldigFra")))
In [13]:
print(valuesFromAttributesDf.select("cvrNummer").distinct().count())
499698
In [12]:
valuesFromAttributesDf.show(20)
+---------+----------+----------+----------+--------+
|cvrNummer| vaerdi| gyldigFra| gyldigTil|varighed|
+---------+----------+----------+----------+--------+
| 26452635| 170011.00|2003-02-24|2010-01-21| 2523|
| 26452635| 140080.00|2002-01-08|2002-06-20| 163|
| 26452635| 145000.00|2002-06-21|2002-09-01| 72|
| 26452635| 155900.00|2002-09-02|2003-02-23| 174|
| 15157984| 80000.00|1991-05-08|1995-09-27| 1603|
| 15111038|1000000.00|1997-11-06|1998-06-30| 236|
| 15111038| 300000.00|1991-04-04|1991-10-08| 187|
| 15111038|7000000.00|1991-10-09|1997-11-05| 2219|
| 14731644| 300000.00|1990-11-20|1993-08-04| 988|
| 14728147| 500000.00|1992-06-30|1996-02-07| 1317|
| 14728147| 300000.00|1990-11-20|1992-06-29| 587|
| 15500387| 80000.00|1991-09-02|1995-01-10| 1226|
| 15501332| 80000.00|1991-09-02|1995-11-22| 1542|
| 15500972| 500000.00|1992-01-08|1995-02-22| 1141|
| 15500972| 300000.00|1991-09-02|1992-01-07| 127|
| 15276703| 300000.00|1991-07-05|1993-11-12| 861|
| 13917698| 325000.00|1991-10-23|1998-10-13| 2547|
| 13917698| 80000.00|1990-02-19|1991-08-08| 535|
| 13917698| 300000.00|1991-08-09|1991-10-22| 74|
| 12073097| 560000.00|1988-09-08|1994-02-18| 1989|
+---------+----------+----------+----------+--------+
only showing top 20 rows
In [13]:
#compute cummulative difference for each cvrnummer between vaerdi
windowSpec = (Window
.partitionBy(valuesFromAttributesDf["cvrNummer"])
.orderBy(valuesFromAttributesDf["gyldigFra"])
)
windowSpec.rowsBetween(-1,0)
valuesAndCumDifDf = (valuesFromAttributesDf
.withColumn(colName="CumlativeDiff",col=F.col("vaerdi")-F.lag("vaerdi").over(windowSpec))).cache()
In [16]:
valuesAndCumDifDf.show(250)
+---------+------------+----------+----------+--------+-------------+
|cvrNummer| vaerdi| gyldigFra| gyldigTil|varighed|CumlativeDiff|
+---------+------------+----------+----------+--------+-------------+
| 10017025| 500000.00|2000-02-01|2002-05-02| 821| null|
| 10019052| 125000.00|2000-02-23|2002-05-27| 824| null|
| 10026113| 125000.00|2000-06-23|2004-09-27| 1557| null|
| 10027926| 600000.00|2000-06-08|2001-01-15| 221| null|
| 10029325|100000000.00|2000-06-07|2000-12-17| 193| null|
| 10029325|105000000.00|2000-12-18|2007-12-19| 2557| 5000000.0|
| 10039983| 80000.00|1987-08-30|1993-03-24| 2033| null|
| 10040523| 125000.00|2000-07-01|2001-05-20| 323| null|
| 10040523| 500000.00|2001-05-21|2005-10-14| 1607| 375000.0|
| 10040523| 500000.00|2005-10-15|2006-06-29| 257| 0.0|
| 10040523| 1000000.00|2006-06-30|2013-04-04| 2470| 500000.0|
| 10056535| 80000.00|1987-08-30|1992-08-07| 1804| null|
| 10057426| 600000.00|2000-12-20|2011-07-31| 3875| null|
| 10057426| 646154.00|2011-08-01|2017-01-05| 1984| 46154.0|
| 10063558| 125000.00|2000-12-29|2008-08-19| 2790| null|
| 10065798| 80000.00|1987-08-30|1996-02-25| 3101| null|
| 10065798| 125000.00|1996-02-26|2002-01-08| 2143| 45000.0|
| 10071526| 125000.00|2001-05-10|2001-09-26| 139| null|
| 10071526| 250000.00|2001-09-27|2002-02-05| 131| 125000.0|
| 10071526| 125000.00|2002-02-06|2017-01-05| 5447| -125000.0|
| 10079322| 125000.00|2001-12-05|2012-05-10| 3809| null|
| 10079322| 2000000.00|2012-05-11|2017-01-05| 1700| 1875000.0|
| 10081653| 125000.00|2002-05-07|2017-01-05| 5357| null|
| 10089514| 149682.00|2002-06-28|2017-01-05| 5305| null|
| 10090458| 500000.00|2002-07-02|2014-01-01| 4201| null|
| 10091519| 125000.00|2002-06-28|2017-01-05| 5305| null|
| 10091713| 500000.00|2002-07-01|2005-02-23| 968| null|
| 10102863| 125000.00|2003-01-08|2017-01-05| 5111| null|
| 10108624| 125000.00|2003-01-01|2015-02-06| 4419| null|
| 10114411| 320000.00|1987-08-30|1996-12-19| 3399| null|
| 10114411| 500000.00|1996-12-20|2007-02-12| 3706| 180000.0|
| 10123127| 125000.00|2002-12-09|2017-01-05| 5141| null|
| 10127351| 1000000.00|2002-12-10|2017-01-05| 5140| null|
| 10128587| 80000.00|1987-08-30|1996-12-22| 3402| null|
| 10128587| 125000.00|1996-12-23|2005-03-29| 3018| 45000.0|
| 10128587| 650000.00|2005-03-30|2006-11-22| 602| 525000.0|
| 10129486| 80000.00|1987-08-30|1997-03-04| 3474| null|
| 10139767| 125000.00|2003-06-25|2004-11-18| 512| null|
| 10139767| 125000.00|2004-11-19|2008-08-05| 1355| 0.0|
| 10145619| 125000.00|2003-07-24|2017-01-05| 4914| null|
| 10150078| 90000.00|1987-08-30|1990-09-04| 1101| null|
| 10150124| 125000.00|2003-09-08|2013-10-10| 3685| null|
| 10156521| 125000.00|2003-12-17|2013-04-01| 3393| null|
| 10200288| 300000.00|1987-08-30|1996-12-03| 3383| null|
| 10200288| 500000.00|1996-12-04|2015-01-22| 6623| 200000.0|
| 10212782| 80000.00|1987-08-30|1997-02-24| 3466| null|
| 10212782| 125000.00|1997-02-25|1997-11-04| 252| 45000.0|
| 10219698| 19000.00|1987-03-12|1987-08-29| 170| null|
| 10219698| 99000.00|1987-08-30|1993-09-02| 2195| 80000.0|
| 10240972| 90000.00|1987-08-30|1987-08-17| -13| null|
| 10269636| 350000.00|1987-08-30|1990-07-03| 1038| null|
| 10279836| 80000.00|1987-08-30|1995-11-27| 3011| null|
| 10282578| 80000.00|1987-08-30|1988-09-20| 387| null|
| 10282578| 80000.00|1988-09-21|1994-02-03| 1961| 0.0|
| 10282578| 200000.00|1994-02-04|2004-07-08| 3807| 120000.0|
| 10293537| 80000.00|1987-08-30|2000-01-10| 4516| null|
| 10311055| 30000.00|1987-08-30|1993-06-15| 2116| null|
| 10315107| 80000.00|1987-08-30|1991-01-08| 1227| null|
| 10315107| 300000.00|1991-01-09|1998-01-05| 2553| 220000.0|
| 10324602| 80000.00|1987-08-30|1991-01-31| 1250| null|
| 10325978| 300000.00|1987-08-30|1990-08-07| 1073| null|
| 10329744| 80000.00|1987-08-30|1992-08-20| 1817| null|
| 10329744| 200000.00|1992-08-21|1998-06-22| 2131| 120000.0|
| 10351235| 3000000.00|1987-08-30|2017-01-05| 10721| null|
| 10366380| 300000.00|1987-08-30|1996-06-27| 3224| null|
| 10366380| 500000.00|1996-06-28|2004-07-30| 2954| 200000.0|
| 10368901| 1200000.00|1987-08-20|1987-08-29| 9| null|
| 10368901| 1500000.00|1987-08-30|1993-01-20| 1970| 300000.0|
| 10368901| 1600000.00|1993-01-21|1994-02-01| 376| 100000.0|
| 10368901| 26600000.00|1994-02-02|1997-06-26| 1240| 2.5E7|
| 10397707| 80000.00|1987-08-30|1991-01-22| 1241| null|
| 10412986| 200000.00|1987-08-30|1989-10-08| 770| null|
| 10412986| 80000.00|1989-10-09|1995-10-29| 2211| -120000.0|
| 10412986| 800000.00|1995-10-30|2005-08-10| 3572| 720000.0|
| 10417244| 300000.00|1987-08-30|1996-11-14| 3364| null|
| 10423872| 80000.00|1987-08-30|1989-10-13| 775| null|
| 10427088| 80000.00|1987-08-30|1993-10-05| 2228| null|
| 10427088| 200000.00|1993-10-06|2002-11-28| 3340| 120000.0|
| 10434408| 80000.00|1987-08-30|1993-01-28| 1978| null|
| 10471974| 80000.00|1987-08-30|1996-11-26| 3376| null|
| 10478375| 80000.00|1987-08-30|1987-10-25| 56| null|
| 10478375| 600000.00|1987-10-26|1996-09-19| 3251| 520000.0|
| 10478375| 400000.00|1996-09-20|1999-07-27| 1040| -200000.0|
| 10480493| 80000.00|1987-08-30|1989-03-29| 577| null|
| 10481171| 80000.00|1987-08-30|1996-08-12| 3270| null|
| 10481171| 125000.00|1996-08-13|1999-01-12| 882| 45000.0|
| 10481171| 125000.00|1999-01-13|2005-04-19| 2288| 0.0|
| 10500907| 300000.00|1987-08-30|1988-12-26| 484| null|
| 10500907| 1000000.00|1988-12-27|1992-09-14| 1357| 700000.0|
| 10509246| 4700000.00|1987-08-12|1987-08-29| 17| null|
| 10509246| 5000000.00|1987-08-30|1990-11-22| 1180| 300000.0|
| 10509246| 6500000.00|1990-11-23|1994-12-14| 1482| 1500000.0|
| 10517575| 80000.00|1987-08-30|1994-12-11| 2660| null|
| 10517575| 200000.00|1994-12-12|2017-01-05| 8060| 120000.0|
| 10522749| 80000.00|1987-08-30|1996-12-26| 3406| null|
| 10522749| 200000.00|1996-12-27|2017-01-05| 7314| 120000.0|
| 10563585| 80000.00|1987-08-30|1993-01-13| 1963| null|
| 10576695| 80000.00|1987-08-30|1989-10-12| 774| null|
| 10577675| 80000.00|1987-08-30|1989-02-08| 528| null|
| 10577675| 160000.00|1989-02-09|1993-02-01| 1453| 80000.0|
| 10577675| 160000.00|1993-02-02|1995-12-17| 1048| 0.0|
| 10577675| 200000.00|1995-12-18|2004-04-01| 3027| 40000.0|
| 10597390| 80000.00|1987-08-30|1995-07-18| 2879| null|
| 10597390| 200000.00|1995-07-19|2017-01-05| 7841| 120000.0|
| 10606683| 80000.00|1987-08-30|1990-12-11| 1199| null|
| 10606683| 500000.00|1990-12-12|1994-11-15| 1434| 420000.0|
| 10606683| 1500000.00|1994-11-16|1995-07-13| 239| 1000000.0|
| 10606683| 2250000.00|1995-07-14|2017-01-05| 7846| 750000.0|
| 10609747| 2520000.00|1987-05-13|1987-08-29| 108| null|
| 10609747| 2600000.00|1987-08-30|1994-12-08| 2657| 80000.0|
| 10609747| 1100000.00|1994-12-09|1996-02-26| 444| -1500000.0|
| 10613779| 300000.00|1987-08-30|1992-01-05| 1589| null|
| 10613779| 300000.00|1992-01-06|1993-01-20| 380| 0.0|
| 10613779| 1000000.00|1993-01-21|1994-01-31| 375| 700000.0|
| 10613779| 1100000.00|1994-02-01|2000-12-28| 2522| 100000.0|
| 10613779| 1200000.00|2000-12-29|2002-06-25| 543| 100000.0|
| 10613779| 1300000.00|2002-06-26|2017-01-05| 5307| 100000.0|
| 10615445| 80000.00|1987-08-30|1996-12-02| 3382| null|
| 10615445| 200000.00|1996-12-03|1998-09-16| 652| 120000.0|
| 10629098| 185000.00|1987-03-30|2017-01-05| 10874| null|
| 10635691| 300000.00|1987-03-02|1996-07-11| 3419| null|
| 10635691| 500000.00|1996-07-12|2017-01-05| 7482| 200000.0|
| 10649897| 80000.00|1987-08-30|1995-10-08| 2961| null|
| 10649897| 200000.00|1995-10-09|1998-09-16| 1073| 120000.0|
| 10649897| 400000.00|1998-09-17|2013-09-30| 5492| 200000.0|
| 10657288| 300000.00|1987-08-30|1991-06-28| 1398| null|
| 10661846| 80000.00|1987-08-30|1993-11-11| 2265| null|
| 10663075| 80000.00|1987-08-30|1995-12-14| 3028| null|
| 10664977| 80000.00|1987-08-30|1998-04-30| 3896| null|
| 10664977| 125000.00|1998-05-01|2005-01-14| 2450| 45000.0|
| 10677238| 80000.00|1987-08-30|1991-05-27| 1366| null|
| 10711479| 80000.00|1987-08-30|1996-11-25| 3375| null|
| 10711479| 200000.00|1996-11-26|2003-04-23| 2339| 120000.0|
| 10725984| 80000.00|1987-08-30|1994-09-29| 2587| null|
| 10738601| 80000.00|1987-09-01|1991-06-18| 1386| null|
| 10739276| 1000000.00|1987-08-30|1999-07-05| 4327| null|
| 10785383| 80000.00|1987-08-30|1994-04-07| 2412| null|
| 10788099| 80000.00|1987-08-30|1993-08-10| 2172| null|
| 10793246| 300000.00|1987-08-30|1993-06-08| 2109| null|
| 10809673| 80000.00|1987-08-30|1997-02-24| 3466| null|
| 10829399| 80000.00|1987-09-03|1996-10-24| 3339| null|
| 10829399| 200000.00|1996-10-25|2006-02-02| 3387| 120000.0|
| 10829399| 1000000.00|2006-02-03|2010-12-22| 1783| 800000.0|
| 10837308| 80000.00|1987-08-30|1996-12-08| 3388| null|
| 10837308| 125000.00|1996-12-09|2017-01-05| 7332| 45000.0|
| 10901545| 80000.00|1987-08-30|1989-04-04| 583| null|
| 10909295| 100000.00|1987-10-29|1995-03-22| 2701| null|
| 10909295| 200000.00|1995-03-23|2017-01-05| 7959| 100000.0|
| 10964709| 80000.00|1987-09-17|1990-02-23| 890| null|
| 11007805| 300000.00|1987-08-31|1997-01-16| 3426| null|
| 11007805| 500000.00|1997-01-17|2005-01-30| 2935| 200000.0|
| 11007805| 510000.00|2005-01-31|2006-09-27| 604| 10000.0|
| 11049702| 80000.00|1987-08-30|1997-10-29| 3713| null|
| 11049702| 500000.00|1997-10-30|2006-04-03| 3077| 420000.0|
| 11050808| 80000.00|1987-08-28|1996-12-22| 3404| null|
| 11050808| 200000.00|1996-12-23|2017-01-05| 7318| 120000.0|
| 11156177| 300000.00|1987-08-30|1988-11-24| 452| null|
| 11156177| 10000000.00|1988-11-25|1989-05-15| 171| 9700000.0|
| 11156177|100000000.00|1989-05-16|1995-09-13| 2311| 9.0E7|
| 11160131| 80000.00|1987-10-01|1990-05-18| 960| null|
| 11204708| 80000.00|1987-10-12|1991-01-11| 1187| null|
| 11214185| 80000.00|1987-10-06|1996-03-22| 3090| null|
| 11243800| 80000.00|1987-10-06|1993-03-31| 2003| null|
| 11320902| 80000.00|1987-10-02|1993-12-14| 2265| null|
| 11320902| 1000000.00|1993-12-15|1995-08-30| 623| 920000.0|
| 11388086| 80000.00|1987-09-07|1992-06-29| 1757| null|
| 11508294| 80000.00|1987-11-03|1990-01-28| 817| null|
| 11508294| 115000.00|1990-01-29|1995-09-06| 2046| 35000.0|
| 11508294| 200000.00|1995-09-07|1997-02-19| 531| 85000.0|
| 11518273| 80000.00|1987-11-04|1995-11-07| 2925| null|
| 11518273| 500000.00|1995-11-08|2004-09-12| 3231| 420000.0|
| 11518273| 750000.00|2004-09-13|2012-04-19| 2775| 250000.0|
| 11534104| 80000.00|1987-11-04|1996-10-01| 3254| null|
| 11534104| 125000.00|1996-10-02|2001-12-05| 1890| 45000.0|
| 11543308| 30000.00|1987-11-09|1991-07-30| 1359| null|
| 11543308| 90000.00|1991-07-31|1994-03-23| 966| 60000.0|
| 11545637| 80000.00|1987-11-04|1998-03-04| 3773| null|
| 11547184| 80000.00|1987-11-04|1997-01-22| 3367| null|
| 11547184| 125000.00|1997-01-23|2017-01-05| 7287| 45000.0|
| 11559948| 300000.00|1987-11-02|1996-06-27| 3160| null|
| 11559948| 500000.00|1996-06-28|1999-11-09| 1229| 200000.0|
| 11734294| 80000.00|1987-11-25|1991-01-23| 1155| null|
| 11759149| 300000.00|1987-12-02|1988-07-27| 238| null|
| 11759149| 500000.00|1988-07-28|1996-11-07| 3024| 200000.0|
| 11759149| 500000.00|1996-11-08|2017-01-05| 7363| 0.0|
| 11761747| 300000.00|1987-12-17|1998-02-09| 3707| null|
| 11784631| 80000.00|1988-01-11|1993-02-08| 1855| null|
| 11784631| 200000.00|1993-02-09|1999-02-25| 2207| 120000.0|
| 11832172| 80000.00|1988-01-15|1996-11-27| 3239| null|
| 11832172| 125000.00|1996-11-28|2017-01-05| 7343| 45000.0|
| 11872190| 300000.00|1988-02-24|1992-12-08| 1749| null|
| 11872190| 500000.00|1992-12-09|1999-11-30| 2547| 200000.0|
| 11888739| 80000.00|1988-01-27|1995-10-26| 2829| null|
| 11931332| 80000.00|1988-03-08|1997-10-17| 3510| null|
| 11937179| 80000.00|1988-02-10|1997-11-13| 3564| null|
| 11937179| 125000.00|1997-11-14|2017-01-05| 6992| 45000.0|
| 11967876| 80000.00|1988-02-11|1990-10-17| 979| null|
| 11968155| 125000.00|2001-10-26|2014-06-22| 4622| null|
| 11968155| 600000.00|2014-06-23|2017-01-05| 927| 475000.0|
| 11974937| 80000.00|1988-03-14|1992-02-28| 1446| null|
| 11975380| 80000.00|1988-03-14|1988-06-28| 106| null|
| 11975380| 300000.00|1988-06-29|1989-10-11| 469| 220000.0|
| 11975380| 1300000.00|1989-10-12|1998-01-08| 3010| 1000000.0|
| 11981038| 80000.00|1988-03-14|1994-08-25| 2355| null|
| 11991777| 300000.00|1988-03-28|1997-03-14| 3273| null|
| 12009690| 80000.00|1988-03-14|1994-03-28| 2205| null|
| 12009690| 200000.00|1994-03-29|2010-08-09| 5977| 120000.0|
| 12009690| 250000.00|2010-08-10|2017-01-05| 2340| 50000.0|
| 12029187| 80000.00|1988-04-06|1996-12-22| 3182| null|
| 12029187| 200000.00|1996-12-23|2017-01-05| 7318| 120000.0|
| 12031491| 80000.00|1988-04-11|1990-01-24| 653| null|
| 12031491| 300000.00|1990-01-25|1991-11-27| 671| 220000.0|
| 12031491| 500000.00|1991-11-28|1997-10-22| 2155| 200000.0|
| 12049196| 90000.00|1988-06-23|1991-04-11| 1022| null|
| 12056990| 80000.00|1988-04-08|2002-05-23| 5158| null|
| 12080417| 150000.00|2003-03-12|2003-09-08| 180| null|
| 12080417| 200000.00|2003-09-09|2010-07-30| 2516| 50000.0|
| 12103387| 80000.00|1988-05-11|1991-02-27| 1022| null|
| 12108281| 80000.00|1988-05-18|1989-03-13| 299| null|
| 12108281| 300000.00|1989-03-14|1991-10-08| 938| 220000.0|
| 12108281| 300000.00|1991-10-09|1993-12-17| 800| 0.0|
| 12120184| 80000.00|1988-05-11|1992-07-29| 1540| null|
| 12139640| 80000.00|1987-08-30|1994-08-03| 2530| null|
| 12143737| 80000.00|1988-06-01|1997-01-07| 3142| null|
| 12143737| 125000.00|1997-01-08|2002-09-15| 2076| 45000.0|
| 12143737| 1000000.00|2002-09-16|2011-03-01| 3088| 875000.0|
| 12144040| 80000.00|1988-05-31|1997-10-16| 3425| null|
| 12155816| 125000.00|2004-01-26|2017-01-05| 4728| null|
| 12184573| 80000.00|1988-06-07|1992-10-02| 1578| null|
| 12211937| 80000.00|1988-06-28|1995-10-09| 2659| null|
| 12211937| 200000.00|1995-10-10|2009-05-12| 4963| 120000.0|
| 12235437| 80000.00|1988-05-11|1996-11-26| 3121| null|
| 12246374| 80000.00|1988-07-04|1993-11-04| 1949| null|
| 12246374| 200000.00|1993-11-05|2005-02-14| 4119| 120000.0|
| 12246609| 80000.00|1988-07-04|1993-12-09| 1984| null|
| 12246609| 200000.00|1993-12-10|1994-09-28| 292| 120000.0|
| 12246609| 1200000.00|1994-09-29|2001-05-28| 2433| 1000000.0|
| 12294948| 80000.00|1988-07-25|1990-12-10| 868| null|
| 12294948| 80000.00|1990-12-11|1994-04-08| 1214| 0.0|
| 12317743| 80000.00|1988-08-01|1990-05-02| 639| null|
| 12317743| 84000.00|1990-05-03|1993-10-07| 1253| 4000.0|
| 12317743| 504000.00|1993-10-08|1995-11-13| 766| 420000.0|
| 12339283| 80000.00|1988-08-01|1991-06-26| 1059| null|
| 12352700| 80000.00|1988-08-11|1996-01-31| 2729| null|
| 12387997| 80000.00|1988-08-01|1991-01-07| 889| null|
| 12406657| 125000.00|2007-08-23|2017-01-05| 3423| null|
| 12407718| 200000.00|1997-01-29|2017-01-05| 7281| null|
| 12427190| 100000.00|1988-10-12|1998-02-12| 3410| null|
| 12427190| 125000.00|1998-02-13|2017-01-05| 6901| 25000.0|
| 12428405| 90000.00|1988-12-21|1994-12-20| 2190| null|
+---------+------------+----------+----------+--------+-------------+
only showing top 250 rows
In [18]:
#check who is væksting in year x
vaekst2015Df = (valuesAndCumDifDf
.where(condition=F.year("gyldigFra") == 2015)
.na.drop(subset="CumlativeDiff")
.orderBy(["CumlativeDiff","cvrNummer"],ascending=[0,0]))
vaekst2015Df.show()
+---------+-------------+----------+----------+--------+--------------+
|cvrNummer| vaerdi| gyldigFra| gyldigTil|varighed| CumlativeDiff|
+---------+-------------+----------+----------+--------+--------------+
| 10519608|8045000000.00|2015-02-05|2017-01-09| 704| 2.0E9|
| 12867336|9508000000.00|2015-06-19|2017-01-09| 570| 1.2E9|
| 25679288|2000000000.00|2015-03-02|2017-01-09| 679| 1.0E9|
| 36533846|1000000000.00|2015-04-07|2017-01-09| 643| 9.99E8|
| 22460218| 957543745.56|2015-07-13|2015-12-14| 154|9.5026374556E8|
| 29223009| 750000000.00|2015-05-18|2015-11-10| 176| 5.75E8|
| 34486816| 500000000.00|2015-12-22|2017-01-09| 384| 4.995E8|
| 32345794| 500000000.00|2015-02-01|2017-01-09| 708| 4.995E8|
| 36058552|3062554419.00|2015-09-01|2017-01-09| 496| 3.42054419E8|
| 34720819| 303655700.00|2015-02-20|2015-09-27| 219| 3.035536E8|
| 73349613| 720000000.00|2015-05-13|2017-01-09| 607| 3.0E8|
| 32081576| 361218000.00|2015-10-06|2017-01-09| 461| 2.46E8|
| 33971443| 500000000.00|2015-05-18|2015-12-03| 199| 2.4E8|
| 36685719| 236450000.00|2015-07-03|2015-09-30| 89| 2.3595E8|
| 36020873| 236450000.00|2015-07-09|2015-09-30| 83| 2.3595E8|
| 32127711| 300000000.00|2015-01-29|2017-01-09| 711| 2.34E8|
| 11814913|2728000000.00|2015-04-09|2017-01-09| 641| 2.05E8|
| 36937661| 200000000.00|2015-08-25|2017-01-09| 503| 1.995E8|
| 36700386| 200000000.00|2015-05-29|2017-01-09| 591| 1.995E8|
| 35209948| 400540000.00|2015-06-15|2017-01-09| 574| 1.75E8|
+---------+-------------+----------+----------+--------+--------------+
only showing top 20 rows
In [17]:
print(vaekst2015Df.select("cvrNummer").distinct().count())
5526
In [18]:
#creating deltager as a table
deltagerRelationDf = createNextLayerTable(cvrDf,["cvrNummer"],"deltagerRelation","virksomhed")
#deltagerRelationDf.printSchema()
deltagerRelationCol = deltagerRelationDf.columns
print(deltagerRelationCol)
deltagerRelationCol.remove("deltager")
#deltagerRelationDf.show()
deltagerDf = expandSubCols(deltagerRelationDf,"deltager")
deltagerDf.show(1)
#deltagerDf.printSchema()
deltagerDf = deltagerDf.drop("kontorsteder").drop("organisationer")
deltagerCols = deltagerDf.columns
deltagerCols.remove("navne")
deltagerNamesDf = createNextLayerTable(df=deltagerDf,nonExplodedColumns=deltagerCols,explodedColumn="navne")
deltagerNamesDf.show(10)
['cvrNummer', 'deltager', 'kontorsteder', 'organisationer']
+---------+------------+--------------------+--------------------+------------+----------+-----------------+--------------------+--------------------+
|cvrNummer|kontorsteder| organisationer| beliggenhedsadresse|enhedsNummer|enhedstype|forretningsnoegle| navne| sidstIndlaest|
+---------+------------+--------------------+--------------------+------------+----------+-----------------+--------------------+--------------------+
| 27850367| []|[[WrappedArray([0...|[[null,null,null,...| 4000218628| PERSON| null|[[Gitte Heegaard ...|2015-12-10T04:38:...|
+---------+------------+--------------------+--------------------+------------+----------+-----------------+--------------------+--------------------+
only showing top 1 row
+---------+--------------------+------------+----------+-----------------+--------------------+--------------------+--------------------+--------------------+
|cvrNummer| beliggenhedsadresse|enhedsNummer|enhedstype|forretningsnoegle| sidstIndlaest| navn| periode| sidstOpdateret|
+---------+--------------------+------------+----------+-----------------+--------------------+--------------------+--------------------+--------------------+
| 27850367|[[null,null,null,...| 4000218628| PERSON| null|2015-12-10T04:38:...| Gitte Heegaard Amby| [null,null]| null|
| 36376554|[[null,null,Kongs...| 4006222194| PERSON| null|2015-12-10T04:38:...|Anna-Sophie Schomann| [null,null]| null|
| 36376554|[[null,null,null,...| 4006222195| PERSON| null|2015-12-10T01:20:...| Elsebet Rasmussen| [null,null]| null|
| 26452635|[[null,null,Himme...| 2653504|VIRKSOMHED| 10056322|2015-06-23T15:50:...|CAT-SYMBION INNOV...|[2001-01-25,2009-...|2013-11-22T19:12:...|
| 26452635|[[null,null,Himme...| 2653504|VIRKSOMHED| 10056322|2015-06-23T15:50:...| KR 279 A/S|[2000-12-01,2001-...|2000-12-15T05:14:...|
| 26452635|[[null,null,Himme...| 2678531|VIRKSOMHED| 10073014|2015-06-23T15:55:...| CAT SEED A/S|[2001-05-07,2014-...|2014-02-26T08:05:...|
| 26452635|[[null,null,null,...| 5105513|VIRKSOMHED| 26052394|2015-12-02T23:00:...| JUT NR. 2180 A/S|[2001-04-01,2001-...|2001-05-30T10:43:...|
| 26452635|[[null,null,null,...| 5105513|VIRKSOMHED| 26052394|2015-12-02T23:00:...|EASY BUSINESS STA...|[2001-08-15,2003-...|2001-09-06T13:50:...|
| 26452635|[[null,null,null,...| 5105513|VIRKSOMHED| 26052394|2015-12-02T23:00:...|ENERGY BUSINESS A...|[2003-02-28,2005-...|2003-04-14T10:01:...|
| 26452635|[[null,null,null,...| 5105513|VIRKSOMHED| 26052394|2015-12-02T23:00:...|EY SERVICE STATSA...|[2005-06-29,2009-...|2005-06-30T10:44:...|
+---------+--------------------+------------+----------+-----------------+--------------------+--------------------+--------------------+--------------------+
only showing top 10 rows
In [19]:
deltagerNamesDf.printSchema()
root
|-- cvrNummer: long (nullable = true)
|-- beliggenhedsadresse: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- bogstavFra: string (nullable = true)
| | |-- bogstavTil: string (nullable = true)
| | |-- bynavn: string (nullable = true)
| | |-- conavn: string (nullable = true)
| | |-- etage: string (nullable = true)
| | |-- fritekst: string (nullable = true)
| | |-- husnummerFra: long (nullable = true)
| | |-- husnummerTil: long (nullable = true)
| | |-- kommune: struct (nullable = true)
| | | |-- kommuneKode: long (nullable = true)
| | | |-- kommuneNavn: string (nullable = true)
| | | |-- periode: struct (nullable = true)
| | | | |-- gyldigFra: string (nullable = true)
| | | | |-- gyldigTil: string (nullable = true)
| | | |-- sidstOpdateret: string (nullable = true)
| | |-- landekode: string (nullable = true)
| | |-- periode: struct (nullable = true)
| | | |-- gyldigFra: string (nullable = true)
| | | |-- gyldigTil: string (nullable = true)
| | |-- postboks: string (nullable = true)
| | |-- postdistrikt: string (nullable = true)
| | |-- postnummer: long (nullable = true)
| | |-- sidedoer: string (nullable = true)
| | |-- sidstOpdateret: string (nullable = true)
| | |-- vejkode: long (nullable = true)
| | |-- vejnavn: string (nullable = true)
|-- enhedsNummer: long (nullable = true)
|-- enhedstype: string (nullable = true)
|-- forretningsnoegle: long (nullable = true)
|-- sidstIndlaest: string (nullable = true)
|-- navn: string (nullable = true)
|-- periode: struct (nullable = true)
| |-- gyldigFra: string (nullable = true)
| |-- gyldigTil: string (nullable = true)
|-- sidstOpdateret: string (nullable = true)
In [53]:
expandedHovedBrancheWithDateDf.columns
Out[53]:
['cvrNummer',
'branchekode',
'branchetekst',
'sidstOpdateret',
'gyldigFra',
'gyldigTil']
In [62]:
brancheList = ["cvrNummer","hovedbranche"]#"bibranche1","bibranche2","bibranche3"
companyBrancherDf = (cvrDf
.select([F.col("virksomhed."+f) for f in brancheList]))
#companyBrancherDf.show()
brancheList.remove("hovedbranche")
expandedHovedBrancheDf = createNextLayerTable(df=companyBrancherDf,nonExplodedColumns=brancheList,explodedColumn="hovedbranche")
expandedHovedBrancheWithDateDf = (expandSubCols(expandedHovedBrancheDf,"periode")
.orderBy(["cvrNummer","gyldigFra"],ascending=[1,1]))
#expandedHovedBrancheWithDateDf.show(truncate=False)
hovedbrancher2015Df = (expandedHovedBrancheWithDateDf
.filter((F.year(F.col("gyldigFra"))==2015))
.withColumn(colName="DayOfYear",col=F.dayofyear(F.col("gyldigFra")))
.groupBy(expandedHovedBrancheWithDateDf.columns)
.max("DayOfYear"))
hovedbrancher2015Df.show()
hovedbrancher2015Cols = hovedbrancher2015Df.columns
prefixedHovedBranch2015Df = (hovedbrancher2015Df
.select([F.col(x).alias("branche"+x) for x in hovedbrancher2015Cols]))
#prefixedHovedBranch2015Df.show()
+---------+-----------+--------------------+--------------------+----------+----------+--------------+
|cvrNummer|branchekode| branchetekst| sidstOpdateret| gyldigFra| gyldigTil|max(DayOfYear)|
+---------+-----------+--------------------+--------------------+----------+----------+--------------+
| 10001382| 467700|Engroshandel med ...|2015-05-08T15:07:...|2015-01-01| null| 1|
| 10002230| 682040|Udlejning af erhv...|2016-01-07T14:13:...|2015-01-01| null| 1|
| 10002273| 682040|Udlejning af erhv...|2015-11-03T13:36:...|2015-04-15|2015-10-23| 105|
| 10009561| 642010|Finansielle holdi...|2015-05-18T09:05:...|2015-01-01| null| 1|
| 10012589| 682040|Udlejning af erhv...|2015-03-05T07:09:...|2015-03-01| null| 60|
| 10012988| 649900|Anden finansiel f...|2015-02-13T16:05:...|2015-02-13| null| 44|
| 10020263| 811000|Kombinerede servi...|2015-05-19T09:42:...|2015-05-19| null| 139|
| 10022967| 749090|Andre liberale, v...|2015-07-28T15:49:...|2015-07-15| null| 196|
| 10026121| 649900|Anden finansiel f...|2015-02-03T12:57:...|2015-01-30| null| 30|
| 10029147| 749090|Andre liberale, v...|2015-01-10T17:05:...|2015-01-12| null| 12|
| 10031567| 464700|Engroshandel med ...|2016-02-22T08:52:...|2015-01-01| null| 1|
| 10031923| 900300| Kunstnerisk skaben|2015-08-30T13:15:...|2015-08-30| null| 242|
| 10038766| 477890|Detailhandel med ...|2014-11-19T10:20:...|2015-01-01| null| 1|
| 10039584| 682040|Udlejning af erhv...|2016-02-01T17:19:...|2015-10-02| null| 275|
| 10040019| 642020|Ikke-finansielle ...|2016-02-29T14:35:...|2015-07-01| null| 182|
| 10042585| 351400|Handel med elektr...|2015-04-23T09:06:...|2015-04-23| null| 113|
| 10043425| 649900|Anden finansiel f...|2015-10-15T11:56:...|2015-01-01| null| 1|
| 10045517| 649900|Anden finansiel f...|2015-08-18T14:45:...|2015-08-18| null| 230|
| 10048761| 236300|Fremstilling af f...|2015-01-09T13:48:...|2015-01-09| null| 9|
| 10054958| 812290|Anden rengøring a...|2015-06-26T10:57:...|2015-01-01| null| 1|
+---------+-----------+--------------------+--------------------+----------+----------+--------------+
only showing top 20 rows
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-62-bc7e521d5708> in <module>()
21 hovedbrancher2015Cols = hovedbrancher2015Df.columns
22
---> 23 prefixedHovedBranch2015Df = (ovedbrancher2015Df
24 .select([F.col(x).alias("branche"+x) for x in hovedbrancher2015Cols]))
25 #prefixedHovedBranch2015Df.show()
NameError: name 'ovedbrancher2015Df' is not defined
In [63]:
#investigate whether a cvrNumber is listed twice in
hovedbrancher2015Df.select("cvrNummer").groupby().count().show()
hovedbrancher2015Df.select("cvrNummer").distinct().groupby().count().show()
+-----+
|count|
+-----+
|96334|
+-----+
+-----+
|count|
+-----+
|92159|
+-----+
In [21]:
#expandedHovedBrancheWithDateDf.select(F.col("cvrNummer")
# ,F.year(F.col("gyldigFra")).alias("aar")
# ,F.col("branchekode")
# ,F.col("branchetekst")).groupBy(F.col("branchekode"),F.col("branchetekst"),F.col("aar")).count().show()
+-----------+--------------------+----+-----+
|branchekode| branchetekst| aar|count|
+-----------+--------------------+----+-----+
| 741020|Kommunikationsdes...|2008| 1779|
| 514610|Engroshandel med ...|2000| 51|
| 741490|Anden virksomheds...|2003| 2909|
| 713400|Udlejning af mask...|2005| 167|
| 851210|Alment praktisere...|1986| 124|
| 912000| Fagforeninger|1986| 24|
| 316290|Fremstilling af a...|2007| 39|
| 139210|Fremstilling af b...|2008| 182|
| 331030|Fremstilling af e...|2000| 4|
| 512200| Rumfart|1985| 12|
| 522120|Parkering og vejh...|2008| 258|
| 711230|Opstilling og lev...|2008| 31|
| 452510| Murerforretninger|1985| 110|
| 515400|Engroshandel med ...|2002| 66|
| 332090|Fremstilling af a...|2002| 15|
| 513900|Ikke-specialisere...|2003| 59|
| 282900|Fremstilling af a...|2008| 152|
| 602200| Taxikørsel|1973| 49|
| 515100|Engroshandel med ...|2003| 32|
| 631100|Databehandling, w...|2003| 3|
+-----------+--------------------+----+-----+
only showing top 20 rows
In [45]:
vaekst2015Cols = vaekst2015Df.columns
branche2015Cols = prefixedHovedBranch2015Df.columns
vaekstWithBrancher2015Df = (vaekst2015Df
.join(prefixedHovedBranch2015Df,vaekst2015Df["cvrNummer"]==prefixedHovedBranch2015Df["branchecvrNummer"],"outer")
)
root
|-- cvrNummer: long (nullable = true)
|-- vaerdi: string (nullable = true)
|-- gyldigFra: string (nullable = true)
|-- gyldigTil: string (nullable = true)
|-- varighed: integer (nullable = true)
|-- CumlativeDiff: double (nullable = true)
|-- branchecvrNummer: long (nullable = true)
|-- branchebranchekode: string (nullable = true)
|-- branchebranchetekst: string (nullable = true)
|-- branchesidstOpdateret: string (nullable = true)
|-- branchegyldigFra: string (nullable = true)
|-- branchegyldigTil: string (nullable = true)
+---------+------------+----------+----------+--------+-------------+----------------+------------------+--------------------+---------------------+----------------+----------------+
|cvrNummer| vaerdi| gyldigFra| gyldigTil|varighed|CumlativeDiff|branchecvrNummer|branchebranchekode| branchebranchetekst|branchesidstOpdateret|branchegyldigFra|branchegyldigTil|
+---------+------------+----------+----------+--------+-------------+----------------+------------------+--------------------+---------------------+----------------+----------------+
| null| null| null| null| null| null| 10194164| 981000|Private husholdni...| 2015-08-20T16:21:...| 2015-04-15| null|
| null| null| null| null| null| null| 12920547| 683110| Ejendomsmæglere mv.| 2015-06-05T08:09:...| 2015-01-01| null|
| null| null| null| null| null| null| 13054533| 682040|Udlejning af erhv...| 2015-09-25T10:23:...| 2015-09-25| null|
| null| null| null| null| null| null| 13545731| 900300| Kunstnerisk skaben| 2015-03-20T11:17:...| 2015-03-10| null|
| null| null| null| null| null| null| 17201948| 649900|Anden finansiel f...| 2015-09-30T17:12:...| 2015-06-01| null|
| null| null| null| null| null| null| 17313088| 702100|Public relations ...| 2015-03-24T16:15:...| 2015-04-01| null|
| null| null| null| null| null| null| 17944673| 256200| Maskinforarbejdning| 2015-07-15T10:15:...| 2015-07-01| null|
| null| null| null| null| null| null| 19337731| 433410| Malerforretninger| 2015-06-10T10:23:...| 2015-07-01| null|
| 20016175|264453099.00|2015-06-17|2017-01-05| 568| 1.14453099E8| null| null| null| null| null| null|
| null| null| null| null| null| null| 20035331| 691000| Juridisk bistand| 2015-02-02T19:05:...| 2015-01-01| null|
| null| null| null| null| null| null| 20427132| 702100|Public relations ...| 2015-11-10T10:08:...| 2015-11-05| null|
| null| null| null| null| null| null| 20820330| 682040|Udlejning af erhv...| 2015-12-16T14:14:...| 2015-10-01| null|
| null| null| null| null| null| null| 21091871| 561010| Restauranter| 2015-10-09T14:10:...| 2015-10-01| null|
| null| null| null| null| null| null| 21609897| 433410| Malerforretninger| 2015-12-15T12:11:...| 2015-10-01| null|
| null| null| null| null| null| null| 21627631| 731110| Reklamebureauer| 2015-07-31T15:07:...| 2015-08-01| null|
| 21734144| 50000.00|2015-06-03|2017-01-05| 582| -85000.0| null| null| null| null| null| null|
| null| null| null| null| null| null| 21847305| 741010|Industriel design...| 2015-05-19T05:07:...| 2015-06-01| null|
| null| null| null| null| null| null| 21943436| 433900|Anden bygningsfær...| 2016-04-24T18:05:...| 2015-07-15| 2016-04-30|
| null| null| null| null| null| null| 21985333| 960400|Aktiviteter vedrø...| 2015-06-07T13:07:...| 2015-06-05| null|
| 25042573| 20000000.00|2015-10-01|2017-01-05| 462| 1.95E7| null| null| null| null| null| null|
+---------+------------+----------+----------+--------+-------------+----------------+------------------+--------------------+---------------------+----------------+----------------+
only showing top 20 rows
In [46]:
vaekstWithBrancher2015Df.show()
+---------+------------+----------+----------+--------+-------------+----------------+------------------+--------------------+---------------------+----------------+----------------+
|cvrNummer| vaerdi| gyldigFra| gyldigTil|varighed|CumlativeDiff|branchecvrNummer|branchebranchekode| branchebranchetekst|branchesidstOpdateret|branchegyldigFra|branchegyldigTil|
+---------+------------+----------+----------+--------+-------------+----------------+------------------+--------------------+---------------------+----------------+----------------+
| null| null| null| null| null| null| 10194164| 981000|Private husholdni...| 2015-08-20T16:21:...| 2015-04-15| null|
| null| null| null| null| null| null| 12920547| 683110| Ejendomsmæglere mv.| 2015-06-05T08:09:...| 2015-01-01| null|
| null| null| null| null| null| null| 13054533| 682040|Udlejning af erhv...| 2015-09-25T10:23:...| 2015-09-25| null|
| null| null| null| null| null| null| 13545731| 900300| Kunstnerisk skaben| 2015-03-20T11:17:...| 2015-03-10| null|
| null| null| null| null| null| null| 17201948| 649900|Anden finansiel f...| 2015-09-30T17:12:...| 2015-06-01| null|
| null| null| null| null| null| null| 17313088| 702100|Public relations ...| 2015-03-24T16:15:...| 2015-04-01| null|
| null| null| null| null| null| null| 17944673| 256200| Maskinforarbejdning| 2015-07-15T10:15:...| 2015-07-01| null|
| null| null| null| null| null| null| 19337731| 433410| Malerforretninger| 2015-06-10T10:23:...| 2015-07-01| null|
| 20016175|264453099.00|2015-06-17|2017-01-05| 568| 1.14453099E8| null| null| null| null| null| null|
| null| null| null| null| null| null| 20035331| 691000| Juridisk bistand| 2015-02-02T19:05:...| 2015-01-01| null|
| null| null| null| null| null| null| 20427132| 702100|Public relations ...| 2015-11-10T10:08:...| 2015-11-05| null|
| null| null| null| null| null| null| 20820330| 682040|Udlejning af erhv...| 2015-12-16T14:14:...| 2015-10-01| null|
| null| null| null| null| null| null| 21091871| 561010| Restauranter| 2015-10-09T14:10:...| 2015-10-01| null|
| null| null| null| null| null| null| 21609897| 433410| Malerforretninger| 2015-12-15T12:11:...| 2015-10-01| null|
| null| null| null| null| null| null| 21627631| 731110| Reklamebureauer| 2015-07-31T15:07:...| 2015-08-01| null|
| 21734144| 50000.00|2015-06-03|2017-01-05| 582| -85000.0| null| null| null| null| null| null|
| null| null| null| null| null| null| 21847305| 741010|Industriel design...| 2015-05-19T05:07:...| 2015-06-01| null|
| null| null| null| null| null| null| 21943436| 433900|Anden bygningsfær...| 2016-04-24T18:05:...| 2015-07-15| 2016-04-30|
| null| null| null| null| null| null| 21985333| 960400|Aktiviteter vedrø...| 2015-06-07T13:07:...| 2015-06-05| null|
| 25042573| 20000000.00|2015-10-01|2017-01-05| 462| 1.95E7| null| null| null| null| null| null|
+---------+------------+----------+----------+--------+-------------+----------------+------------------+--------------------+---------------------+----------------+----------------+
only showing top 20 rows
In [ ]:
Content source: mssalvador/notebooks
Similar notebooks: