In [1]:
import numpy as np
import sys

from pyspark import SQLContext
from pyspark import SparkContext
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql.functions import  udf, mean
from re import findall, sub

In [2]:
# Set up the Spark entry points used by this notebook.
#
# `sc` is not defined anywhere in the notebook itself; it only exists when the
# kernel/launcher injects it. SparkContext.getOrCreate() returns the already
# active context when there is one and otherwise creates one, so this cell
# also works after "Restart Kernel -> Run All" in a plain Python kernel.
sc = SparkContext.getOrCreate()

# NOTE(review): per the PySpark docs, getConf() hands back a copy of the
# context's configuration, so setAppName() below presumably only changes what
# is printed here and does not rename the running application -- confirm.
conf = sc.getConf()
conf.setAppName("ImportMetaCVR")
print(conf.getAll())

# SQL entry point bound to the context above.
sqlContext = SQLContext(sc)


[('hive.metastore.warehouse.dir', 'file:/home/svanhmic/workspace/Python/Erhvervs/src/notebooks/cvr/spark-warehouse'), ('spark.app.name', 'ImportMetaCVR'), ('spark.sql.catalogImplementation', 'hive'), ('spark.app.id', 'local-1483963678924'), ('spark.rdd.compress', 'True'), ('spark.driver.host', '10.52.1.5'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.driver.port', '39795'), ('spark.executor.id', 'driver'), ('spark.submit.deployMode', 'client'), ('spark.driver.memory', '6G')]

In [ ]:


In [4]:
# Location of the company ("virksomheder") metadata JSON export.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not find the file on another machine unless the path is adjusted.
metaDataLink = (
    "/home/svanhmic/workspace/Python/Erhvervs/data/cdata/virksomhedersMetadata.json"
)

# Read the JSON file into a Spark DataFrame using the generic reader API.
metaDataDf = sqlContext.read.format("json").load(metaDataLink)

In [5]:
# Preview the loaded metadata: first 20 rows, long cell values truncated
# (these are the defaults of show(), spelled out explicitly).
metaDataDf.show(n=20, truncate=True)


+-------+------------+------------------+---------+---------+----------------+--------------------+-------------+----------------------+--------------------+
|  Index|antalAnsatte|brancheAnsvarskode|cvrnummer|nPenheder|reklamebeskyttet|     sammensatStatus|stiftelsesAar|virksomhedsBeskrivelse|   virksomhedsstatus|
+-------+------------+------------------+---------+---------+----------------+--------------------+-------------+----------------------+--------------------+
|1232645|           0|              None| 26565537|        1|               1|               aktiv|         2002|  Enkeltmandsvirkso...|                    |
|1232646|        null|              None| 34642362|        1|               1|               aktiv|         2012|    Frivillig forening|                    |
|1232647|        null|              None| 37645311|        1|               1|              normal|         2016|                  None|              normal|
|1232648|          20|              None| 50743128|        1|               0|              ophørt|         1975|       Interessentskab|             slettet|
|1232649|        null|              None| 37299499|        1|               0|              normal|         2015|        Anpartsselskab|              normal|
|1232650|        null|              None| 31180368|        1|               0|              normal|         2008|        Anpartsselskab|              normal|
|1232651|           0|              None| 31265584|        1|               0|              normal|         2008|        Anpartsselskab|              normal|
|1232652|        null|              None| 35636889|        2|               1|              normal|         2014|        Anpartsselskab|              normal|
|1232653|           0|              None| 25357264|        1|               0|               aktiv|         2000|  Enkeltmandsvirkso...|                    |
|1232654|        null|              None| 37563188|        1|               1|               aktiv|         2016|  Enkeltmandsvirkso...|                    |
|1232655|           0|              None| 25286499|        1|               1|               aktiv|         2000|  Enkeltmandsvirkso...|                    |
|1232656|        null|              None| 37562254|        1|               1|              normal|         2016|                  None|              normal|
|1232657|        null|              None| 36204370|        1|               1|              ophørt|         2015|       Interessentskab|                    |
|1232658|           1|              None| 31329949|        1|               0|              normal|         2008|        Anpartsselskab|              normal|
|1232659|           0|              None| 31330297|        1|               0|              normal|         2008|  Filial af udenlan...|              normal|
|1232660|        null|              None| 32257194|        1|               1|               aktiv|         2009|  Anden udenlandsk ...|                    |
|1232661|           0|              None| 26217776|        1|               1|             fremtid|         2001|  Enkeltmandsvirkso...|                    |
|1232662|           0|              None| 15428406|        1|               0|               aktiv|         1991|  Enkeltmandsvirkso...|                    |
|1232663|        null|              None| 31254000|        1|               0|undertvangsopløsning|         2008|        Anpartsselskab|undertvangsopløsning|
|1232664|        null|              None| 30520858|        1|               0|              normal|         2008|        Anpartsselskab|              normal|
+-------+------------+------------------+---------+---------+----------------+--------------------+-------------+----------------------+--------------------+
only showing top 20 rows


In [ ]: