In [1]:
import os
import sys
from re import findall, sub

import numpy as np
from pyspark import SQLContext
from pyspark import SparkContext
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql.functions import udf, mean
In [2]:
# An explicit SparkSession builder was tried here earlier and is left out;
# this cell relies on an existing `sc` (presumably supplied by the PySpark
# kernel/shell -- it is not created anywhere in the visible cells).
#
# NOTE(review): the app name is set on the SparkConf object returned by
# sc.getConf(); whether that also renames the already-running application
# should be confirmed against the PySpark docs.
conf = sc.getConf().setAppName("ImportMetaCVR")
print(conf.getAll())

# SQL entry point used by the later cell that reads the JSON metadata.
sqlContext = SQLContext(sc)
[('hive.metastore.warehouse.dir', 'file:/home/svanhmic/workspace/Python/Erhvervs/src/notebooks/cvr/spark-warehouse'), ('spark.app.name', 'ImportMetaCVR'), ('spark.sql.catalogImplementation', 'hive'), ('spark.app.id', 'local-1483963678924'), ('spark.rdd.compress', 'True'), ('spark.driver.host', '10.52.1.5'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.driver.port', '39795'), ('spark.executor.id', 'driver'), ('spark.submit.deployMode', 'client'), ('spark.driver.memory', '6G')]
In [ ]:
In [4]:
# Location of the CVR company-metadata JSON file. The original absolute path
# is kept as the default so the notebook behaves exactly as before; set the
# CVR_METADATA_PATH environment variable to point at the file on another
# machine without editing this cell.
metaDataLink = os.environ.get(
    "CVR_METADATA_PATH",
    "/home/svanhmic/workspace/Python/Erhvervs/data/cdata/virksomhedersMetadata.json",
)

# Read the JSON file into a Spark DataFrame; no explicit schema is passed,
# so the column types are whatever the reader derives from the data.
metaDataDf = sqlContext.read.json(metaDataLink)
In [5]:
# Preview the loaded metadata: first 20 rows, long strings truncated
# (the same settings the bare show() call uses by default).
metaDataDf.show(n=20, truncate=True)
+-------+------------+------------------+---------+---------+----------------+--------------------+-------------+----------------------+--------------------+
| Index|antalAnsatte|brancheAnsvarskode|cvrnummer|nPenheder|reklamebeskyttet| sammensatStatus|stiftelsesAar|virksomhedsBeskrivelse| virksomhedsstatus|
+-------+------------+------------------+---------+---------+----------------+--------------------+-------------+----------------------+--------------------+
|1232645| 0| None| 26565537| 1| 1| aktiv| 2002| Enkeltmandsvirkso...| |
|1232646| null| None| 34642362| 1| 1| aktiv| 2012| Frivillig forening| |
|1232647| null| None| 37645311| 1| 1| normal| 2016| None| normal|
|1232648| 20| None| 50743128| 1| 0| ophørt| 1975| Interessentskab| slettet|
|1232649| null| None| 37299499| 1| 0| normal| 2015| Anpartsselskab| normal|
|1232650| null| None| 31180368| 1| 0| normal| 2008| Anpartsselskab| normal|
|1232651| 0| None| 31265584| 1| 0| normal| 2008| Anpartsselskab| normal|
|1232652| null| None| 35636889| 2| 1| normal| 2014| Anpartsselskab| normal|
|1232653| 0| None| 25357264| 1| 0| aktiv| 2000| Enkeltmandsvirkso...| |
|1232654| null| None| 37563188| 1| 1| aktiv| 2016| Enkeltmandsvirkso...| |
|1232655| 0| None| 25286499| 1| 1| aktiv| 2000| Enkeltmandsvirkso...| |
|1232656| null| None| 37562254| 1| 1| normal| 2016| None| normal|
|1232657| null| None| 36204370| 1| 1| ophørt| 2015| Interessentskab| |
|1232658| 1| None| 31329949| 1| 0| normal| 2008| Anpartsselskab| normal|
|1232659| 0| None| 31330297| 1| 0| normal| 2008| Filial af udenlan...| normal|
|1232660| null| None| 32257194| 1| 1| aktiv| 2009| Anden udenlandsk ...| |
|1232661| 0| None| 26217776| 1| 1| fremtid| 2001| Enkeltmandsvirkso...| |
|1232662| 0| None| 15428406| 1| 0| aktiv| 1991| Enkeltmandsvirkso...| |
|1232663| null| None| 31254000| 1| 0|undertvangsopløsning| 2008| Anpartsselskab|undertvangsopløsning|
|1232664| null| None| 30520858| 1| 0| normal| 2008| Anpartsselskab| normal|
+-------+------------+------------------+---------+---------+----------------+--------------------+-------------+----------------------+--------------------+
only showing top 20 rows
In [ ]:
Content source: mssalvador/notebooks
Similar notebooks: