In [1]:
SQLContext.newSession(sqlContext)
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler,StandardScaler,RFormula
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.linalg import VectorUDT,Vectors
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import Window
from pyspark.ml import Pipeline


import re
from tabulate import tabulate
import random
import sys
import numpy as np

In [2]:
#import data and rename bad name rank into vaerdiSlope
df = sqlContext.read.parquet("/home/svanhmic/workspace/Python/Erhvervs/data/cdata/featureDataCvr")
df.select(["cvrNummer"])
rankCols = [re.sub(pattern="rank_",repl="vaerdiSlope_",string=i) for i in df.columns ]
renamedDf = (df.withColumn(colName="reklamebeskyttet",col=F.col("reklamebeskyttet").cast("integer"))
             .select([F.col(val).alias(rankCols[idx]) for idx,val in enumerate(df.columns)])
             )
renamedDf.show()


+---------+--------+-----+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+-----------+-----------------+------------------+-------------------+-------------------+-------------+-------------+-------------+-------------+-------------+----------------+
|cvrNummer|  status|label|AarsVaerk_1|AarsVaerk_2|AarsVaerk_3|AarsVaerk_4|AarsVaerk_5|AarsVaerk_6|AarsVaerk_7|AarsVaerk_8|AarsVaerk_9|AarsVaerk_10|AarsVaerk_11|AarsVaerk_12|AarsVaerk_13|AarsVaerk_14|AarsVaerk_15|medArb_1|medArb_2|medArb_3|medArb_4|medArb_5|medArb_6|medArb_7|medArb_8|medArb_9|medArb_10|medArb_11|medArb_12|medArb_13|medArb_14|medArb_15|avgVarighed|totalAabneEnheder|totalLukketEnheder|      vaerdiSlope_1|      vaerdiSlope_2|vaerdiSlope_3|vaerdiSlope_4|vaerdiSlope_5|vaerdiSlope_6|vaerdiSlope_7|reklamebeskyttet|
+---------+--------+-----+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+-----------+-----------------+------------------+-------------------+-------------------+-------------+-------------+-------------+-------------+-------------+----------------+
| 20172800|[NORMAL]|    1|       50.0|       50.0|       50.0|       50.0|       50.0|       50.0|       50.0|       50.0|       50.0|        20.0|        50.0|        20.0|        20.0|        20.0|        20.0|    50.0|    50.0|    50.0|    50.0|    50.0|    50.0|    50.0|    50.0|    50.0|     50.0|     50.0|     20.0|     20.0|     20.0|     20.0|        7.0|                1|                 1| -4497.751124437781|               null|         null|         null|         null|         null|         null|               0|
| 14049800|[NORMAL]|    1|       20.0|       20.0|       20.0|       20.0|       20.0|       20.0|       20.0|       20.0|       20.0|        10.0|        10.0|        10.0|        10.0|        10.0|        10.0|    20.0|    20.0|    20.0|    20.0|    20.0|    20.0|    20.0|    20.0|    20.0|     20.0|     20.0|     20.0|     10.0|     10.0|     10.0|       null|                0|                 1|  220.4585537918871| 17.921146953405017|         null|         null|         null|         null|         null|               0|
| 19298000|[NORMAL]|    1|        1.0|        1.0|        1.0|        1.0|        2.0|        2.0|        1.0|        1.0|        2.0|         2.0|         2.0|         5.0|         5.0|         2.0|         2.0|     1.0|     1.0|     1.0|     2.0|     2.0|     2.0|     2.0|     2.0|     2.0|      2.0|      5.0|      5.0|      5.0|      5.0|      5.0|       null|                0|                 1|-108.41507485802788|-126.19888944977284|         null|         null|         null|         null|         null|               1|
| 21855200|[NORMAL]|    1|        1.0|       10.0|       10.0|       10.0|       10.0|       null|       null|       null|       null|        null|        null|        null|        null|        null|        null|     2.0|    10.0|    10.0|    10.0|    10.0|    null|    null|    null|    null|     null|     null|     null|     null|     null|     null|       null|                0|                 1| 10.933741526350317|               null|         null|         null|         null|         null|         null|               0|
| 14786600|[NORMAL]|    1|        0.0|        0.0|        1.0|        1.0|       null|       null|       null|       null|       null|        null|        null|        null|        null|        null|        null|     1.0|     1.0|     0.0|     0.0|    null|    null|    null|    null|    null|     null|     null|     null|     null|     null|     null|       null|                0|                 1|               null|               null|         null|         null|         null|         null|         null|               0|
| 32556000|[NORMAL]|    1|        2.0|        2.0|        1.0|       null|       null|       null|       null|       null|       null|        null|        null|        null|        null|        null|        null|    10.0|    10.0|     5.0|    null|    null|    null|    null|    null|    null|     null|     null|     null|     null|     null|     null|       null|                0|                 1|               null|               null|         null|         null|         null|         null|         null|               0|
| 20308400|[NORMAL]|    1|        1.0|        0.0|        0.0|        0.0|       null|       null|       null|       null|       null|        null|        null|        null|        null|        null|        null|     0.0|     1.0|     1.0|     1.0|    null|    null|    null|    null|    null|     null|     null|     null|     null|     null|     null|       null|                0|                 1|               null|               null|         null|         null|         null|         null|         null|               0|
| 34455600|[NORMAL]|    1|        1.0|       null|       null|       null|       null|       null|       null|       null|       null|        null|        null|        null|        null|        null|        null|     1.0|    null|    null|    null|    null|    null|    null|    null|    null|     null|     null|     null|     null|     null|     null|       null|                0|                 1|               null|               null|         null|         null|         null|         null|         null|               0|
| 10944600|[NORMAL]|    1|        2.0|        2.0|        2.0|        2.0|        2.0|        5.0|        5.0|        5.0|        5.0|        10.0|        10.0|        20.0|        20.0|        10.0|        10.0|     5.0|     5.0|     5.0|     5.0|     5.0|     5.0|     5.0|     5.0|     5.0|      5.0|     10.0|     20.0|     20.0|     20.0|     10.0|       null|                0|                 1|  83.05647840531562|   96.3275135460566|         null|         null|         null|         null|         null|               0|
| 13009600|[NORMAL]|    1|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|         5.0|         5.0|         5.0|         5.0|         5.0|         5.0|    10.0|     5.0|    10.0|    10.0|    10.0|     5.0|     5.0|    10.0|    10.0|      5.0|      5.0|     10.0|      5.0|     10.0|      5.0|       null|                0|                 1|    761.03500761035|  76.27765064836004|         null|         null|         null|         null|         null|               1|
| 27585000|[NORMAL]|    1|        1.0|        1.0|        1.0|       null|       null|       null|       null|       null|       null|        null|        null|        null|        null|        null|        null|     1.0|     1.0|     1.0|    null|    null|    null|    null|    null|    null|     null|     null|     null|     null|     null|     null|       null|                0|                 1|               null|               null|         null|         null|         null|         null|         null|               0|
| 13341400|[NORMAL]|    1|       10.0|       10.0|       10.0|       10.0|       10.0|       10.0|       10.0|        5.0|       10.0|        10.0|        10.0|        10.0|         5.0|         5.0|         5.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|     10.0|     10.0|     10.0|     10.0|     10.0|     10.0|       null|                0|                 1| 158.73015873015873|  319.3033381712627|         null|         null|         null|         null|         null|               0|
| 14876200|[NORMAL]|    1|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|         1.0|         1.0|         1.0|         1.0|         1.0|        null|     1.0|     1.0|     0.0|     2.0|     1.0|     2.0|     2.0|     2.0|     1.0|      1.0|      1.0|      1.0|      1.0|      2.0|     null|       null|                0|                 1|   21.1864406779661|               null|         null|         null|         null|         null|         null|               0|
| 14290400|[NORMAL]|    1|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|         5.0|         5.0|         5.0|         5.0|         5.0|         5.0|     5.0|     5.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|     10.0|     10.0|     10.0|     10.0|     10.0|     10.0|       null|                0|                 1|               null|               null|         null|         null|         null|         null|         null|               0|
| 25108000|[NORMAL]|    1|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|         5.0|         2.0|         2.0|        null|        null|        null|     5.0|     5.0|     5.0|    10.0|     5.0|     5.0|     5.0|     5.0|     5.0|      5.0|      5.0|      5.0|     null|     null|     null|       null|                0|                 1|               null|               null|         null|         null|         null|         null|         null|               0|
| 16981400|[NORMAL]|    1|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|         5.0|         5.0|         5.0|         5.0|         5.0|         5.0|     5.0|     5.0|     5.0|     5.0|     5.0|    10.0|     5.0|     5.0|     5.0|      5.0|      5.0|      5.0|      5.0|      5.0|      5.0|       null|                0|                 1|               null|               null|         null|         null|         null|         null|         null|               0|
| 12910800|[NORMAL]|    1|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|         1.0|         1.0|         1.0|         1.0|         1.0|         1.0|     1.0|     1.0|     1.0|     1.0|     1.0|     1.0|     1.0|     1.0|     2.0|      2.0|      2.0|      2.0|      2.0|      2.0|      2.0|       null|                0|                 1|  42.71982912068352|               null|         null|         null|         null|         null|         null|               0|
| 21827800|[NORMAL]|    1|       20.0|       10.0|       10.0|       10.0|       10.0|       10.0|       10.0|       10.0|       10.0|        10.0|        10.0|         5.0|         5.0|        null|        null|    20.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|     10.0|     10.0|      5.0|     10.0|     null|     null|       null|                0|                 2|               null|               null|         null|         null|         null|         null|         null|               0|
| 25548000|[NORMAL]|    1|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|       null|       null|       null|        null|        null|        null|        null|        null|        null|    10.0|    10.0|    10.0|     5.0|    10.0|    10.0|    null|    null|    null|     null|     null|     null|     null|     null|     null|      986.5|                2|                 1|               null|               null|         null|         null|         null|         null|         null|               0|
| 33065000|[NORMAL]|    1|        2.0|        5.0|       null|       null|       null|       null|       null|       null|       null|        null|        null|        null|        null|        null|        null|     0.0|    10.0|    null|    null|    null|    null|    null|    null|    null|     null|     null|     null|     null|     null|     null|       null|                0|                 1|               null|               null|         null|         null|         null|         null|         null|               0|
+---------+--------+-----+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+-----------+-----------------+------------------+-------------------+-------------------+-------------+-------------+-------------+-------------+-------------+----------------+
only showing top 20 rows


In [ ]:


In [ ]:


In [9]:
strs = ""
excludedCols = ["medArb_"+str(i) for i in range(1,16)]+["cvrNummer","label","status"]
for i in renamedDf.columns:
    if i not in excludedCols:
        strs += i+" + "

#excludedCols    
imputedDf = renamedDf.fillna(value=0.0)
formula = RFormula(formula="label ~ "+strs[:-3],labelCol="label")


formula.fit(imputedDf.drop("status")).transform(imputedDf.drop("status")).show()

lr = LogisticRegression()

pipeline = Pipeline(stages=[formula,lr])


+---------+-----+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+-----------+-----------------+------------------+-------------------+-------------------+-------------+-------------+-------------+-------------+-------------+----------------+--------------------+
|cvrNummer|label|AarsVaerk_1|AarsVaerk_2|AarsVaerk_3|AarsVaerk_4|AarsVaerk_5|AarsVaerk_6|AarsVaerk_7|AarsVaerk_8|AarsVaerk_9|AarsVaerk_10|AarsVaerk_11|AarsVaerk_12|AarsVaerk_13|AarsVaerk_14|AarsVaerk_15|medArb_1|medArb_2|medArb_3|medArb_4|medArb_5|medArb_6|medArb_7|medArb_8|medArb_9|medArb_10|medArb_11|medArb_12|medArb_13|medArb_14|medArb_15|avgVarighed|totalAabneEnheder|totalLukketEnheder|      vaerdiSlope_1|      vaerdiSlope_2|vaerdiSlope_3|vaerdiSlope_4|vaerdiSlope_5|vaerdiSlope_6|vaerdiSlope_7|reklamebeskyttet|            features|
+---------+-----+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+-----------+-----------------+------------------+-------------------+-------------------+-------------+-------------+-------------+-------------+-------------+----------------+--------------------+
| 20172800|    1|       50.0|       50.0|       50.0|       50.0|       50.0|       50.0|       50.0|       50.0|       50.0|        20.0|        50.0|        20.0|        20.0|        20.0|        20.0|    50.0|    50.0|    50.0|    50.0|    50.0|    50.0|    50.0|    50.0|    50.0|     50.0|     50.0|     20.0|     20.0|     20.0|     20.0|        7.0|                1|                 1| -4497.751124437781|                0.0|          0.0|          0.0|          0.0|          0.0|          0.0|               0|[50.0,50.0,50.0,5...|
| 14049800|    1|       20.0|       20.0|       20.0|       20.0|       20.0|       20.0|       20.0|       20.0|       20.0|        10.0|        10.0|        10.0|        10.0|        10.0|        10.0|    20.0|    20.0|    20.0|    20.0|    20.0|    20.0|    20.0|    20.0|    20.0|     20.0|     20.0|     20.0|     10.0|     10.0|     10.0|        0.0|                0|                 1|  220.4585537918871| 17.921146953405017|          0.0|          0.0|          0.0|          0.0|          0.0|               0|[20.0,20.0,20.0,2...|
| 19298000|    1|        1.0|        1.0|        1.0|        1.0|        2.0|        2.0|        1.0|        1.0|        2.0|         2.0|         2.0|         5.0|         5.0|         2.0|         2.0|     1.0|     1.0|     1.0|     2.0|     2.0|     2.0|     2.0|     2.0|     2.0|      2.0|      5.0|      5.0|      5.0|      5.0|      5.0|        0.0|                0|                 1|-108.41507485802788|-126.19888944977284|          0.0|          0.0|          0.0|          0.0|          0.0|               1|[1.0,1.0,1.0,1.0,...|
| 21855200|    1|        1.0|       10.0|       10.0|       10.0|       10.0|        0.0|        0.0|        0.0|        0.0|         0.0|         0.0|         0.0|         0.0|         0.0|         0.0|     2.0|    10.0|    10.0|    10.0|    10.0|     0.0|     0.0|     0.0|     0.0|      0.0|      0.0|      0.0|      0.0|      0.0|      0.0|        0.0|                0|                 1| 10.933741526350317|                0.0|          0.0|          0.0|          0.0|          0.0|          0.0|               0|(26,[0,1,2,3,4,17...|
| 14786600|    1|        0.0|        0.0|        1.0|        1.0|        0.0|        0.0|        0.0|        0.0|        0.0|         0.0|         0.0|         0.0|         0.0|         0.0|         0.0|     1.0|     1.0|     0.0|     0.0|     0.0|     0.0|     0.0|     0.0|     0.0|      0.0|      0.0|      0.0|      0.0|      0.0|      0.0|        0.0|                0|                 1|                0.0|                0.0|          0.0|          0.0|          0.0|          0.0|          0.0|               0|(26,[2,3,17],[1.0...|
| 32556000|    1|        2.0|        2.0|        1.0|        0.0|        0.0|        0.0|        0.0|        0.0|        0.0|         0.0|         0.0|         0.0|         0.0|         0.0|         0.0|    10.0|    10.0|     5.0|     0.0|     0.0|     0.0|     0.0|     0.0|     0.0|      0.0|      0.0|      0.0|      0.0|      0.0|      0.0|        0.0|                0|                 1|                0.0|                0.0|          0.0|          0.0|          0.0|          0.0|          0.0|               0|(26,[0,1,2,17],[2...|
| 20308400|    1|        1.0|        0.0|        0.0|        0.0|        0.0|        0.0|        0.0|        0.0|        0.0|         0.0|         0.0|         0.0|         0.0|         0.0|         0.0|     0.0|     1.0|     1.0|     1.0|     0.0|     0.0|     0.0|     0.0|     0.0|      0.0|      0.0|      0.0|      0.0|      0.0|      0.0|        0.0|                0|                 1|                0.0|                0.0|          0.0|          0.0|          0.0|          0.0|          0.0|               0|(26,[0,17],[1.0,1...|
| 34455600|    1|        1.0|        0.0|        0.0|        0.0|        0.0|        0.0|        0.0|        0.0|        0.0|         0.0|         0.0|         0.0|         0.0|         0.0|         0.0|     1.0|     0.0|     0.0|     0.0|     0.0|     0.0|     0.0|     0.0|     0.0|      0.0|      0.0|      0.0|      0.0|      0.0|      0.0|        0.0|                0|                 1|                0.0|                0.0|          0.0|          0.0|          0.0|          0.0|          0.0|               0|(26,[0,17],[1.0,1...|
| 10944600|    1|        2.0|        2.0|        2.0|        2.0|        2.0|        5.0|        5.0|        5.0|        5.0|        10.0|        10.0|        20.0|        20.0|        10.0|        10.0|     5.0|     5.0|     5.0|     5.0|     5.0|     5.0|     5.0|     5.0|     5.0|      5.0|     10.0|     20.0|     20.0|     20.0|     10.0|        0.0|                0|                 1|  83.05647840531562|   96.3275135460566|          0.0|          0.0|          0.0|          0.0|          0.0|               0|[2.0,2.0,2.0,2.0,...|
| 13009600|    1|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|         5.0|         5.0|         5.0|         5.0|         5.0|         5.0|    10.0|     5.0|    10.0|    10.0|    10.0|     5.0|     5.0|    10.0|    10.0|      5.0|      5.0|     10.0|      5.0|     10.0|      5.0|        0.0|                0|                 1|    761.03500761035|  76.27765064836004|          0.0|          0.0|          0.0|          0.0|          0.0|               1|[5.0,5.0,5.0,5.0,...|
| 27585000|    1|        1.0|        1.0|        1.0|        0.0|        0.0|        0.0|        0.0|        0.0|        0.0|         0.0|         0.0|         0.0|         0.0|         0.0|         0.0|     1.0|     1.0|     1.0|     0.0|     0.0|     0.0|     0.0|     0.0|     0.0|      0.0|      0.0|      0.0|      0.0|      0.0|      0.0|        0.0|                0|                 1|                0.0|                0.0|          0.0|          0.0|          0.0|          0.0|          0.0|               0|(26,[0,1,2,17],[1...|
| 13341400|    1|       10.0|       10.0|       10.0|       10.0|       10.0|       10.0|       10.0|        5.0|       10.0|        10.0|        10.0|        10.0|         5.0|         5.0|         5.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|     10.0|     10.0|     10.0|     10.0|     10.0|     10.0|        0.0|                0|                 1| 158.73015873015873|  319.3033381712627|          0.0|          0.0|          0.0|          0.0|          0.0|               0|[10.0,10.0,10.0,1...|
| 14876200|    1|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|         1.0|         1.0|         1.0|         1.0|         1.0|         0.0|     1.0|     1.0|     0.0|     2.0|     1.0|     2.0|     2.0|     2.0|     1.0|      1.0|      1.0|      1.0|      1.0|      2.0|      0.0|        0.0|                0|                 1|   21.1864406779661|                0.0|          0.0|          0.0|          0.0|          0.0|          0.0|               0|(26,[0,1,2,3,4,5,...|
| 14290400|    1|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|         5.0|         5.0|         5.0|         5.0|         5.0|         5.0|     5.0|     5.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|     10.0|     10.0|     10.0|     10.0|     10.0|     10.0|        0.0|                0|                 1|                0.0|                0.0|          0.0|          0.0|          0.0|          0.0|          0.0|               0|(26,[0,1,2,3,4,5,...|
| 25108000|    1|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|         5.0|         2.0|         2.0|         0.0|         0.0|         0.0|     5.0|     5.0|     5.0|    10.0|     5.0|     5.0|     5.0|     5.0|     5.0|      5.0|      5.0|      5.0|      0.0|      0.0|      0.0|        0.0|                0|                 1|                0.0|                0.0|          0.0|          0.0|          0.0|          0.0|          0.0|               0|(26,[0,1,2,3,4,5,...|
| 16981400|    1|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|         5.0|         5.0|         5.0|         5.0|         5.0|         5.0|     5.0|     5.0|     5.0|     5.0|     5.0|    10.0|     5.0|     5.0|     5.0|      5.0|      5.0|      5.0|      5.0|      5.0|      5.0|        0.0|                0|                 1|                0.0|                0.0|          0.0|          0.0|          0.0|          0.0|          0.0|               0|(26,[0,1,2,3,4,5,...|
| 12910800|    1|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|         1.0|         1.0|         1.0|         1.0|         1.0|         1.0|     1.0|     1.0|     1.0|     1.0|     1.0|     1.0|     1.0|     1.0|     2.0|      2.0|      2.0|      2.0|      2.0|      2.0|      2.0|        0.0|                0|                 1|  42.71982912068352|                0.0|          0.0|          0.0|          0.0|          0.0|          0.0|               0|[1.0,1.0,1.0,1.0,...|
| 21827800|    1|       20.0|       10.0|       10.0|       10.0|       10.0|       10.0|       10.0|       10.0|       10.0|        10.0|        10.0|         5.0|         5.0|         0.0|         0.0|    20.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|     10.0|     10.0|      5.0|     10.0|      0.0|      0.0|        0.0|                0|                 2|                0.0|                0.0|          0.0|          0.0|          0.0|          0.0|          0.0|               0|(26,[0,1,2,3,4,5,...|
| 25548000|    1|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        0.0|        0.0|        0.0|         0.0|         0.0|         0.0|         0.0|         0.0|         0.0|    10.0|    10.0|    10.0|     5.0|    10.0|    10.0|     0.0|     0.0|     0.0|      0.0|      0.0|      0.0|      0.0|      0.0|      0.0|      986.5|                2|                 1|                0.0|                0.0|          0.0|          0.0|          0.0|          0.0|          0.0|               0|(26,[0,1,2,3,4,5,...|
| 33065000|    1|        2.0|        5.0|        0.0|        0.0|        0.0|        0.0|        0.0|        0.0|        0.0|         0.0|         0.0|         0.0|         0.0|         0.0|         0.0|     0.0|    10.0|     0.0|     0.0|     0.0|     0.0|     0.0|     0.0|     0.0|      0.0|      0.0|      0.0|      0.0|      0.0|      0.0|        0.0|                0|                 1|                0.0|                0.0|          0.0|          0.0|          0.0|          0.0|          0.0|               0|(26,[0,1,17],[2.0...|
+---------+-----+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+-----------+-----------------+------------------+-------------------+-------------------+-------------+-------------+-------------+-------------+-------------+----------------+--------------------+
only showing top 20 rows


In [ ]:
cols [i for i in renamedDf.columnsl if i not in excludedCols]

model = pipeline.fit(renamedDf.select(*cols))

In [ ]:


In [3]:
#check null values.
descriptionCVR = renamedDf.describe()
descriptionCVR.select("summary").show()
descriptionCVR.filter(F.col("summary")=="count").show()


+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+

+-------+---------+------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+-----------+-----------------+------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+----------------+
|summary|cvrNummer| label|AarsVaerk_1|AarsVaerk_2|AarsVaerk_3|AarsVaerk_4|AarsVaerk_5|AarsVaerk_6|AarsVaerk_7|AarsVaerk_8|AarsVaerk_9|AarsVaerk_10|AarsVaerk_11|AarsVaerk_12|AarsVaerk_13|AarsVaerk_14|AarsVaerk_15|medArb_1|medArb_2|medArb_3|medArb_4|medArb_5|medArb_6|medArb_7|medArb_8|medArb_9|medArb_10|medArb_11|medArb_12|medArb_13|medArb_14|medArb_15|avgVarighed|totalAabneEnheder|totalLukketEnheder|vaerdiSlope_1|vaerdiSlope_2|vaerdiSlope_3|vaerdiSlope_4|vaerdiSlope_5|vaerdiSlope_6|vaerdiSlope_7|reklamebeskyttet|
+-------+---------+------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+-----------+-----------------+------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+----------------+
|  count|   317827|317827|     317827|     269143|     210742|     174464|     143649|     118501|      99374|      84004|      69626|       59566|       50753|       43772|       36725|       28263|       22737|  317827|  269143|  210742|  174464|  143649|  118501|   99374|   84004|   69626|    59566|    50753|    43772|    36725|    28263|    22737|     234873|           317827|            317827|       121067|        45698|        19630|         9925|         5622|         3499|         2357|          317827|
+-------+---------+------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+-----------+-----------------+------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+----------------+


In [4]:
#check mean and stddev
descriptionCVR.filter((F.col("summary") =="mean") | (F.col("summary") =="stddev")).show()


+-------+-------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+-------------------+-----------------+------------------+-----------------+-------------------+------------------+------------------+------------------+-------------------+
|summary|          cvrNummer|             label|       AarsVaerk_1|       AarsVaerk_2|      AarsVaerk_3|      AarsVaerk_4|       AarsVaerk_5|       AarsVaerk_6|      AarsVaerk_7|       AarsVaerk_8|       AarsVaerk_9|      AarsVaerk_10|      AarsVaerk_11|      AarsVaerk_12|      AarsVaerk_13|      AarsVaerk_14|      AarsVaerk_15|          medArb_1|         medArb_2|         medArb_3|         medArb_4|          medArb_5|          medArb_6|          medArb_7|          medArb_8|          medArb_9|         medArb_10|         medArb_11|        medArb_12|         medArb_13|         medArb_14|        medArb_15|       avgVarighed| totalAabneEnheder| totalLukketEnheder|    vaerdiSlope_1|     vaerdiSlope_2|    vaerdiSlope_3|      vaerdiSlope_4|     vaerdiSlope_5|     vaerdiSlope_6|     vaerdiSlope_7|   reklamebeskyttet|
+-------+-------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+-------------------+-----------------+------------------+-----------------+-------------------+------------------+------------------+------------------+-------------------+
|   mean|2.992587977176892E7|0.6770318443681625| 4.963124592938926| 6.599220488736471|7.883416689601503|9.083146093176817| 9.771213165424054|10.298917308714694|10.77020146114678|11.333662682729393|11.979418607991267|12.565540744720142|13.190747344984533|13.685712327515306|14.092008168822328|15.306301524961965|15.918722786647315| 5.503519210136332|8.213265810368465|9.818617076804813|11.16323711482025|12.375860604668324|12.991983190015274|13.533087125405036|13.999154802152278|14.809654439433546|15.498606587650674| 16.10517604870648|16.86322306497304|17.443267528931244|18.770017337154584|19.24664643532568|3104.0394283223163|1.1975634543320737|0.42037334776466445|9367.222088225697|14773.935940526042|37270.54804289701|  534722.7367923224| 69479.75111113006| 79309.96883183606|246815.61845150983|0.05565606446274231|
| stddev|1.595872029695645E7|0.4815407404508707|28.219409782015937|31.349174803319965|36.38636236195736|40.84287153372559|43.053166612742295| 43.50716762822306|44.69115651768418| 46.24885171030284| 48.03721081702004|49.663482624596746|   50.952745810432|51.757992157941516| 51.79592907581495|57.952550432313735|59.896979275308205|31.469466071930402| 35.7271436211818|40.94583736584511| 45.5828092120186| 48.68024770176678| 49.50427185113245|50.816539411344706| 52.01619572491007|54.732773025784745|  56.2466055958348|56.478429021321915|58.18250553629226| 59.22942233650709| 64.22850620145455|64.30084551221002| 2826.832782887831| 4.297482547820878|  2.316930888416194|1104747.513966895|485532.34538503713|947528.7880333586|4.535843433554792E7|1192985.3794908465|1020811.2995945786| 4224380.725125994|0.22925669525652828|
+-------+-------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+-------------------+-----------------+------------------+-----------------+-------------------+------------------+------------------+------------------+-------------------+


In [5]:
windowSpecRank =(Window.partitionBy(F.col("cvrNummer"))).orderBy(F.col("gyldigFra").desc())

groupCols = ["cvrNummer","vaerdi"]

companyNameDf = (sqlContext
                 .read
                 .parquet("/home/svanhmic/workspace/Python/Erhvervs/data/cdata/"+"companyCvrData")
                 .withColumn(colName="rank",col=F.rank().over(windowSpecRank))
                 .filter((F.col("rank")==1) & (F.col("sekvensnr")==0))
                 .select([F.col(i) for i in groupCols])
                 .withColumnRenamed(existing="vaerdi",new="navn")
                 .orderBy(F.col("cvrNummer"))
                 .cache()
                )
companyNameDf.show(2)


+---------+--------------------+
|cvrNummer|                navn|
+---------+--------------------+
| 10000009|              YELLOW|
| 10000025|WATERFRONT CONNEC...|
+---------+--------------------+
only showing top 2 rows


In [6]:
#take ln(x+1) of features

labelCols = ["cvrNummer","label","status"]
logFeatCols = [i for i in renamedDf.columns if i not in labelCols]
#print(logFeatCols)
mininum = descriptionCVR.filter(F.col("summary")=="min").collect()[0]
#print(mininum)


logDf = (renamedDf
         .select([F.col("cvrNummer"),F.col("label")]+[F.log1p(F.col(i)-F.lit(mininum[i])).alias(i) for i in logFeatCols])
         .na
         .fill(0.0,logFeatCols)
         
        )
#logDf.show(2)


#First convert features to vetor
toDenseUDf = F.udf(lambda x: Vectors.dense(x.toArray()),VectorUDT())
vectorizer = VectorAssembler(inputCols=logFeatCols,outputCol="features")

rawVectorDataDf = (vectorizer.transform(renamedDf                                                                               
                                        .join(companyNameDf,(companyNameDf["cvrNummer"]==renamedDf["cvrNummer"]),"inner")
                                        .drop(companyNameDf["cvrNummer"])
                                        #.select(*logColsSelected) 
                                        .na
                                        .fill(0.0,logFeatCols)
                                        .distinct()
                                       )
                   .select(["navn"]+labelCols+[toDenseUDf(vectorizer.getOutputCol()).alias(vectorizer.getOutputCol())])
                  )

standardScale = StandardScaler(withMean=True,withStd=True,inputCol=vectorizer.getOutputCol(),outputCol="scaledFeatures")
standardScaleModel = standardScale.fit(rawVectorDataDf)
scaledFeaturesDf = (standardScaleModel
                    .transform(rawVectorDataDf)
                    .drop("features")
                    .withColumnRenamed(existing="scaledFeatures",new="features")
                   )

scaledFeaturesDf.show()


+--------------------+---------+-----+--------------------+--------------------+
|                navn|cvrNummer|label|              status|            features|
+--------------------+---------+-----+--------------------+--------------------+
|   EMPAKA KARTONNAGE| 10016533|    1|            [NORMAL]|[0.12721405809989...|
|      DET GAMLE GODS| 10016606|    1|            [NORMAL]|[-0.1483813281958...|
|               DIXEN| 10018064|    0|[OPLØST EFTER KON...|[-0.1483813281958...|
|        TRELBORG VVS| 10063760|    1|            [NORMAL]|[-0.1483813281958...|
|            CYBERSUN| 10065917|    0|[OPLØST EFTER KON...|[-0.0258944898421...|
|                HME2| 10080207|    1|            [NORMAL]|[-0.1483813281958...|
|TØMRERFIRMAET HER...| 10082528|    1|            [NORMAL]|[-0.1177596186074...|
|    MØRKHOLT VINDUER| 10096227|    1|            [NORMAL]|[-0.1483813281958...|
|                KISØ| 10108993|    1|            [NORMAL]|[-0.1177596186074...|
|               C-CUT| 10117267|    1|[OPLØST EFTER ERK...|[-0.1483813281958...|
|          HLS INVEST| 10120829|    0|[OPLØST EFTER KON...|[-0.1483813281958...|
|       ELKA RAINWEAR| 10138698|    1|            [NORMAL]|[0.12721405809989...|
|              BØMLER| 10142539|    0|[OPLØST EFTER KON...|[-0.1483813281958...|
|     NORDISKE MEDIER| 10150825|    1|            [NORMAL]|[-0.1177596186074...|
|  SBR POUL-ERIK KJÆR| 10178509|    0|[OPLØST EFTER KON...|[-0.1483813281958...|
|         SCAN ORIENT| 10228476|    1|[TVANGSOPLØST, UN...|[-0.1483813281958...|
|           AQUA-NORD| 10310989|    1|[OPLØST EFTER FRI...|[-0.1177596186074...|
|        IKAST ETIKET| 10402980|    1|            [NORMAL]|[-0.1177596186074...|
|BRDR. JØRGENSEN C...| 10518431|    1|            [NORMAL]|[0.43343115398404...|
|         AXEL MADSEN| 10553490|    1|            [NORMAL]|[-0.0258944898421...|
+--------------------+---------+-----+--------------------+--------------------+
only showing top 20 rows


In [14]:
#put them into a feature vecto
vectorizedTestDf = scaledFeaturesDf.filter(F.col("label") <= 1).sampleBy("label", fractions={0: 0.2, 1: 0.2}, seed=42)
vectorizedTestDf.groupBy("label").count().show()

scaledCvrDf = scaledFeaturesDf.select(F.col("cvrNummer"))
cvrTestDf = vectorizedTestDf.select("cvrNummer")
cvrTrainDf = scaledCvrDf.subtract(cvrTestDf) #take the other partion as training set

vectorizedTrainDf = (scaledFeaturesDf
                     .filter(F.col("label") <= 1)
                     .join(cvrTrainDf,(scaledFeaturesDf["cvrNummer"] == cvrTrainDf["cvrNummer"]),"inner")
                     .drop(cvrTrainDf["cvrNummer"])
                    )
vectorizedTrainDf.groupBy("label").count().show()
print("Number of data points: "+str(scaledFeaturesDf.count()))
print("Number of data points train: "+str(vectorizedTrainDf.select("cvrNummer").count()))
print("Number of data points test: "+str(vectorizedTestDf.select("cvrNummer").count()))
#vectorizedTrainDf.printSchema()
#print(vectorizedTrainDf.first())


+-----+-----+
|label|count|
+-----+-----+
|    0| 6078|
|    1|25756|
+-----+-----+

+-----+------+
|label| count|
+-----+------+
|    0| 24465|
|    1|103472|
+-----+------+

Number of data points: 160648
Number of data points train: 127937
Number of data points test: 31834

In [15]:
vectorizedTrainDf.show()


+--------------------+-----+--------------------+--------------------+---------+
|                navn|label|              status|            features|cvrNummer|
+--------------------+-----+--------------------+--------------------+---------+
|SKYTTENS HANDEL O...|    1|[OPLØST EFTER FUS...|[-0.1483813281958...| 10019052|
|            DIKI.NET|    0|[OPLØST EFTER KON...|[-0.1483813281958...| 10026113|
|                CTEK|    1|[OPLØST EFTER FUS...|[-0.1177596186074...| 10040523|
|      VG ENTREPRENØR|    1|            [NORMAL]|[0.43343115398404...| 10057426|
|NORDBYENS OLIEFYR...|    1|            [NORMAL]|[-0.1483813281958...| 10089514|
|EXPRESS LABELLING...|    1|[TVANGSOPLØST, UN...|[-0.1483813281958...| 10091713|
|PSYKOLOGERNE VED ...|    1|[OPLØST EFTER FUS...|[-0.1790030377842...| 10108624|
|         RAH HOLDING|    1|            [NORMAL]|[-0.1483813281958...| 10127351|
|AMAGER BROLÆGGERF...|    0|[OPLØST EFTER KON...|[-0.1177596186074...| 10128587|
|        ART OF JEWEL|    1|            [NORMAL]|[-0.1177596186074...| 10145619|
|    FRONT FREDERICIA|    1|[OPLØST EFTER FUS...|[0.12721405809989...| 10156521|
|M.L.R. AF 16. APR...|    1|[OPLØST EFTER ERK...|[-0.1177596186074...| 10200288|
| FOLKE LARSENS EFTF.|    1|            [NORMAL]|[-0.0258944898421...| 10351235|
|               TAORA|    1|[OPLØST EFTER FUS...|[-0.1790030377842...| 10366380|
|     CH AF 24/8 2000|    1|[OPLØST EFTER FRI...|[0.12721405809989...| 10412986|
|            SIBA-ØST|    1|[OPLØST EFTER FRI...|[-0.1483813281958...| 10481171|
|         STEVNS TRYK|    1|[OPLØST EFTER ERK...|[-0.1483813281958...| 10577675|
|  PERSPEKTIV HOLDING|    1|            [NORMAL]|[-0.1483813281958...| 10606683|
|     INTRUM JUSTITIA|    1|            [NORMAL]|[1.35208244163648...| 10613779|
|         LOBO MØBLER|    1|            [NORMAL]|[-0.1177596186074...| 10629098|
+--------------------+-----+--------------------+--------------------+---------+
only showing top 20 rows


In [16]:
#Train the logistic regressionmodel
lr = LogisticRegression()
grid = (ParamGridBuilder()
        .baseOn({lr.predictionCol:"prediction"})
        .baseOn({lr.rawPredictionCol:"rawPrediction"})
        .baseOn({lr.probabilityCol:"probability"})
        .baseOn({lr.labelCol:"label"})
        .baseOn({lr.featuresCol:"features"})
        .addGrid(param=lr.elasticNetParam,values=[0.1,1.0])
        .addGrid(param=lr.getMaxIter,values=[10])
        .build()
       )
evaluate = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
crossVal = CrossValidator(estimator=lr,estimatorParamMaps=grid,evaluator=evaluate,numFolds=10)

crossValModel = crossVal.fit(dataset=vectorizedTrainDf)
evaluate.evaluate(crossValModel.transform(vectorizedTestDf))
#coef = lrModel.coefficients


Out[16]:
0.8090805096973791

In [17]:
bestModel = crossValModel.bestModel

In [18]:
#test the values
result = bestModel.transform(vectorizedTestDf)

In [12]:
#

In [19]:
#result.orderBy("prediction").show(100)
confCols = [F.col(i) for i in ["TP","TN","FP","FN"]]


csCols = [F.when((F.col("label")==1) & (F.col("difference") == 0),"TP")
          ,F.when((F.col("label")==0) & (F.col("difference") == 0),"TN")
          ,F.when(F.col("difference") == 1,"FN")
          ,F.when(F.col("difference") == -1,"FP")
         ]

confusionDf = result.select(F.col("label"),F.col("prediction"),(F.col("label")-F.col("prediction")).alias("difference"))
(confusionDf
 .select(F.coalesce(*csCols).alias("cases")  
         #,.otherwise(0).alias("FP")
         #,.otherwise(0).alias("FN")
        )
 .groupBy("cases").count()
).show()


+-----+-----+
|cases|count|
+-----+-----+
|   TP|25710|
|   TN|   36|
|   FN|   46|
|   FP| 6042|
+-----+-----+


In [17]:
crossValModel.bestModel.hasSummary


Out[17]:
True

In [20]:
summary = crossValModel.bestModel.summary

In [24]:
summary.predictions.show()


+--------------------+-----+--------------------+--------------------+---------+--------------------+--------------------+----------+
|                navn|label|              status|            features|cvrNummer|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+---------+--------------------+--------------------+----------+
|SKYTTENS HANDEL O...|    1|[OPLØST EFTER FUS...|[-0.1483813281958...| 10019052|[-0.1436615979228...|[0.46414624375980...|       1.0|
|            DIKI.NET|    0|[OPLØST EFTER KON...|[-0.1483813281958...| 10026113|[-0.2470744652013...|[0.43854370342953...|       1.0|
|                CTEK|    1|[OPLØST EFTER FUS...|[-0.1177596186074...| 10040523|[-0.6803097893868...|[0.33619216432178...|       1.0|
|      VG ENTREPRENØR|    1|            [NORMAL]|[0.43343115398404...| 10057426|[-3.7227933151943...|[0.02359613625108...|       1.0|
|NORDBYENS OLIEFYR...|    1|            [NORMAL]|[-0.1483813281958...| 10089514|[-2.8801667669252...|[0.05314274429565...|       1.0|
|EXPRESS LABELLING...|    1|[TVANGSOPLØST, UN...|[-0.1483813281958...| 10091713|[-0.1715003567869...|[0.45722969080531...|       1.0|
|PSYKOLOGERNE VED ...|    1|[OPLØST EFTER FUS...|[-0.1790030377842...| 10108624|[-0.6485674484631...|[0.34331243211346...|       1.0|
|         RAH HOLDING|    1|            [NORMAL]|[-0.1483813281958...| 10127351|[-2.8804422421872...|[0.05312888447426...|       1.0|
|AMAGER BROLÆGGERF...|    0|[OPLØST EFTER KON...|[-0.1177596186074...| 10128587|[-1.1369952588026...|[0.24287246215155...|       1.0|
|        ART OF JEWEL|    1|            [NORMAL]|[-0.1177596186074...| 10145619|[-6.3724232788323...|[0.00170510286780...|       1.0|
|    FRONT FREDERICIA|    1|[OPLØST EFTER FUS...|[0.12721405809989...| 10156521|[-0.3793527083137...|[0.40628302543127...|       1.0|
|M.L.R. AF 16. APR...|    1|[OPLØST EFTER ERK...|[-0.1177596186074...| 10200288|[-1.4990290785923...|[0.18257037794247...|       1.0|
| FOLKE LARSENS EFTF.|    1|            [NORMAL]|[-0.0258944898421...| 10351235|[-3.6327245272056...|[0.02576276706084...|       1.0|
|               TAORA|    1|[OPLØST EFTER FUS...|[-0.1790030377842...| 10366380|[-0.9451119361175...|[0.27986891278054...|       1.0|
|     CH AF 24/8 2000|    1|[OPLØST EFTER FRI...|[0.12721405809989...| 10412986|[-0.5220910962423...|[0.37236339559707...|       1.0|
|            SIBA-ØST|    1|[OPLØST EFTER FRI...|[-0.1483813281958...| 10481171|[-0.6766429736895...|[0.33701096745397...|       1.0|
|         STEVNS TRYK|    1|[OPLØST EFTER ERK...|[-0.1483813281958...| 10577675|[-0.9080179077065...|[0.28740560581535...|       1.0|
|  PERSPEKTIV HOLDING|    1|            [NORMAL]|[-0.1483813281958...| 10606683|[-4.1379135688213...|[0.01570550931538...|       1.0|
|     INTRUM JUSTITIA|    1|            [NORMAL]|[1.35208244163648...| 10613779|[-2.8641158595488...|[0.05395622053629...|       1.0|
|         LOBO MØBLER|    1|            [NORMAL]|[-0.1177596186074...| 10629098|[-3.4536192522129...|[0.03066110863007...|       1.0|
+--------------------+-----+--------------------+--------------------+---------+--------------------+--------------------+----------+
only showing top 20 rows


In [ ]:


In [ ]: