In [41]:
SQLContext.newSession(sqlContext)
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler,StandardScaler,RFormula
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.linalg import VectorUDT,Vectors
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import Window
from pyspark.ml import Pipeline
from pyspark.ml.regression import GeneralizedLinearRegression


import re
from tabulate import tabulate
import random
import sys
import numpy as np

In [88]:
#import data and rename bad name rank into vaerdiSlope
df = sqlContext.read.parquet("/home/svanhmic/workspace/Python/Erhvervs/data/cdata/featureDataCvr")
df.select(["cvrNummer"])
rankCols = [re.sub(pattern="rank_",repl="vaerdiSlope_",string=i) for i in df.columns ]
renamedDf = (df.withColumn(colName="reklamebeskyttet",col=F.col("reklamebeskyttet").cast("integer"))
             .select([F.col(val).alias(rankCols[idx]) for idx,val in enumerate(df.columns)])
             .withColumn(col=F.col("totalAabneEnheder").cast("double"),colName="totalAabneEnheder")
             .withColumn(col=F.col("totalLukketEnheder").cast("double"),colName="totalLukketEnheder")
             .withColumn(col=F.col("reklamebeskyttet").cast("double"),colName="reklamebeskyttet")
             .withColumn(col=F.col("label").cast("double"),colName="label")
             
             )
renamedDf.show()


+---------+--------+-----+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+-----------+-----------------+------------------+-------------------+-------------------+-------------+-------------+-------------+-------------+-------------+----------------+
|cvrNummer|  status|label|AarsVaerk_1|AarsVaerk_2|AarsVaerk_3|AarsVaerk_4|AarsVaerk_5|AarsVaerk_6|AarsVaerk_7|AarsVaerk_8|AarsVaerk_9|AarsVaerk_10|AarsVaerk_11|AarsVaerk_12|AarsVaerk_13|AarsVaerk_14|AarsVaerk_15|medArb_1|medArb_2|medArb_3|medArb_4|medArb_5|medArb_6|medArb_7|medArb_8|medArb_9|medArb_10|medArb_11|medArb_12|medArb_13|medArb_14|medArb_15|avgVarighed|totalAabneEnheder|totalLukketEnheder|      vaerdiSlope_1|      vaerdiSlope_2|vaerdiSlope_3|vaerdiSlope_4|vaerdiSlope_5|vaerdiSlope_6|vaerdiSlope_7|reklamebeskyttet|
+---------+--------+-----+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+-----------+-----------------+------------------+-------------------+-------------------+-------------+-------------+-------------+-------------+-------------+----------------+
| 20172800|[NORMAL]|  1.0|       50.0|       50.0|       50.0|       50.0|       50.0|       50.0|       50.0|       50.0|       50.0|        20.0|        50.0|        20.0|        20.0|        20.0|        20.0|    50.0|    50.0|    50.0|    50.0|    50.0|    50.0|    50.0|    50.0|    50.0|     50.0|     50.0|     20.0|     20.0|     20.0|     20.0|        7.0|              1.0|               1.0| -4497.751124437781|               null|         null|         null|         null|         null|         null|             0.0|
| 14049800|[NORMAL]|  1.0|       20.0|       20.0|       20.0|       20.0|       20.0|       20.0|       20.0|       20.0|       20.0|        10.0|        10.0|        10.0|        10.0|        10.0|        10.0|    20.0|    20.0|    20.0|    20.0|    20.0|    20.0|    20.0|    20.0|    20.0|     20.0|     20.0|     20.0|     10.0|     10.0|     10.0|       null|              0.0|               1.0|  220.4585537918871| 17.921146953405017|         null|         null|         null|         null|         null|             0.0|
| 19298000|[NORMAL]|  1.0|        1.0|        1.0|        1.0|        1.0|        2.0|        2.0|        1.0|        1.0|        2.0|         2.0|         2.0|         5.0|         5.0|         2.0|         2.0|     1.0|     1.0|     1.0|     2.0|     2.0|     2.0|     2.0|     2.0|     2.0|      2.0|      5.0|      5.0|      5.0|      5.0|      5.0|       null|              0.0|               1.0|-108.41507485802788|-126.19888944977284|         null|         null|         null|         null|         null|             1.0|
| 21855200|[NORMAL]|  1.0|        1.0|       10.0|       10.0|       10.0|       10.0|       null|       null|       null|       null|        null|        null|        null|        null|        null|        null|     2.0|    10.0|    10.0|    10.0|    10.0|    null|    null|    null|    null|     null|     null|     null|     null|     null|     null|       null|              0.0|               1.0| 10.933741526350317|               null|         null|         null|         null|         null|         null|             0.0|
| 14786600|[NORMAL]|  1.0|        0.0|        0.0|        1.0|        1.0|       null|       null|       null|       null|       null|        null|        null|        null|        null|        null|        null|     1.0|     1.0|     0.0|     0.0|    null|    null|    null|    null|    null|     null|     null|     null|     null|     null|     null|       null|              0.0|               1.0|               null|               null|         null|         null|         null|         null|         null|             0.0|
| 32556000|[NORMAL]|  1.0|        2.0|        2.0|        1.0|       null|       null|       null|       null|       null|       null|        null|        null|        null|        null|        null|        null|    10.0|    10.0|     5.0|    null|    null|    null|    null|    null|    null|     null|     null|     null|     null|     null|     null|       null|              0.0|               1.0|               null|               null|         null|         null|         null|         null|         null|             0.0|
| 20308400|[NORMAL]|  1.0|        1.0|        0.0|        0.0|        0.0|       null|       null|       null|       null|       null|        null|        null|        null|        null|        null|        null|     0.0|     1.0|     1.0|     1.0|    null|    null|    null|    null|    null|     null|     null|     null|     null|     null|     null|       null|              0.0|               1.0|               null|               null|         null|         null|         null|         null|         null|             0.0|
| 34455600|[NORMAL]|  1.0|        1.0|       null|       null|       null|       null|       null|       null|       null|       null|        null|        null|        null|        null|        null|        null|     1.0|    null|    null|    null|    null|    null|    null|    null|    null|     null|     null|     null|     null|     null|     null|       null|              0.0|               1.0|               null|               null|         null|         null|         null|         null|         null|             0.0|
| 10944600|[NORMAL]|  1.0|        2.0|        2.0|        2.0|        2.0|        2.0|        5.0|        5.0|        5.0|        5.0|        10.0|        10.0|        20.0|        20.0|        10.0|        10.0|     5.0|     5.0|     5.0|     5.0|     5.0|     5.0|     5.0|     5.0|     5.0|      5.0|     10.0|     20.0|     20.0|     20.0|     10.0|       null|              0.0|               1.0|  83.05647840531562|   96.3275135460566|         null|         null|         null|         null|         null|             0.0|
| 13009600|[NORMAL]|  1.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|         5.0|         5.0|         5.0|         5.0|         5.0|         5.0|    10.0|     5.0|    10.0|    10.0|    10.0|     5.0|     5.0|    10.0|    10.0|      5.0|      5.0|     10.0|      5.0|     10.0|      5.0|       null|              0.0|               1.0|    761.03500761035|  76.27765064836004|         null|         null|         null|         null|         null|             1.0|
| 27585000|[NORMAL]|  1.0|        1.0|        1.0|        1.0|       null|       null|       null|       null|       null|       null|        null|        null|        null|        null|        null|        null|     1.0|     1.0|     1.0|    null|    null|    null|    null|    null|    null|     null|     null|     null|     null|     null|     null|       null|              0.0|               1.0|               null|               null|         null|         null|         null|         null|         null|             0.0|
| 13341400|[NORMAL]|  1.0|       10.0|       10.0|       10.0|       10.0|       10.0|       10.0|       10.0|        5.0|       10.0|        10.0|        10.0|        10.0|         5.0|         5.0|         5.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|     10.0|     10.0|     10.0|     10.0|     10.0|     10.0|       null|              0.0|               1.0| 158.73015873015873|  319.3033381712627|         null|         null|         null|         null|         null|             0.0|
| 14876200|[NORMAL]|  1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|         1.0|         1.0|         1.0|         1.0|         1.0|        null|     1.0|     1.0|     0.0|     2.0|     1.0|     2.0|     2.0|     2.0|     1.0|      1.0|      1.0|      1.0|      1.0|      2.0|     null|       null|              0.0|               1.0|   21.1864406779661|               null|         null|         null|         null|         null|         null|             0.0|
| 14290400|[NORMAL]|  1.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|         5.0|         5.0|         5.0|         5.0|         5.0|         5.0|     5.0|     5.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|     10.0|     10.0|     10.0|     10.0|     10.0|     10.0|       null|              0.0|               1.0|               null|               null|         null|         null|         null|         null|         null|             0.0|
| 25108000|[NORMAL]|  1.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|         5.0|         2.0|         2.0|        null|        null|        null|     5.0|     5.0|     5.0|    10.0|     5.0|     5.0|     5.0|     5.0|     5.0|      5.0|      5.0|      5.0|     null|     null|     null|       null|              0.0|               1.0|               null|               null|         null|         null|         null|         null|         null|             0.0|
| 16981400|[NORMAL]|  1.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|         5.0|         5.0|         5.0|         5.0|         5.0|         5.0|     5.0|     5.0|     5.0|     5.0|     5.0|    10.0|     5.0|     5.0|     5.0|      5.0|      5.0|      5.0|      5.0|      5.0|      5.0|       null|              0.0|               1.0|               null|               null|         null|         null|         null|         null|         null|             0.0|
| 12910800|[NORMAL]|  1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|        1.0|         1.0|         1.0|         1.0|         1.0|         1.0|         1.0|     1.0|     1.0|     1.0|     1.0|     1.0|     1.0|     1.0|     1.0|     2.0|      2.0|      2.0|      2.0|      2.0|      2.0|      2.0|       null|              0.0|               1.0|  42.71982912068352|               null|         null|         null|         null|         null|         null|             0.0|
| 21827800|[NORMAL]|  1.0|       20.0|       10.0|       10.0|       10.0|       10.0|       10.0|       10.0|       10.0|       10.0|        10.0|        10.0|         5.0|         5.0|        null|        null|    20.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|    10.0|     10.0|     10.0|      5.0|     10.0|     null|     null|       null|              0.0|               2.0|               null|               null|         null|         null|         null|         null|         null|             0.0|
| 25548000|[NORMAL]|  1.0|        5.0|        5.0|        5.0|        5.0|        5.0|        5.0|       null|       null|       null|        null|        null|        null|        null|        null|        null|    10.0|    10.0|    10.0|     5.0|    10.0|    10.0|    null|    null|    null|     null|     null|     null|     null|     null|     null|      986.5|              2.0|               1.0|               null|               null|         null|         null|         null|         null|         null|             0.0|
| 33065000|[NORMAL]|  1.0|        2.0|        5.0|       null|       null|       null|       null|       null|       null|       null|        null|        null|        null|        null|        null|        null|     0.0|    10.0|    null|    null|    null|    null|    null|    null|    null|     null|     null|     null|     null|     null|     null|       null|              0.0|               1.0|               null|               null|         null|         null|         null|         null|         null|             0.0|
+---------+--------+-----+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+-----------+-----------------+------------------+-------------------+-------------------+-------------+-------------+-------------+-------------+-------------+----------------+
only showing top 20 rows


In [ ]:


In [ ]:


In [99]:
strs = ""
excludedCols = ["medArb_"+str(i) for i in range(1,16)]+["cvrNummer","label","status"]
for i in renamedDf.columns:
    if i not in excludedCols:
        strs += i+" + "

#excludedCols    
imputedDf = renamedDf.fillna(value=0.0)
formula = RFormula(formula="label ~ "+strs[:-3],labelCol="label")

glr = GeneralizedLinearRegression(family="binomial", link="logit", maxIter=10, regParam=0.3)
print(glr.
lr = LogisticRegression()

pipeline = Pipeline(stages=[formula,glr])

grid = (ParamGridBuilder()
        .baseOn({lr.predictionCol:"prediction"})
        .baseOn({lr.rawPredictionCol:"rawPrediction"})
        .baseOn({lr.probabilityCol:"probability"})
        .baseOn({lr.labelCol:"label"})
        .baseOn({lr.featuresCol:"features"})
        .addGrid(param=lr.elasticNetParam,values=[0.1,1.0])
        .addGrid(param=lr.getMaxIter,values=[10])
        .build()
       )

evaluate = BinaryClassificationEvaluator()

trainEvalModel = TrainValidationSplit(estimator=pipeline,estimatorParamMaps=grid,evaluator=evaluate,trainRatio=0.8)


prediction

In [96]:
cols = [i for i in renamedDf.columns if i not in excludedCols]+["label"]

model = trainEvalModel.fit(imputedDf.select(*cols).filter(F.col("label") <= 1))


---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
/usr/local/share/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:

/usr/local/share/spark/python/lib/py4j-0.10.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    311                     "An error occurred while calling {0}{1}{2}.\n".
--> 312                     format(target_id, ".", name), value)
    313             else:

Py4JJavaError: An error occurred while calling o2716.evaluate.
: java.lang.IllegalArgumentException: Field "rawPrediction" does not exist.
	at org.apache.spark.sql.types.StructType$$anonfun$apply$1.apply(StructType.scala:228)
	at org.apache.spark.sql.types.StructType$$anonfun$apply$1.apply(StructType.scala:228)
	at scala.collection.MapLike$class.getOrElse(MapLike.scala:128)
	at scala.collection.AbstractMap.getOrElse(Map.scala:59)
	at org.apache.spark.sql.types.StructType.apply(StructType.scala:227)
	at org.apache.spark.ml.util.SchemaUtils$.checkColumnTypes(SchemaUtils.scala:56)
	at org.apache.spark.ml.evaluation.BinaryClassificationEvaluator.evaluate(BinaryClassificationEvaluator.scala:76)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:211)
	at java.lang.Thread.run(Thread.java:745)


During handling of the above exception, another exception occurred:

IllegalArgumentException                  Traceback (most recent call last)
<ipython-input-96-4946d3f6535c> in <module>()
      1 cols = [i for i in renamedDf.columns if i not in excludedCols]+["label"]
      2 
----> 3 model = trainEvalModel.fit(imputedDf.select(*cols).filter(F.col("label") <= 1))

/usr/local/share/spark/python/pyspark/ml/base.py in fit(self, dataset, params)
     62                 return self.copy(params)._fit(dataset)
     63             else:
---> 64                 return self._fit(dataset)
     65         else:
     66             raise ValueError("Params must be either a param map or a list/tuple of param maps, "

/usr/local/share/spark/python/pyspark/ml/tuning.py in _fit(self, dataset)
    380         for j in range(numModels):
    381             model = est.fit(train, epm[j])
--> 382             metric = eva.evaluate(model.transform(validation, epm[j]))
    383             metrics[j] += metric
    384         if eva.isLargerBetter():

/usr/local/share/spark/python/pyspark/ml/evaluation.py in evaluate(self, dataset, params)
     66                 return self.copy(params)._evaluate(dataset)
     67             else:
---> 68                 return self._evaluate(dataset)
     69         else:
     70             raise ValueError("Params must be a param map but got %s." % type(params))

/usr/local/share/spark/python/pyspark/ml/evaluation.py in _evaluate(self, dataset)
     96         """
     97         self._transfer_params_to_java()
---> 98         return self._java_obj.evaluate(dataset._jdf)
     99 
    100     def isLargerBetter(self):

/usr/local/share/spark/python/lib/py4j-0.10.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
    931         answer = self.gateway_client.send_command(command)
    932         return_value = get_return_value(
--> 933             answer, self.gateway_client, self.target_id, self.name)
    934 
    935         for temp_arg in temp_args:

/usr/local/share/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
     77                 raise QueryExecutionException(s.split(': ', 1)[1], stackTrace)
     78             if s.startswith('java.lang.IllegalArgumentException: '):
---> 79                 raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
     80             raise
     81     return deco

IllegalArgumentException: 'Field "rawPrediction" does not exist.'

In [91]:
predict = model..transform(imputedDf.select(*cols).filter(F.col("label") <= 1))
imputedDf.select(*cols).filter(F.col("label") <= 1).printSchema()


root
 |-- AarsVaerk_1: double (nullable = false)
 |-- AarsVaerk_2: double (nullable = false)
 |-- AarsVaerk_3: double (nullable = false)
 |-- AarsVaerk_4: double (nullable = false)
 |-- AarsVaerk_5: double (nullable = false)
 |-- AarsVaerk_6: double (nullable = false)
 |-- AarsVaerk_7: double (nullable = false)
 |-- AarsVaerk_8: double (nullable = false)
 |-- AarsVaerk_9: double (nullable = false)
 |-- AarsVaerk_10: double (nullable = false)
 |-- AarsVaerk_11: double (nullable = false)
 |-- AarsVaerk_12: double (nullable = false)
 |-- AarsVaerk_13: double (nullable = false)
 |-- AarsVaerk_14: double (nullable = false)
 |-- AarsVaerk_15: double (nullable = false)
 |-- avgVarighed: double (nullable = false)
 |-- totalAabneEnheder: double (nullable = false)
 |-- totalLukketEnheder: double (nullable = false)
 |-- vaerdiSlope_1: double (nullable = false)
 |-- vaerdiSlope_2: double (nullable = false)
 |-- vaerdiSlope_3: double (nullable = false)
 |-- vaerdiSlope_4: double (nullable = false)
 |-- vaerdiSlope_5: double (nullable = false)
 |-- vaerdiSlope_6: double (nullable = false)
 |-- vaerdiSlope_7: double (nullable = false)
 |-- reklamebeskyttet: double (nullable = false)
 |-- label: double (nullable = false)


In [95]:
p = model.stages[1].summary

print("Coefficient Standard Errors: " + str(p.coefficientStandardErrors))
print("T Values: " + str(p.tValues))
print("P Values: " + str(p.pValues))
print("Dispersion: " + str(p.dispersion))
print("Null Deviance: " + str(p.nullDeviance))
print("Residual Degree Of Freedom Null: " + str(p.residualDegreeOfFreedomNull))
print("Deviance: " + str(p.deviance))
print("Residual Degree Of Freedom: " + str(p.residualDegreeOfFreedom))
print("AIC: " + str(p.aic))
print("Deviance Residuals: ")
p.residuals().show()


Coefficient Standard Errors: [0.00060663308727155, 0.0007519666947785978, 0.000711341189202029, 0.0006439644831685794, 0.0008270035385413254, 0.0008046613897137305, 0.0010321426755108202, 0.0009805126061947754, 0.0011715362209909488, 0.0014315023877709083, 0.0016039269138056539, 0.0014605483890031716, 0.0014580786819210197, 0.0017614815929307924, 0.0014442858090305548, 1.4776939364887241e-06, 0.0024527979391702794, 0.005641754851351828, 2.1602125780588623e-08, 4.482988627440235e-08, 1.941301702726079e-07, 2.0998993359408759e-07, 6.902319351199482e-07, 1.4340377734315118e-05, 7.894013037571729e-06, 0.006566878379619086, 0.0063388762292734815]
T Values: [-283.03449896138227, 334.1714838200412, -109.68587851294157, -139.94269501038147, -179.80231389454045, 31.057119087710777, 48.42007280143863, 73.35819928413993, -101.56605125515793, -76.19650268271621, -83.18425599472717, 169.47191611885378, 109.72038472354501, -449.21664647282574, 377.14061210108883, 12.46005415872235, 68.8799591490644, 49.05598234468639, -633.8841577938279, 53.000340138228864, -197.25697275412088, 1087.1417532366233, -437.861577996344, 3465.9420471210274, -2803.8425688495117, 22.341417917833272, 77.69079487436609]
P Values: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Dispersion: 1.0
Null Deviance: 401241.2346208986
Residual Degree Of Freedom Null: 315725
Deviance: 800985.790291175
Residual Degree Of Freedom: 315699
AIC: 801039.790291175
Deviance Residuals: 
+------------------+
| devianceResiduals|
+------------------+
| 5.495744814729124|
| 2.915752450551924|
|0.6276935306296806|
|1.2022015391153766|
|0.9349878726666203|
|0.8435041663619414|
|0.9363351153560424|
|0.9363351153560424|
| 0.221823538936251|
| 1.417611165164476|
|0.8720615036837311|
| 2.091462730558736|
|1.2058553056300734|
|1.4788191258410732|
|1.4380863572855533|
|1.4788191258410732|
|0.9823909475371032|
| 2.527356725983328|
|1.1503602272879747|
|0.5840304176284861|
+------------------+
only showing top 20 rows


In [3]:
#check null values.
descriptionCVR = renamedDf.describe()
descriptionCVR.select("summary").show()
descriptionCVR.filter(F.col("summary")=="count").show()


+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+

+-------+---------+------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+-----------+-----------------+------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+----------------+
|summary|cvrNummer| label|AarsVaerk_1|AarsVaerk_2|AarsVaerk_3|AarsVaerk_4|AarsVaerk_5|AarsVaerk_6|AarsVaerk_7|AarsVaerk_8|AarsVaerk_9|AarsVaerk_10|AarsVaerk_11|AarsVaerk_12|AarsVaerk_13|AarsVaerk_14|AarsVaerk_15|medArb_1|medArb_2|medArb_3|medArb_4|medArb_5|medArb_6|medArb_7|medArb_8|medArb_9|medArb_10|medArb_11|medArb_12|medArb_13|medArb_14|medArb_15|avgVarighed|totalAabneEnheder|totalLukketEnheder|vaerdiSlope_1|vaerdiSlope_2|vaerdiSlope_3|vaerdiSlope_4|vaerdiSlope_5|vaerdiSlope_6|vaerdiSlope_7|reklamebeskyttet|
+-------+---------+------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+-----------+-----------------+------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+----------------+
|  count|   317827|317827|     317827|     269143|     210742|     174464|     143649|     118501|      99374|      84004|      69626|       59566|       50753|       43772|       36725|       28263|       22737|  317827|  269143|  210742|  174464|  143649|  118501|   99374|   84004|   69626|    59566|    50753|    43772|    36725|    28263|    22737|     234873|           317827|            317827|       121067|        45698|        19630|         9925|         5622|         3499|         2357|          317827|
+-------+---------+------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+------------+------------+------------+------------+------------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+-----------+-----------------+------------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+----------------+


In [4]:
#check mean and stddev
descriptionCVR.filter((F.col("summary") =="mean") | (F.col("summary") =="stddev")).show()


+-------+-------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+-------------------+-----------------+------------------+-----------------+-------------------+------------------+------------------+------------------+-------------------+
|summary|          cvrNummer|             label|       AarsVaerk_1|       AarsVaerk_2|      AarsVaerk_3|      AarsVaerk_4|       AarsVaerk_5|       AarsVaerk_6|      AarsVaerk_7|       AarsVaerk_8|       AarsVaerk_9|      AarsVaerk_10|      AarsVaerk_11|      AarsVaerk_12|      AarsVaerk_13|      AarsVaerk_14|      AarsVaerk_15|          medArb_1|         medArb_2|         medArb_3|         medArb_4|          medArb_5|          medArb_6|          medArb_7|          medArb_8|          medArb_9|         medArb_10|         medArb_11|        medArb_12|         medArb_13|         medArb_14|        medArb_15|       avgVarighed| totalAabneEnheder| totalLukketEnheder|    vaerdiSlope_1|     vaerdiSlope_2|    vaerdiSlope_3|      vaerdiSlope_4|     vaerdiSlope_5|     vaerdiSlope_6|     vaerdiSlope_7|   reklamebeskyttet|
+-------+-------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+-------------------+-----------------+------------------+-----------------+-------------------+------------------+------------------+------------------+-------------------+
|   mean|2.992587977176892E7|0.6770318443681625| 4.963124592938926| 6.599220488736471|7.883416689601503|9.083146093176817| 9.771213165424054|10.298917308714694|10.77020146114678|11.333662682729393|11.979418607991267|12.565540744720142|13.190747344984533|13.685712327515306|14.092008168822328|15.306301524961965|15.918722786647315| 5.503519210136332|8.213265810368465|9.818617076804813|11.16323711482025|12.375860604668324|12.991983190015274|13.533087125405036|13.999154802152278|14.809654439433546|15.498606587650674| 16.10517604870648|16.86322306497304|17.443267528931244|18.770017337154584|19.24664643532568|3104.0394283223163|1.1975634543320737|0.42037334776466445|9367.222088225697|14773.935940526042|37270.54804289701|  534722.7367923224| 69479.75111113006| 79309.96883183606|246815.61845150983|0.05565606446274231|
| stddev|1.595872029695645E7|0.4815407404508707|28.219409782015937|31.349174803319965|36.38636236195736|40.84287153372559|43.053166612742295| 43.50716762822306|44.69115651768418| 46.24885171030284| 48.03721081702004|49.663482624596746|   50.952745810432|51.757992157941516| 51.79592907581495|57.952550432313735|59.896979275308205|31.469466071930402| 35.7271436211818|40.94583736584511| 45.5828092120186| 48.68024770176678| 49.50427185113245|50.816539411344706| 52.01619572491007|54.732773025784745|  56.2466055958348|56.478429021321915|58.18250553629226| 59.22942233650709| 64.22850620145455|64.30084551221002| 2826.832782887831| 4.297482547820878|  2.316930888416194|1104747.513966895|485532.34538503713|947528.7880333586|4.535843433554792E7|1192985.3794908465|1020811.2995945786| 4224380.725125994|0.22925669525652828|
+-------+-------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+------------------+-------------------+-----------------+------------------+-----------------+-------------------+------------------+------------------+------------------+-------------------+


In [5]:
windowSpecRank =(Window.partitionBy(F.col("cvrNummer"))).orderBy(F.col("gyldigFra").desc())

groupCols = ["cvrNummer","vaerdi"]

companyNameDf = (sqlContext
                 .read
                 .parquet("/home/svanhmic/workspace/Python/Erhvervs/data/cdata/"+"companyCvrData")
                 .withColumn(colName="rank",col=F.rank().over(windowSpecRank))
                 .filter((F.col("rank")==1) & (F.col("sekvensnr")==0))
                 .select([F.col(i) for i in groupCols])
                 .withColumnRenamed(existing="vaerdi",new="navn")
                 .orderBy(F.col("cvrNummer"))
                 .cache()
                )
companyNameDf.show(2)


+---------+--------------------+
|cvrNummer|                navn|
+---------+--------------------+
| 10000009|              YELLOW|
| 10000025|WATERFRONT CONNEC...|
+---------+--------------------+
only showing top 2 rows


In [6]:
#take ln(x+1) of features

labelCols = ["cvrNummer","label","status"]
logFeatCols = [i for i in renamedDf.columns if i not in labelCols]
#print(logFeatCols)
mininum = descriptionCVR.filter(F.col("summary")=="min").collect()[0]
#print(mininum)


logDf = (renamedDf
         .select([F.col("cvrNummer"),F.col("label")]+[F.log1p(F.col(i)-F.lit(mininum[i])).alias(i) for i in logFeatCols])
         .na
         .fill(0.0,logFeatCols)
         
        )
#logDf.show(2)


#First convert features to vetor
toDenseUDf = F.udf(lambda x: Vectors.dense(x.toArray()),VectorUDT())
vectorizer = VectorAssembler(inputCols=logFeatCols,outputCol="features")

rawVectorDataDf = (vectorizer.transform(renamedDf                                                                               
                                        .join(companyNameDf,(companyNameDf["cvrNummer"]==renamedDf["cvrNummer"]),"inner")
                                        .drop(companyNameDf["cvrNummer"])
                                        #.select(*logColsSelected) 
                                        .na
                                        .fill(0.0,logFeatCols)
                                        .distinct()
                                       )
                   .select(["navn"]+labelCols+[toDenseUDf(vectorizer.getOutputCol()).alias(vectorizer.getOutputCol())])
                  )

standardScale = StandardScaler(withMean=True,withStd=True,inputCol=vectorizer.getOutputCol(),outputCol="scaledFeatures")
standardScaleModel = standardScale.fit(rawVectorDataDf)
scaledFeaturesDf = (standardScaleModel
                    .transform(rawVectorDataDf)
                    .drop("features")
                    .withColumnRenamed(existing="scaledFeatures",new="features")
                   )

scaledFeaturesDf.show()


+--------------------+---------+-----+--------------------+--------------------+
|                navn|cvrNummer|label|              status|            features|
+--------------------+---------+-----+--------------------+--------------------+
|   EMPAKA KARTONNAGE| 10016533|    1|            [NORMAL]|[0.12721405809989...|
|      DET GAMLE GODS| 10016606|    1|            [NORMAL]|[-0.1483813281958...|
|               DIXEN| 10018064|    0|[OPLØST EFTER KON...|[-0.1483813281958...|
|        TRELBORG VVS| 10063760|    1|            [NORMAL]|[-0.1483813281958...|
|            CYBERSUN| 10065917|    0|[OPLØST EFTER KON...|[-0.0258944898421...|
|                HME2| 10080207|    1|            [NORMAL]|[-0.1483813281958...|
|TØMRERFIRMAET HER...| 10082528|    1|            [NORMAL]|[-0.1177596186074...|
|    MØRKHOLT VINDUER| 10096227|    1|            [NORMAL]|[-0.1483813281958...|
|                KISØ| 10108993|    1|            [NORMAL]|[-0.1177596186074...|
|               C-CUT| 10117267|    1|[OPLØST EFTER ERK...|[-0.1483813281958...|
|          HLS INVEST| 10120829|    0|[OPLØST EFTER KON...|[-0.1483813281958...|
|       ELKA RAINWEAR| 10138698|    1|            [NORMAL]|[0.12721405809989...|
|              BØMLER| 10142539|    0|[OPLØST EFTER KON...|[-0.1483813281958...|
|     NORDISKE MEDIER| 10150825|    1|            [NORMAL]|[-0.1177596186074...|
|  SBR POUL-ERIK KJÆR| 10178509|    0|[OPLØST EFTER KON...|[-0.1483813281958...|
|         SCAN ORIENT| 10228476|    1|[TVANGSOPLØST, UN...|[-0.1483813281958...|
|           AQUA-NORD| 10310989|    1|[OPLØST EFTER FRI...|[-0.1177596186074...|
|        IKAST ETIKET| 10402980|    1|            [NORMAL]|[-0.1177596186074...|
|BRDR. JØRGENSEN C...| 10518431|    1|            [NORMAL]|[0.43343115398404...|
|         AXEL MADSEN| 10553490|    1|            [NORMAL]|[-0.0258944898421...|
+--------------------+---------+-----+--------------------+--------------------+
only showing top 20 rows


In [14]:
#put them into a feature vecto
vectorizedTestDf = scaledFeaturesDf.filter(F.col("label") <= 1).sampleBy("label", fractions={0: 0.2, 1: 0.2}, seed=42)
vectorizedTestDf.groupBy("label").count().show()

scaledCvrDf = scaledFeaturesDf.select(F.col("cvrNummer"))
cvrTestDf = vectorizedTestDf.select("cvrNummer")
cvrTrainDf = scaledCvrDf.subtract(cvrTestDf) #take the other partion as training set

vectorizedTrainDf = (scaledFeaturesDf
                     .filter(F.col("label") <= 1)
                     .join(cvrTrainDf,(scaledFeaturesDf["cvrNummer"] == cvrTrainDf["cvrNummer"]),"inner")
                     .drop(cvrTrainDf["cvrNummer"])
                    )
vectorizedTrainDf.groupBy("label").count().show()
print("Number of data points: "+str(scaledFeaturesDf.count()))
print("Number of data points train: "+str(vectorizedTrainDf.select("cvrNummer").count()))
print("Number of data points test: "+str(vectorizedTestDf.select("cvrNummer").count()))
#vectorizedTrainDf.printSchema()
#print(vectorizedTrainDf.first())


+-----+-----+
|label|count|
+-----+-----+
|    0| 6078|
|    1|25756|
+-----+-----+

+-----+------+
|label| count|
+-----+------+
|    0| 24465|
|    1|103472|
+-----+------+

Number of data points: 160648
Number of data points train: 127937
Number of data points test: 31834

In [15]:
vectorizedTrainDf.show()


+--------------------+-----+--------------------+--------------------+---------+
|                navn|label|              status|            features|cvrNummer|
+--------------------+-----+--------------------+--------------------+---------+
|SKYTTENS HANDEL O...|    1|[OPLØST EFTER FUS...|[-0.1483813281958...| 10019052|
|            DIKI.NET|    0|[OPLØST EFTER KON...|[-0.1483813281958...| 10026113|
|                CTEK|    1|[OPLØST EFTER FUS...|[-0.1177596186074...| 10040523|
|      VG ENTREPRENØR|    1|            [NORMAL]|[0.43343115398404...| 10057426|
|NORDBYENS OLIEFYR...|    1|            [NORMAL]|[-0.1483813281958...| 10089514|
|EXPRESS LABELLING...|    1|[TVANGSOPLØST, UN...|[-0.1483813281958...| 10091713|
|PSYKOLOGERNE VED ...|    1|[OPLØST EFTER FUS...|[-0.1790030377842...| 10108624|
|         RAH HOLDING|    1|            [NORMAL]|[-0.1483813281958...| 10127351|
|AMAGER BROLÆGGERF...|    0|[OPLØST EFTER KON...|[-0.1177596186074...| 10128587|
|        ART OF JEWEL|    1|            [NORMAL]|[-0.1177596186074...| 10145619|
|    FRONT FREDERICIA|    1|[OPLØST EFTER FUS...|[0.12721405809989...| 10156521|
|M.L.R. AF 16. APR...|    1|[OPLØST EFTER ERK...|[-0.1177596186074...| 10200288|
| FOLKE LARSENS EFTF.|    1|            [NORMAL]|[-0.0258944898421...| 10351235|
|               TAORA|    1|[OPLØST EFTER FUS...|[-0.1790030377842...| 10366380|
|     CH AF 24/8 2000|    1|[OPLØST EFTER FRI...|[0.12721405809989...| 10412986|
|            SIBA-ØST|    1|[OPLØST EFTER FRI...|[-0.1483813281958...| 10481171|
|         STEVNS TRYK|    1|[OPLØST EFTER ERK...|[-0.1483813281958...| 10577675|
|  PERSPEKTIV HOLDING|    1|            [NORMAL]|[-0.1483813281958...| 10606683|
|     INTRUM JUSTITIA|    1|            [NORMAL]|[1.35208244163648...| 10613779|
|         LOBO MØBLER|    1|            [NORMAL]|[-0.1177596186074...| 10629098|
+--------------------+-----+--------------------+--------------------+---------+
only showing top 20 rows


In [16]:
#Train the logistic regressionmodel
lr = LogisticRegression()
grid = (ParamGridBuilder()
        .baseOn({lr.predictionCol:"prediction"})
        .baseOn({lr.rawPredictionCol:"rawPrediction"})
        .baseOn({lr.probabilityCol:"probability"})
        .baseOn({lr.labelCol:"label"})
        .baseOn({lr.featuresCol:"features"})
        .addGrid(param=lr.elasticNetParam,values=[0.1,1.0])
        .addGrid(param=lr.getMaxIter,values=[10])
        .build()
       )
evaluate = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
crossVal = CrossValidator(estimator=lr,estimatorParamMaps=grid,evaluator=evaluate,numFolds=10)

crossValModel = crossVal.fit(dataset=vectorizedTrainDf)
evaluate.evaluate(crossValModel.transform(vectorizedTestDf))
#coef = lrModel.coefficients


Out[16]:
0.8090805096973791

In [17]:
bestModel = crossValModel.bestModel

In [18]:
#test the values
result = bestModel.transform(vectorizedTestDf)

In [12]:
#

In [19]:
#result.orderBy("prediction").show(100)
confCols = [F.col(i) for i in ["TP","TN","FP","FN"]]


csCols = [F.when((F.col("label")==1) & (F.col("difference") == 0),"TP")
          ,F.when((F.col("label")==0) & (F.col("difference") == 0),"TN")
          ,F.when(F.col("difference") == 1,"FN")
          ,F.when(F.col("difference") == -1,"FP")
         ]

confusionDf = result.select(F.col("label"),F.col("prediction"),(F.col("label")-F.col("prediction")).alias("difference"))
(confusionDf
 .select(F.coalesce(*csCols).alias("cases")  
         #,.otherwise(0).alias("FP")
         #,.otherwise(0).alias("FN")
        )
 .groupBy("cases").count()
).show()


+-----+-----+
|cases|count|
+-----+-----+
|   TP|25710|
|   TN|   36|
|   FN|   46|
|   FP| 6042|
+-----+-----+


In [17]:
crossValModel.bestModel.hasSummary


Out[17]:
True

In [20]:
summary = crossValModel.bestModel.summary

In [24]:
summary.predictions.show()


+--------------------+-----+--------------------+--------------------+---------+--------------------+--------------------+----------+
|                navn|label|              status|            features|cvrNummer|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+---------+--------------------+--------------------+----------+
|SKYTTENS HANDEL O...|    1|[OPLØST EFTER FUS...|[-0.1483813281958...| 10019052|[-0.1436615979228...|[0.46414624375980...|       1.0|
|            DIKI.NET|    0|[OPLØST EFTER KON...|[-0.1483813281958...| 10026113|[-0.2470744652013...|[0.43854370342953...|       1.0|
|                CTEK|    1|[OPLØST EFTER FUS...|[-0.1177596186074...| 10040523|[-0.6803097893868...|[0.33619216432178...|       1.0|
|      VG ENTREPRENØR|    1|            [NORMAL]|[0.43343115398404...| 10057426|[-3.7227933151943...|[0.02359613625108...|       1.0|
|NORDBYENS OLIEFYR...|    1|            [NORMAL]|[-0.1483813281958...| 10089514|[-2.8801667669252...|[0.05314274429565...|       1.0|
|EXPRESS LABELLING...|    1|[TVANGSOPLØST, UN...|[-0.1483813281958...| 10091713|[-0.1715003567869...|[0.45722969080531...|       1.0|
|PSYKOLOGERNE VED ...|    1|[OPLØST EFTER FUS...|[-0.1790030377842...| 10108624|[-0.6485674484631...|[0.34331243211346...|       1.0|
|         RAH HOLDING|    1|            [NORMAL]|[-0.1483813281958...| 10127351|[-2.8804422421872...|[0.05312888447426...|       1.0|
|AMAGER BROLÆGGERF...|    0|[OPLØST EFTER KON...|[-0.1177596186074...| 10128587|[-1.1369952588026...|[0.24287246215155...|       1.0|
|        ART OF JEWEL|    1|            [NORMAL]|[-0.1177596186074...| 10145619|[-6.3724232788323...|[0.00170510286780...|       1.0|
|    FRONT FREDERICIA|    1|[OPLØST EFTER FUS...|[0.12721405809989...| 10156521|[-0.3793527083137...|[0.40628302543127...|       1.0|
|M.L.R. AF 16. APR...|    1|[OPLØST EFTER ERK...|[-0.1177596186074...| 10200288|[-1.4990290785923...|[0.18257037794247...|       1.0|
| FOLKE LARSENS EFTF.|    1|            [NORMAL]|[-0.0258944898421...| 10351235|[-3.6327245272056...|[0.02576276706084...|       1.0|
|               TAORA|    1|[OPLØST EFTER FUS...|[-0.1790030377842...| 10366380|[-0.9451119361175...|[0.27986891278054...|       1.0|
|     CH AF 24/8 2000|    1|[OPLØST EFTER FRI...|[0.12721405809989...| 10412986|[-0.5220910962423...|[0.37236339559707...|       1.0|
|            SIBA-ØST|    1|[OPLØST EFTER FRI...|[-0.1483813281958...| 10481171|[-0.6766429736895...|[0.33701096745397...|       1.0|
|         STEVNS TRYK|    1|[OPLØST EFTER ERK...|[-0.1483813281958...| 10577675|[-0.9080179077065...|[0.28740560581535...|       1.0|
|  PERSPEKTIV HOLDING|    1|            [NORMAL]|[-0.1483813281958...| 10606683|[-4.1379135688213...|[0.01570550931538...|       1.0|
|     INTRUM JUSTITIA|    1|            [NORMAL]|[1.35208244163648...| 10613779|[-2.8641158595488...|[0.05395622053629...|       1.0|
|         LOBO MØBLER|    1|            [NORMAL]|[-0.1177596186074...| 10629098|[-3.4536192522129...|[0.03066110863007...|       1.0|
+--------------------+-----+--------------------+--------------------+---------+--------------------+--------------------+----------+
only showing top 20 rows


In [ ]:


In [ ]: