In [1]:
%matplotlib inline

# Enable Retina mode for higher-res on Macbook's with Retina Display
%config InlineBackend.figure_format = 'retina'

In [2]:
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt

import seaborn as sns
from pyspark.sql import SQLContext
from pyspark.sql.types import *

In [3]:
itemsDF = sqlContext.read.format("json") \
            .load("file:/root/pipeline/myapps/html/advancedspark.com/json/software.json") \
            .select("id", "title", "category", "description")

In [4]:
import pyspark.sql.functions as func

categoriesDF = itemsDF.select("category") \
                 .groupBy("category") \
                 .count() \
                 .orderBy("count", ascending=False) \
                 .filter("count > 10") \
                 .toPandas()

categoriesDF


Out[4]:
category count
0 Library 15
1 Database 12
2 Data Processing 11

In [5]:
sns.set_style("whitegrid")
plot = sns.barplot(x="category", y="count", data=categoriesDF)



In [6]:
from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer(inputCol = "description", \
                           outputCol="words", \
                           gaps=False, \
                           pattern="\\p{L}+")
tokenizer


Out[6]:
RegexTokenizer_4824b0ee97579ac11a71

In [7]:
from pyspark.ml.feature import StopWordsRemover

stopWordsFilter = StopWordsRemover(inputCol = "words", \
                                   outputCol = "filteredWords", \
                                   caseSensitive = False)
stopWordsFilter


Out[7]:
StopWordsRemover_49e08ec5b8fe56255763

In [8]:
from pyspark.ml.feature import HashingTF

tf = HashingTF(inputCol = "filteredWords", \
               outputCol = "tfFeatures")
                                   
tf


Out[8]:
HashingTF_4eb9936b910c2f5adb58

In [9]:
from pyspark.ml.feature import IDF

idf = IDF(inputCol = "tfFeatures", \
          outputCol = "idfFeatures")
                                   
idf


Out[9]:
IDF_47eab0caf47f6c97534a

In [10]:
from pyspark.ml.feature import StringIndexer

categoryIndexer = StringIndexer(inputCol = "category", \
                                outputCol = "indexedCategory") 

categoryIndexerModel = categoryIndexer.fit(itemsDF)

categoryIndexerModel


Out[10]:
StringIndexer_45f4a10a18894adcf517

In [11]:
from pyspark.ml.classification import RandomForestClassifier

classifier = RandomForestClassifier(featuresCol = "idfFeatures", \
                                    labelCol = "indexedCategory", \
                                    predictionCol = "prediction", \
                                    rawPredictionCol = "confidence", \
                                    probabilityCol = "probability")
classifier


Out[11]:
RandomForestClassifier_41cc86e845efe481d6a6

In [12]:
from pyspark.ml.feature import IndexToString

categoryReverseIndexer = IndexToString(inputCol = "prediction", \
                                       outputCol = "predictedCategory", \
                                       labels = categoryIndexerModel.labels)

categoryReverseIndexer


Out[12]:
IndexToString_45b0a3eb14e16e443300

In [13]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages = [tokenizer, stopWordsFilter, tf, idf, categoryIndexer, classifier, categoryReverseIndexer])

pipeline


Out[13]:
Pipeline_496bba6ea793a2c9ad35

In [14]:
# TODO:  Implement Cross Validation and Grid Search

In [15]:
pipelineModel = pipeline.fit(itemsDF)

In [16]:
predictionsDF = pipelineModel.transform(itemsDF)

predictionsDF.select("title", "category", "predictedCategory").toPandas()


Out[16]:
title category predictedCategory
0 Apache Cassandra Database Database
1 Tachyon Distributed Cache Library
2 Apache Ambari Cluster Provision Library
3 Docker Container Library
4 Microsft Azure Cloud Provider Database
5 Apache Flink Data Processing Library
6 Apache Spark Data Processing Library
7 Apache Flume Library Library
8 Apache Giraph Library Library
9 Apache HDFS File System Library
10 Apache YARN Cluster Resource Manager Library
11 Apache HBase Database Library
12 Apache MapReduce Data Processing Library
13 Apache Hive Data Processing Library
14 Hortonworks Distribution Library
15 Apache HUE UI Library
16 Apache Impala Data Processing Library
17 Apache Kafka Message Broker Library
18 Apache Lucene Library Library
19 Apache Solr Search Engine Library
20 ElasticSearch Search Engine Library
21 Apache Mahout Library Library
22 Apache Drill Data Processing Data Processing
23 Apache Mesos Cluster Resource Manager Library
24 Apache Parquet File Format Database
25 Apache ORC File Format Library
26 Apache Pig Data Processing Library
27 Apache ZooKeeper Distributed Coordinator Library
28 Stanford CoreNLP Library Library
29 Apache Tez Data Processing Library
... ... ... ...
50 Redis Distributed Cache Database
51 JSON File Format Database
52 XML File Format File Format
53 MongoDB Database Library
54 On-Premise Cloud Provider Library
55 MicroStrategy BI Library
56 Knime Workflow Library
57 Oracle Database Library
58 MySQL Database Library
59 Spark ML/MLlib Library Library
60 Spark Streaming Library Library
61 Spark SQL Library Library
62 CSV File Format Library
63 Deep Learning 4J Library Library
64 Redshift Database Library
65 Kinesis Library Library
66 DynamoDB Database Database
67 Spark GraphX Library Library
68 SQL Server Database Library
69 Elastic MapReduce Data Processing Library
70 Dato GraphLab Create Library Library
71 Memcached Distributed Cache Database
72 Neo4j Library Library
73 Postgres Database Library
74 Protobuffers File Format Library
75 S3 File System Library
76 Tensor Flow Data Processing Library
77 Titan GraphDB Database Database
78 Teradata Database Library
79 Vertica Database Database

80 rows × 3 columns


In [19]:
# Save & load the Random Forest model
!rm -rf /tmp/spark/2.0.0/rf

pipelineModelPersistPath = "/tmp/spark/2.0.0/rf"

pipelineModel.save(pipelineModelPersistPath)

In [20]:
from pyspark.ml import PipelineModel

restoredPipelineModel = PipelineModel.load(pipelineModelPersistPath)

In [21]:
featureImportances = restoredPipelineModel.stages[5].featureImportances

featureImportances


Out[21]:
SparseVector(262144, {14879: 0.0361, 21247: 0.0148, 26866: 0.016, 28190: 0.0115, 41989: 0.0161, 48935: 0.0812, 53718: 0.0338, 54750: 0.0284, 57388: 0.0495, 62736: 0.0833, 64841: 0.0254, 72667: 0.037, 76764: 0.0833, 83995: 0.0248, 85799: 0.011, 92854: 0.0191, 95547: 0.018, 104821: 0.0461, 139229: 0.0463, 148562: 0.0227, 156693: 0.0202, 164785: 0.0085, 206496: 0.0289, 219731: 0.0133, 226292: 0.0833, 227860: 0.0431, 239924: 0.0545, 242046: 0.0261, 252594: 0.0176})

In [22]:
restoredPipelineModel.stages[5].treeWeights


Out[22]:
[1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0]

In [23]:
restoredPredictionsDF = restoredPipelineModel.transform(itemsDF)

restoredPredictionsDF.select("category", "prediction", "probability") \
  .toPandas()


Out[23]:
category prediction probability
0 Database 1.0 [0.14219656567, 0.251893973845, 0.129918564992...
1 Distributed Cache 0.0 [0.159849660798, 0.14040280374, 0.149576735468...
2 Cluster Provision 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
3 Container 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
4 Cloud Provider 1.0 [0.158599156562, 0.178679326532, 0.14811445229...
5 Data Processing 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
6 Data Processing 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
7 Library 0.0 [0.206260863213, 0.142993781436, 0.14992890719...
8 Library 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
9 File System 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
10 Cluster Resource Manager 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
11 Database 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
12 Data Processing 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
13 Data Processing 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
14 Distribution 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
15 UI 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
16 Data Processing 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
17 Message Broker 0.0 [0.16319750491, 0.139583981186, 0.145130218055...
18 Library 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
19 Search Engine 0.0 [0.167296967276, 0.160327708785, 0.14076193848...
20 Search Engine 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
21 Library 0.0 [0.194675531444, 0.138910266427, 0.14733792949...
22 Data Processing 2.0 [0.157574745054, 0.167966597674, 0.17409527181...
23 Cluster Resource Manager 0.0 [0.168753060466, 0.131945092297, 0.1395746625,...
24 File Format 1.0 [0.157574745054, 0.192966597674, 0.14909527181...
25 File Format 0.0 [0.158624653757, 0.141064589153, 0.14516952623...
26 Data Processing 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
27 Distributed Coordinator 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
28 Library 0.0 [0.211904769868, 0.137842528195, 0.15016226933...
29 Data Processing 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
... ... ... ...
50 Distributed Cache 1.0 [0.152905216353, 0.157763914851, 0.14124340213...
51 File Format 1.0 [0.154397753667, 0.181271377538, 0.13900459616...
52 File Format 3.0 [0.152905216353, 0.132763914851, 0.14124340213...
53 Database 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
54 Cloud Provider 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
55 BI 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
56 Workflow 0.0 [0.150190918818, 0.132630854213, 0.13432615273...
57 Database 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
58 Database 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
59 Library 0.0 [0.194675531444, 0.138910266427, 0.14733792949...
60 Library 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
61 Library 0.0 [0.21319750491, 0.139583981186, 0.145130218055...
62 File Format 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
63 Library 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
64 Database 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
65 Library 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
66 Database 1.0 [0.150858327143, 0.208265105137, 0.16588631659...
67 Library 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
68 Database 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
69 Data Processing 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
70 Library 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
71 Distributed Cache 1.0 [0.161342198111, 0.188910266427, 0.14733792949...
72 Library 0.0 [0.174241411721, 0.142966597674, 0.14909527181...
73 Database 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
74 File Format 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
75 File System 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
76 Data Processing 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
77 Database 1.0 [0.16319750491, 0.189583981186, 0.145130218055...
78 Database 0.0 [0.168058616022, 0.148611758964, 0.15554688472...
79 Database 1.0 [0.158599156562, 0.178679326532, 0.14811445229...

80 rows × 3 columns


In [ ]: