notebook.community

Edit and run



In [1]:

    
%matplotlib inline

# Enable Retina mode for higher-res on Macbook's with Retina Display
%config InlineBackend.figure_format = 'retina'



In [2]:

    
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt

import seaborn as sns
from pyspark.sql import SQLContext
from pyspark.sql.types import *



In [3]:

    
itemsDF = sqlContext.read.format("json") \
            .load("file:/root/pipeline/myapps/html/advancedspark.com/json/software.json") \
            .select("id", "title", "category", "description")



In [4]:

    
import pyspark.sql.functions as func

categoriesDF = itemsDF.select("category") \
                 .groupBy("category") \
                 .count() \
                 .orderBy("count", ascending=False) \
                 .filter("count > 10") \
                 .toPandas()

categoriesDF









    Out[4]:






  
    
      
      category
      count
    
  
  
    
      0
      Library
      15
    
    
      1
      Database
      12
    
    
      2
      Data Processing
      11



In [5]:

    
sns.set_style("whitegrid")
plot = sns.barplot(x="category", y="count", data=categoriesDF)



In [6]:

    
from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer(inputCol = "description", \
                           outputCol="words", \
                           gaps=False, \
                           pattern="\\p{L}+")
tokenizer









    Out[6]:





RegexTokenizer_4824b0ee97579ac11a71



In [7]:

    
from pyspark.ml.feature import StopWordsRemover

stopWordsFilter = StopWordsRemover(inputCol = "words", \
                                   outputCol = "filteredWords", \
                                   caseSensitive = False)
stopWordsFilter









    Out[7]:





StopWordsRemover_49e08ec5b8fe56255763



In [8]:

    
from pyspark.ml.feature import HashingTF

tf = HashingTF(inputCol = "filteredWords", \
               outputCol = "tfFeatures")
                                   
tf









    Out[8]:





HashingTF_4eb9936b910c2f5adb58



In [9]:

    
from pyspark.ml.feature import IDF

idf = IDF(inputCol = "tfFeatures", \
          outputCol = "idfFeatures")
                                   
idf









    Out[9]:





IDF_47eab0caf47f6c97534a



In [10]:

    
from pyspark.ml.feature import StringIndexer

categoryIndexer = StringIndexer(inputCol = "category", \
                                outputCol = "indexedCategory") 

categoryIndexerModel = categoryIndexer.fit(itemsDF)

categoryIndexerModel









    Out[10]:





StringIndexer_45f4a10a18894adcf517



In [11]:

    
from pyspark.ml.classification import RandomForestClassifier

classifier = RandomForestClassifier(featuresCol = "idfFeatures", \
                                    labelCol = "indexedCategory", \
                                    predictionCol = "prediction", \
                                    rawPredictionCol = "confidence", \
                                    probabilityCol = "probability")
classifier









    Out[11]:





RandomForestClassifier_41cc86e845efe481d6a6



In [12]:

    
from pyspark.ml.feature import IndexToString

categoryReverseIndexer = IndexToString(inputCol = "prediction", \
                                       outputCol = "predictedCategory", \
                                       labels = categoryIndexerModel.labels)

categoryReverseIndexer









    Out[12]:





IndexToString_45b0a3eb14e16e443300



In [13]:

    
from pyspark.ml import Pipeline

pipeline = Pipeline(stages = [tokenizer, stopWordsFilter, tf, idf, categoryIndexer, classifier, categoryReverseIndexer])

pipeline









    Out[13]:





Pipeline_496bba6ea793a2c9ad35



In [14]:

    
# TODO:  Implement Cross Validation and Grid Search



In [15]:

    
pipelineModel = pipeline.fit(itemsDF)



In [16]:

    
predictionsDF = pipelineModel.transform(itemsDF)

predictionsDF.select("title", "category", "predictedCategory").toPandas()









    Out[16]:






  
    
      
      title
      category
      predictedCategory
    
  
  
    
      0
      Apache Cassandra
      Database
      Database
    
    
      1
      Tachyon
      Distributed Cache
      Library
    
    
      2
      Apache Ambari
      Cluster Provision
      Library
    
    
      3
      Docker
      Container
      Library
    
    
      4
      Microsft Azure
      Cloud Provider
      Database
    
    
      5
      Apache Flink
      Data Processing
      Library
    
    
      6
      Apache Spark
      Data Processing
      Library
    
    
      7
      Apache Flume
      Library
      Library
    
    
      8
      Apache Giraph
      Library
      Library
    
    
      9
      Apache HDFS
      File System
      Library
    
    
      10
      Apache YARN
      Cluster Resource Manager
      Library
    
    
      11
      Apache HBase
      Database
      Library
    
    
      12
      Apache MapReduce
      Data Processing
      Library
    
    
      13
      Apache Hive
      Data Processing
      Library
    
    
      14
      Hortonworks
      Distribution
      Library
    
    
      15
      Apache HUE
      UI
      Library
    
    
      16
      Apache Impala
      Data Processing
      Library
    
    
      17
      Apache Kafka
      Message Broker
      Library
    
    
      18
      Apache Lucene
      Library
      Library
    
    
      19
      Apache Solr
      Search Engine
      Library
    
    
      20
      ElasticSearch
      Search Engine
      Library
    
    
      21
      Apache Mahout
      Library
      Library
    
    
      22
      Apache Drill
      Data Processing
      Data Processing
    
    
      23
      Apache Mesos
      Cluster Resource Manager
      Library
    
    
      24
      Apache Parquet
      File Format
      Database
    
    
      25
      Apache ORC
      File Format
      Library
    
    
      26
      Apache Pig
      Data Processing
      Library
    
    
      27
      Apache ZooKeeper
      Distributed Coordinator
      Library
    
    
      28
      Stanford CoreNLP
      Library
      Library
    
    
      29
      Apache Tez
      Data Processing
      Library
    
    
      ...
      ...
      ...
      ...
    
    
      50
      Redis
      Distributed Cache
      Database
    
    
      51
      JSON
      File Format
      Database
    
    
      52
      XML
      File Format
      File Format
    
    
      53
      MongoDB
      Database
      Library
    
    
      54
      On-Premise
      Cloud Provider
      Library
    
    
      55
      MicroStrategy
      BI
      Library
    
    
      56
      Knime
      Workflow
      Library
    
    
      57
      Oracle
      Database
      Library
    
    
      58
      MySQL
      Database
      Library
    
    
      59
      Spark ML/MLlib
      Library
      Library
    
    
      60
      Spark Streaming
      Library
      Library
    
    
      61
      Spark SQL
      Library
      Library
    
    
      62
      CSV
      File Format
      Library
    
    
      63
      Deep Learning 4J
      Library
      Library
    
    
      64
      Redshift
      Database
      Library
    
    
      65
      Kinesis
      Library
      Library
    
    
      66
      DynamoDB
      Database
      Database
    
    
      67
      Spark GraphX
      Library
      Library
    
    
      68
      SQL Server
      Database
      Library
    
    
      69
      Elastic MapReduce
      Data Processing
      Library
    
    
      70
      Dato GraphLab Create
      Library
      Library
    
    
      71
      Memcached
      Distributed Cache
      Database
    
    
      72
      Neo4j
      Library
      Library
    
    
      73
      Postgres
      Database
      Library
    
    
      74
      Protobuffers
      File Format
      Library
    
    
      75
      S3
      File System
      Library
    
    
      76
      Tensor Flow
      Data Processing
      Library
    
    
      77
      Titan GraphDB
      Database
      Database
    
    
      78
      Teradata
      Database
      Library
    
    
      79
      Vertica
      Database
      Database
    
  

80 rows × 3 columns



In [19]:

    
# Save & load the Random Forest model
!rm -rf /tmp/spark/2.0.0/rf

pipelineModelPersistPath = "/tmp/spark/2.0.0/rf"

pipelineModel.save(pipelineModelPersistPath)



In [20]:

    
from pyspark.ml import PipelineModel

restoredPipelineModel = PipelineModel.load(pipelineModelPersistPath)



In [21]:

    
featureImportances = restoredPipelineModel.stages[5].featureImportances

featureImportances









    Out[21]:





SparseVector(262144, {14879: 0.0361, 21247: 0.0148, 26866: 0.016, 28190: 0.0115, 41989: 0.0161, 48935: 0.0812, 53718: 0.0338, 54750: 0.0284, 57388: 0.0495, 62736: 0.0833, 64841: 0.0254, 72667: 0.037, 76764: 0.0833, 83995: 0.0248, 85799: 0.011, 92854: 0.0191, 95547: 0.018, 104821: 0.0461, 139229: 0.0463, 148562: 0.0227, 156693: 0.0202, 164785: 0.0085, 206496: 0.0289, 219731: 0.0133, 226292: 0.0833, 227860: 0.0431, 239924: 0.0545, 242046: 0.0261, 252594: 0.0176})



In [22]:

    
restoredPipelineModel.stages[5].treeWeights









    Out[22]:





[1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0]



In [23]:

    
restoredPredictionsDF = restoredPipelineModel.transform(itemsDF)

restoredPredictionsDF.select("category", "prediction", "probability") \
  .toPandas()









    Out[23]:






  
    
      
      category
      prediction
      probability
    
  
  
    
      0
      Database
      1.0
      [0.14219656567, 0.251893973845, 0.129918564992...
    
    
      1
      Distributed Cache
      0.0
      [0.159849660798, 0.14040280374, 0.149576735468...
    
    
      2
      Cluster Provision
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      3
      Container
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      4
      Cloud Provider
      1.0
      [0.158599156562, 0.178679326532, 0.14811445229...
    
    
      5
      Data Processing
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      6
      Data Processing
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      7
      Library
      0.0
      [0.206260863213, 0.142993781436, 0.14992890719...
    
    
      8
      Library
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      9
      File System
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      10
      Cluster Resource Manager
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      11
      Database
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      12
      Data Processing
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      13
      Data Processing
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      14
      Distribution
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      15
      UI
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      16
      Data Processing
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      17
      Message Broker
      0.0
      [0.16319750491, 0.139583981186, 0.145130218055...
    
    
      18
      Library
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      19
      Search Engine
      0.0
      [0.167296967276, 0.160327708785, 0.14076193848...
    
    
      20
      Search Engine
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      21
      Library
      0.0
      [0.194675531444, 0.138910266427, 0.14733792949...
    
    
      22
      Data Processing
      2.0
      [0.157574745054, 0.167966597674, 0.17409527181...
    
    
      23
      Cluster Resource Manager
      0.0
      [0.168753060466, 0.131945092297, 0.1395746625,...
    
    
      24
      File Format
      1.0
      [0.157574745054, 0.192966597674, 0.14909527181...
    
    
      25
      File Format
      0.0
      [0.158624653757, 0.141064589153, 0.14516952623...
    
    
      26
      Data Processing
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      27
      Distributed Coordinator
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      28
      Library
      0.0
      [0.211904769868, 0.137842528195, 0.15016226933...
    
    
      29
      Data Processing
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      ...
      ...
      ...
      ...
    
    
      50
      Distributed Cache
      1.0
      [0.152905216353, 0.157763914851, 0.14124340213...
    
    
      51
      File Format
      1.0
      [0.154397753667, 0.181271377538, 0.13900459616...
    
    
      52
      File Format
      3.0
      [0.152905216353, 0.132763914851, 0.14124340213...
    
    
      53
      Database
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      54
      Cloud Provider
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      55
      BI
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      56
      Workflow
      0.0
      [0.150190918818, 0.132630854213, 0.13432615273...
    
    
      57
      Database
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      58
      Database
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      59
      Library
      0.0
      [0.194675531444, 0.138910266427, 0.14733792949...
    
    
      60
      Library
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      61
      Library
      0.0
      [0.21319750491, 0.139583981186, 0.145130218055...
    
    
      62
      File Format
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      63
      Library
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      64
      Database
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      65
      Library
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      66
      Database
      1.0
      [0.150858327143, 0.208265105137, 0.16588631659...
    
    
      67
      Library
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      68
      Database
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      69
      Data Processing
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      70
      Library
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      71
      Distributed Cache
      1.0
      [0.161342198111, 0.188910266427, 0.14733792949...
    
    
      72
      Library
      0.0
      [0.174241411721, 0.142966597674, 0.14909527181...
    
    
      73
      Database
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      74
      File Format
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      75
      File System
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      76
      Data Processing
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      77
      Database
      1.0
      [0.16319750491, 0.189583981186, 0.145130218055...
    
    
      78
      Database
      0.0
      [0.168058616022, 0.148611758964, 0.15554688472...
    
    
      79
      Database
      1.0
      [0.158599156562, 0.178679326532, 0.14811445229...
    
  

80 rows × 3 columns



In [ ]:

	title	category	predictedCategory
0	Apache Cassandra	Database	Database
1	Tachyon	Distributed Cache	Library
2	Apache Ambari	Cluster Provision	Library
3	Docker	Container	Library
4	Microsft Azure	Cloud Provider	Database
5	Apache Flink	Data Processing	Library
6	Apache Spark	Data Processing	Library
7	Apache Flume	Library	Library
8	Apache Giraph	Library	Library
9	Apache HDFS	File System	Library
10	Apache YARN	Cluster Resource Manager	Library
11	Apache HBase	Database	Library
12	Apache MapReduce	Data Processing	Library
13	Apache Hive	Data Processing	Library
14	Hortonworks	Distribution	Library
15	Apache HUE	UI	Library
16	Apache Impala	Data Processing	Library
17	Apache Kafka	Message Broker	Library
18	Apache Lucene	Library	Library
19	Apache Solr	Search Engine	Library
20	ElasticSearch	Search Engine	Library
21	Apache Mahout	Library	Library
22	Apache Drill	Data Processing	Data Processing
23	Apache Mesos	Cluster Resource Manager	Library
24	Apache Parquet	File Format	Database
25	Apache ORC	File Format	Library
26	Apache Pig	Data Processing	Library
27	Apache ZooKeeper	Distributed Coordinator	Library
28	Stanford CoreNLP	Library	Library
29	Apache Tez	Data Processing	Library
...	...	...	...
50	Redis	Distributed Cache	Database
51	JSON	File Format	Database
52	XML	File Format	File Format
53	MongoDB	Database	Library
54	On-Premise	Cloud Provider	Library
55	MicroStrategy	BI	Library
56	Knime	Workflow	Library
57	Oracle	Database	Library
58	MySQL	Database	Library
59	Spark ML/MLlib	Library	Library
60	Spark Streaming	Library	Library
61	Spark SQL	Library	Library
62	CSV	File Format	Library
63	Deep Learning 4J	Library	Library
64	Redshift	Database	Library
65	Kinesis	Library	Library
66	DynamoDB	Database	Database
67	Spark GraphX	Library	Library
68	SQL Server	Database	Library
69	Elastic MapReduce	Data Processing	Library
70	Dato GraphLab Create	Library	Library
71	Memcached	Distributed Cache	Database
72	Neo4j	Library	Library
73	Postgres	Database	Library
74	Protobuffers	File Format	Library
75	S3	File System	Library
76	Tensor Flow	Data Processing	Library
77	Titan GraphDB	Database	Database
78	Teradata	Database	Library
79	Vertica	Database	Database

	category	prediction	probability
0	Database	1.0	[0.14219656567, 0.251893973845, 0.129918564992...
1	Distributed Cache	0.0	[0.159849660798, 0.14040280374, 0.149576735468...
2	Cluster Provision	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
3	Container	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
4	Cloud Provider	1.0	[0.158599156562, 0.178679326532, 0.14811445229...
5	Data Processing	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
6	Data Processing	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
7	Library	0.0	[0.206260863213, 0.142993781436, 0.14992890719...
8	Library	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
9	File System	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
10	Cluster Resource Manager	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
11	Database	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
12	Data Processing	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
13	Data Processing	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
14	Distribution	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
15	UI	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
16	Data Processing	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
17	Message Broker	0.0	[0.16319750491, 0.139583981186, 0.145130218055...
18	Library	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
19	Search Engine	0.0	[0.167296967276, 0.160327708785, 0.14076193848...
20	Search Engine	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
21	Library	0.0	[0.194675531444, 0.138910266427, 0.14733792949...
22	Data Processing	2.0	[0.157574745054, 0.167966597674, 0.17409527181...
23	Cluster Resource Manager	0.0	[0.168753060466, 0.131945092297, 0.1395746625,...
24	File Format	1.0	[0.157574745054, 0.192966597674, 0.14909527181...
25	File Format	0.0	[0.158624653757, 0.141064589153, 0.14516952623...
26	Data Processing	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
27	Distributed Coordinator	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
28	Library	0.0	[0.211904769868, 0.137842528195, 0.15016226933...
29	Data Processing	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
...	...	...	...
50	Distributed Cache	1.0	[0.152905216353, 0.157763914851, 0.14124340213...
51	File Format	1.0	[0.154397753667, 0.181271377538, 0.13900459616...
52	File Format	3.0	[0.152905216353, 0.132763914851, 0.14124340213...
53	Database	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
54	Cloud Provider	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
55	BI	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
56	Workflow	0.0	[0.150190918818, 0.132630854213, 0.13432615273...
57	Database	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
58	Database	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
59	Library	0.0	[0.194675531444, 0.138910266427, 0.14733792949...
60	Library	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
61	Library	0.0	[0.21319750491, 0.139583981186, 0.145130218055...
62	File Format	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
63	Library	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
64	Database	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
65	Library	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
66	Database	1.0	[0.150858327143, 0.208265105137, 0.16588631659...
67	Library	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
68	Database	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
69	Data Processing	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
70	Library	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
71	Distributed Cache	1.0	[0.161342198111, 0.188910266427, 0.14733792949...
72	Library	0.0	[0.174241411721, 0.142966597674, 0.14909527181...
73	Database	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
74	File Format	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
75	File System	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
76	Data Processing	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
77	Database	1.0	[0.16319750491, 0.189583981186, 0.145130218055...
78	Database	0.0	[0.168058616022, 0.148611758964, 0.15554688472...
79	Database	1.0	[0.158599156562, 0.178679326532, 0.14811445229...