In [ ]:
def parseRaw(json_map):
    """Extract the (url, html) pair from one decoded crawl record.

    json_map: dict parsed from a JSON line; must contain the keys
    'url' and 'html' (KeyError otherwise).
    """
    return (json_map['url'], json_map['html'])
In [ ]:
## getContent: for an input article's HTML, get its own word list via jieba.cut()
def getContent(x):
    """Strip HTML from x, drop whitespace, and segment into CJK terms.

    Returns the list of jieba tokens that are longer than one character
    and consist purely of CJK ideographs (see checkword).
    """
    from bs4 import BeautifulSoup
    # Name the parser explicitly: bs4 otherwise picks whatever is installed
    # (and warns), which can change tokenization between environments.
    soup = BeautifulSoup(x, 'html.parser')
    text = soup.getText().replace('\n', '').replace('\r', '').replace(' ', '').replace('\t', '')
    import jieba
    return [term for term in jieba.cut(text) if len(term) > 1 and checkword(term)]
def checkword(x):
    """Return True when every character of x lies in the CJK Unified
    Ideographs range (U+4E00..U+9FFF). Vacuously True for empty input."""
    for ch in x:
        if ch < u'\u4e00' or ch > u'\u9fff':
            return False
    return True
In [ ]:
import json
## Load the crawled pages: each input line is a JSON record holding
## 'url' and 'html'; map to (url, html) pairs via parseRaw.
## NOTE(review): `sc` is the SparkContext supplied by the notebook driver.
travel_content = sc.textFile("./pixnet.txt").map(json.loads).map(parseRaw)
makeup_content = sc.textFile("./makeup.txt").map(json.loads).map(parseRaw)
In [ ]:
## Get terms for all of the training data: segment every article's HTML
## into CJK tokens, then collect the distinct vocabulary of both corpora.
tr_terms = travel_content.map(lambda x : getContent(x[1])).flatMap(lambda x : x)
mk_terms = makeup_content.map(lambda x : getContent(x[1])).flatMap(lambda x : x)
all_terms = tr_terms.union(mk_terms).distinct().collect()
## all_terms_map is the mapping of (term : index) for the whole training set.
## enumerate replaces the original hand-rolled `index` counter loop.
all_terms_map = {term: i for i, term in enumerate(all_terms)}
all_terms_map
In [ ]:
from pyspark.mllib.linalg import Vectors ,SparseVector
from pyspark.mllib.regression import LabeledPoint
def mapFeature(terms, terms_map=None):
    """Build a bag-of-words frequency dict {feature_index: count}.

    terms: iterable of tokens; tokens absent from the vocabulary are skipped.
    terms_map: optional {term: index} vocabulary. Defaults to the
        module-level all_terms_map built from the training corpora, so
        existing one-argument callers are unaffected.
    """
    if terms_map is None:
        terms_map = all_terms_map
    fs = dict()
    for term in terms:
        if term not in terms_map:
            continue
        index = terms_map[term]
        # dict.get collapses the original if/else counting branches.
        fs[index] = fs.get(index, 0) + 1
    return fs
def buildFeature(label, terms):
    """Turn a token list into a LabeledPoint whose sparse feature vector
    spans the whole training vocabulary (all_terms_map)."""
    vec = SparseVector(len(all_terms_map), mapFeature(terms))
    return LabeledPoint(label, vec)
## Produce LabeledPoint for both training data sets:
## label 0 = travel articles, label 1 = makeup articles.
tr_fs = travel_content.map(lambda x : buildFeature(0, getContent(x[1])))
mk_fs = makeup_content.map(lambda x : buildFeature(1, getContent(x[1])))
In [ ]:
## Combine both labeled corpora into the single training RDD.
all_fs = tr_fs.union(mk_fs)
## Verify total article number (travel + makeup) before training.
all_fs.count()
In [ ]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
## Train a Naive Bayes classifier on the labeled term-frequency vectors;
## the second argument (1.0) is the additive-smoothing parameter.
model = NaiveBayes.train(all_fs, 1.0)
In [ ]:
import jieba
doc = jieba.cut("我想要去馬來西亞來去旅遊")
## Create SparseVector type testing data
f = SparseVector(len(all_terms_map),mapFeature(doc))
## Make a prediction
if model.predict(f) ==1 :
print "這是美妝類"
else :
print "這是旅遊類"
In [ ]:
doc = list(jieba.cut("我想要買化妝品,且變漂亮"))
f = SparseVector(len(all_terms_map),mapFeature(doc))
if model.predict(f) ==1 :
print "這是美妝類"
else :
print "這是旅遊類"
In [ ]: