In [21]:
import os

# SPARK_HOME points at the Spark installation on the cluster headnode
print os.environ['SPARK_HOME']
# Use forward slashes in the file:// URI, even on Windows
src = "file:///" + os.environ['SPARK_HOME'] + "/README.md"

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *

# Connect to the Spark master running on the headnode
sc = SparkContext('spark://headnodehost:7077', 'pyspark')

sqlContext = SQLContext(sc)


C:\apps\dist\spark-1.3.1.2.2.6.1-0012

In [22]:
# Split each line of the README into words, then count each word's occurrences
lines = sc.textFile(src)
words = lines.flatMap(lambda x: x.split(" "))
word_count = (
    words.map(lambda x: (x, 1))
         .reduceByKey(lambda x, y: x + y))
word_count.collect()


Out[22]:
[(u'', 75),
 (u'all', 1),
 (u'when', 1),
 (u'"local"', 1),
 (u'including', 3),
 (u'computation', 1),
 (u'Spark](#building-spark).', 1),
 (u'using:', 1),
 (u'guidance', 3),
 (u'Scala,', 1),
 (u'environment', 1),
 (u'only', 1),
 (u'rich', 1),
 (u'Apache', 1),
 (u'sc.parallelize(range(1000)).count()', 1),
 (u'Building', 1),
 (u'guide,', 1),
 (u'return', 2),
 (u'Please', 3),
 (u'Try', 1),
 (u'not', 1),
 (u'Spark', 14),
 (u'scala>', 1),
 (u'Note', 1),
 (u'cluster.', 1),
 (u'./bin/pyspark', 1),
 (u'have', 1),
 (u'params', 1),
 (u'through', 1),
 (u'GraphX', 1),
 (u'[run', 1),
 (u'abbreviated', 1),
 (u'[project', 2),
 (u'##', 8),
 (u'library', 1),
 (u'see', 1),
 (u'[Apache', 1),
 (u'will', 1),
 (u'#', 1),
 (u'processing,', 2),
 (u'for', 11),
 (u'[building', 1),
 (u'provides', 1),
 (u'print', 1),
 (u'supports', 2),
 (u'built,', 1),
 (u'[params]`.', 1),
 (u'available', 1),
 (u'run', 7),
 (u'This', 2),
 (u'Hadoop,', 2),
 (u'Tests', 1),
 (u'example:', 1),
 (u'-DskipTests', 1),
 (u'Maven](http://maven.apache.org/).', 1),
 (u'programming', 1),
 (u'running', 1),
 (u'against', 1),
 (u'site,', 1),
 (u'comes', 1),
 (u'package.', 1),
 (u'and', 10),
 (u'package.)', 1),
 (u'prefer', 1),
 (u'documentation,', 1),
 (u'submit', 1),
 (u'tools', 1),
 (u'use', 3),
 (u'from', 1),
 (u'For', 2),
 (u'fast', 1),
 (u'systems.', 1),
 (u'Version"](http://spark.apache.org/docs/latest/building-with-maven.html#specifying-the-hadoop-version)',
  1),
 (u'<http://spark.apache.org/>', 1),
 (u'Hadoop-supported', 1),
 (u'way', 1),
 (u'README', 1),
 (u'MASTER', 1),
 (u'engine', 1),
 (u'building', 3),
 (u'usage', 1),
 (u'Distributions"](http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html)',
  1),
 (u'instance:', 1),
 (u'with', 4),
 (u'protocols', 1),
 (u'And', 1),
 (u'this', 1),
 (u'setup', 1),
 (u'shell:', 2),
 (u'project', 1),
 (u'See', 1),
 (u'following', 2),
 (u'distribution', 1),
 (u'detailed', 2),
 (u'file', 1),
 (u'stream', 1),
 (u'is', 6),
 (u'higher-level', 1),
 (u'tests', 1),
 (u'1000:', 2),
 (u'sample', 1),
 (u'["Specifying', 1),
 (u'Alternatively,', 1),
 (u'./bin/run-example', 2),
 (u'need', 1),
 (u'You', 3),
 (u'instructions.', 1),
 (u'different', 1),
 (u'programs,', 1),
 (u'storage', 1),
 (u'same', 1),
 (u'machine', 1),
 (u'Running', 1),
 (u'which', 2),
 (u'you', 4),
 (u'A', 1),
 (u'About', 1),
 (u'sc.parallelize(1', 1),
 (u'locally.', 1),
 (u'Hive', 2),
 (u'optimized', 1),
 (u'uses', 1),
 (u'variable', 1),
 (u'The', 1),
 (u'data', 2),
 (u'a', 9),
 (u'Thriftserver', 1),
 (u'processing.', 1),
 (u'./bin/spark-shell', 1),
 (u'Python', 2),
 (u'mvn', 1),
 (u'clean', 1),
 (u'the', 21),
 (u'requires', 1),
 (u'talk', 1),
 (u'help', 1),
 (u'automated', 1),
 (u'Hadoop', 4),
 (u'using', 2),
 (u'high-level', 1),
 (u'find', 1),
 (u'web', 1),
 (u'Shell', 2),
 (u'how', 2),
 (u'graph', 1),
 (u'run:', 1),
 (u'should', 2),
 (u'to', 14),
 (u'given.', 1),
 (u'directory.', 1),
 (u'must', 1),
 (u'do', 2),
 (u'Programs', 1),
 (u'Many', 1),
 (u'"yarn-client"', 1),
 (u'YARN,', 1),
 (u'["Third', 1),
 (u'Example', 1),
 (u'Once', 1),
 (u'Spark"](http://spark.apache.org/docs/latest/building-spark.html).', 1),
 (u'Because', 1),
 (u'name', 1),
 (u'Testing', 1),
 (u'refer', 2),
 (u'Streaming', 1),
 (u'SQL', 2),
 (u'them,', 1),
 (u'analysis.', 1),
 (u'application', 1),
 (u'set', 2),
 (u'Scala', 2),
 (u'thread,', 1),
 (u'examples', 2),
 (u'runs.', 1),
 (u'Pi', 1),
 (u'More', 1),
 (u'Python,', 2),
 (u'Versions', 1),
 (u'its', 1),
 (u'version', 1),
 (u'wiki](https://cwiki.apache.org/confluence/display/SPARK).', 1),
 (u'`./bin/run-example', 1),
 (u'Configuration', 1),
 (u'command,', 2),
 (u'<class>', 1),
 (u'core', 1),
 (u'MASTER=spark://host:7077', 1),
 (u'Documentation', 1),
 (u'downloaded', 1),
 (u'distributions.', 1),
 (u'Spark.', 1),
 (u'["Building', 1),
 (u'`examples`', 2),
 (u'on', 6),
 (u'works', 1),
 (u'package', 1),
 (u'of', 5),
 (u'changed', 1),
 (u'pre-built', 1),
 (u'Big', 1),
 (u'"yarn-cluster"', 1),
 (u'or', 3),
 (u'learning,', 1),
 (u'structured', 1),
 (u'overview', 1),
 (u'one', 2),
 (u'tests](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-AutomatedTesting).',
  1),
 (u'(You', 1),
 (u'Online', 1),
 (u'versions', 1),
 (u'your', 1),
 (u'threads.', 1),
 (u'APIs', 1),
 (u'SparkPi', 2),
 (u'contains', 1),
 (u'system', 1),
 (u'class', 2),
 (u'start', 1),
 (u'basic', 1),
 (u'configure', 1),
 (u'that', 3),
 (u'N', 1),
 (u'guide](http://spark.apache.org/docs/latest/configuration.html)', 1),
 (u'>>>', 1),
 (u'particular', 3),
 (u'be', 2),
 (u'an', 3),
 (u'easiest', 1),
 (u'Interactive', 2),
 (u'cluster', 2),
 (u'programs', 2),
 (u'can', 6),
 (u'locally', 2),
 (u'example', 3),
 (u'are', 1),
 (u'Data.', 1),
 (u'mesos://', 1),
 (u'computing', 1),
 (u'URL,', 1),
 (u'in', 5),
 (u'general', 2),
 (u'To', 2),
 (u'at', 2),
 (u'1000).count()', 1),
 (u'Party', 1),
 (u'if', 4),
 (u'built', 1),
 (u'no', 1),
 (u'Java,', 1),
 (u'"local[N]"', 1),
 (u'MLlib', 1),
 (u'also', 5),
 (u'other', 1),
 (u'build', 3),
 (u'online', 1),
 (u'several', 1),
 (u'distribution.', 1),
 (u'HDFS', 1),
 (u'[Configuration', 1),
 (u'spark://', 1),
 (u'page](http://spark.apache.org/documentation.html)', 1),
 (u'documentation', 3),
 (u'It', 2),
 (u'graphs', 1),
 (u'./dev/run-tests', 1),
 (u'first', 1),
 (u'latest', 1)]
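
The collect() output above comes back in no particular order. To see the most frequent words first, takeOrdered with a key on the negated count works on the same RDD; a minimal sketch using the standard RDD API (not part of the original run):

# Top 10 words by descending count
top10 = word_count.takeOrdered(10, key=lambda (w, c): -c)
for w, c in top10:
    print w, c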

In [23]:
# Count the lines that mention Spark, case-insensitively
linesWithSpark = lines.filter(lambda x: "spark" in x.lower())
print linesWithSpark.count()


28
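
If linesWithSpark were reused across several actions, caching it would keep the filtered RDD in memory instead of re-reading the README each time; a small sketch of the standard idiom (not in the original run):

linesWithSpark.cache()        # persist in memory after the first action
print linesWithSpark.first()  # subsequent actions reuse the cached data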

In [28]:
import urllib2

# Download a sample tweet as JSON text
data = urllib2.urlopen('https://raw.githubusercontent.com/databricks/learning-spark/master/files/testweet.json').read()
print data

# Parallelize the single record into an RDD and write it to Azure Blob
# storage (WASB) so it can be read back with SQLContext below
rdd = sc.parallelize([data])
path = "wasb://asadk-sparkml@asdfasdf782x8m60.blob.core.windows.net/magic2/"
rdd.saveAsTextFile(path)


{"createdAt":"Nov 4, 2014 4:56:59 PM","id":529799371026485248,"text":"Adventures With Coffee, Code, and Writing.","source":"\u003ca href\u003d\"http://twitter.com\" rel\u003d\"nofollow\"\u003eTwitter Web Client\u003c/a\u003e","isTruncated":false,"inReplyToStatusId":-1,"inReplyToUserId":-1,"isFavorited":false,"retweetCount":0,"isPossiblySensitive":false,"contributorsIDs":[],"userMentionEntities":[],"urlEntities":[],"hashtagEntities":[],"mediaEntities":[],"currentUserRetweetId":-1,"user":{"id":15594928,"name":"Holden Karau","screenName":"holdenkarau","location":"","description":"","descriptionURLEntities":[],"isContributorsEnabled":false,"profileImageUrl":"http://pbs.twimg.com/profile_images/3005696115/2036374bbadbed85249cdd50aac6e170_normal.jpeg","profileImageUrlHttps":"https://pbs.twimg.com/profile_images/3005696115/2036374bbadbed85249cdd50aac6e170_normal.jpeg","isProtected":false,"followersCount":1231,"profileBackgroundColor":"C0DEED","profileTextColor":"333333","profileLinkColor":"0084B4","profileSidebarFillColor":"DDEEF6","profileSidebarBorderColor":"FFFFFF","profileUseBackgroundImage":true,"showAllInlineMedia":false,"friendsCount":600,"createdAt":"Aug 5, 2011 9:42:44 AM","favouritesCount":1095,"utcOffset":-3,"profileBackgroundImageUrl":"","profileBackgroundImageUrlHttps":"","profileBannerImageUrl":"","profileBackgroundTiled":true,"lang":"en","statusesCount":6234,"isGeoEnabled":true,"isVerified":false,"translator":false,"listedCount":0,"isFollowRequestSent":false}}


In [31]:
# Infer a schema from the JSON written to blob storage above
df = sqlContext.jsonFile(path)
df.printSchema()


root
 |-- _corrupt_record: string (nullable = true)
 |-- contributorsIDs: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- createdAt: string (nullable = true)
 |-- currentUserRetweetId: long (nullable = true)
 |-- hashtagEntities: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: long (nullable = true)
 |-- inReplyToStatusId: long (nullable = true)
 |-- inReplyToUserId: long (nullable = true)
 |-- isFavorited: boolean (nullable = true)
 |-- isPossiblySensitive: boolean (nullable = true)
 |-- isTruncated: boolean (nullable = true)
 |-- mediaEntities: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- retweetCount: long (nullable = true)
 |-- source: string (nullable = true)
 |-- text: string (nullable = true)
 |-- urlEntities: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- user: struct (nullable = true)
 |    |-- createdAt: string (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- descriptionURLEntities: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- favouritesCount: long (nullable = true)
 |    |-- followersCount: long (nullable = true)
 |    |-- friendsCount: long (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- isContributorsEnabled: boolean (nullable = true)
 |    |-- isFollowRequestSent: boolean (nullable = true)
 |    |-- isGeoEnabled: boolean (nullable = true)
 |    |-- isProtected: boolean (nullable = true)
 |    |-- isVerified: boolean (nullable = true)
 |    |-- lang: string (nullable = true)
 |    |-- listedCount: long (nullable = true)
 |    |-- location: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- profileBackgroundColor: string (nullable = true)
 |    |-- profileBackgroundImageUrl: string (nullable = true)
 |    |-- profileBackgroundImageUrlHttps: string (nullable = true)
 |    |-- profileBackgroundTiled: boolean (nullable = true)
 |    |-- profileBannerImageUrl: string (nullable = true)
 |    |-- profileImageUrl: string (nullable = true)
 |    |-- profileImageUrlHttps: string (nullable = true)
 |    |-- profileLinkColor: string (nullable = true)
 |    |-- profileSidebarBorderColor: string (nullable = true)
 |    |-- profileSidebarFillColor: string (nullable = true)
 |    |-- profileTextColor: string (nullable = true)
 |    |-- profileUseBackgroundImage: boolean (nullable = true)
 |    |-- screenName: string (nullable = true)
 |    |-- showAllInlineMedia: boolean (nullable = true)
 |    |-- statusesCount: long (nullable = true)
 |    |-- translator: boolean (nullable = true)
 |    |-- utcOffset: long (nullable = true)
 |-- userMentionEntities: array (nullable = true)
 |    |-- element: string (containsNull = true)
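
Once the schema is inferred, the DataFrame can be queried through the DataFrame API or, after registering it as a temporary table, with SQL. A minimal sketch using the Spark 1.3 SQLContext API (the table name "tweets" is illustrative, not from the original notebook):

# Register the DataFrame and query it with SQL, including a nested field
df.registerTempTable("tweets")
results = sqlContext.sql("SELECT user.name, text, retweetCount FROM tweets")
for row in results.collect():
    print row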


In [53]:
# Example from Spark Docs @ http://spark.apache.org/docs/1.3.1/mllib-linear-methods.html
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from numpy import array

# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])

data = sc.textFile("file:///"+os.environ['SPARK_HOME']+"/data/mllib/sample_svm_data.txt")
parsedData = data.map(parsePoint)

# Build the model
model = LogisticRegressionWithSGD.train(parsedData)

# Evaluate the model on the training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))


Training Error = 0.363354037267
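
Measuring error on the same data the model was trained on understates the true error. A fairer estimate holds out a test split; a hedged sketch using RDD.randomSplit (not part of the original run, so the printed value will differ):

# Hold out 30% of the points for testing
train, test = parsedData.randomSplit([0.7, 0.3], seed=42)
model = LogisticRegressionWithSGD.train(train)
labelsAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(test.count())
print("Test Error = " + str(testErr))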

In [18]:
# Shut down the SparkContext and release cluster resources
sc.stop()