In [1]:
from pyspark import SparkContext, SparkConf, SQLContext, HiveContext, StorageLevel
from pyspark.sql.functions import *
import nltk

# Relative location of the State of the Union speech analysed in this notebook.
CORPUS_PATH = '../../nltk_data/corpora/state_union/1972-Nixon.txt'

# getOrCreate() reuses a context that is already running, so re-executing this
# cell does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

# RDD with one element per line of the speech.
# NOTE(review): the recorded output of the following cell contains a u'\ufffd'
# token, which suggests the file is not UTF-8 encoded -- confirm the corpus
# encoding before relying on non-ASCII tokens.
data = sc.textFile(CORPUS_PATH)

# Tokenize every line and flatten the per-line token lists into one RDD of words.
words = data.flatMap(lambda line: nltk.word_tokenize(line))
print(words.take(10))

# Tag each token on its own: every element is a one-item list [(word, tag)].
# NOTE(review): the tagger only ever sees a single word here, so it has no
# neighbouring tokens to use as context (the recorded output tags 'sat' as
# 'NN'). Consider tagging whole token sequences if tag quality matters.
pos_word = words.map(lambda word: nltk.pos_tag([word]))
# Preview a few tagged tokens; the recorded cell output shows these five items.
print(pos_word.take(5))


[u'Address', u'on', u'the', u'State', u'of', u'the', u'Union', u'Delivered', u'Before', u'a']
[[(u'Address', 'NN')], [(u'on', 'IN')], [(u'the', 'DT')], [(u'State', 'NN')], [(u'of', 'IN')]]

In [2]:
# Show the first 50 tagged tokens. The parenthesised single-argument form prints
# the same text under Python 2 and is also valid Python 3 syntax.
print(pos_word.take(50))


[[(u'Address', 'NN')], [(u'on', 'IN')], [(u'the', 'DT')], [(u'State', 'NN')], [(u'of', 'IN')], [(u'the', 'DT')], [(u'Union', 'NNP')], [(u'Delivered', 'VBN')], [(u'Before', 'IN')], [(u'a', 'DT')], [(u'Joint', 'JJ')], [(u'Session', 'NN')], [(u'of', 'IN')], [(u'the', 'DT')], [(u'Congress', 'NNP')], [(u'.', '.')], [(u'January', 'NNP')], [(u'20', 'CD')], [(u',', ',')], [(u'1972', 'CD')], [(u'Mr.', 'NNP')], [(u'Speaker', 'NN')], [(u',', ',')], [(u'Mr.', 'NNP')], [(u'President', 'NNP')], [(u',', ',')], [(u'my', 'PRP$')], [(u'colleagues', 'NNS')], [(u'in', 'IN')], [(u'the', 'DT')], [(u'Congress', 'NNP')], [(u',', ',')], [(u'our', 'PRP$')], [(u'distinguished', 'VBN')], [(u'guests', 'NNS')], [(u',', ',')], [(u'my', 'PRP$')], [(u'fellow', 'NN')], [(u'Americans', 'NNS')], [(u':', ':')], [(u'\ufffd', 'NN')], [(u'@', 'NN')], [(u'Twenty-five', 'JJ')], [(u'years', 'NNS')], [(u'ago', 'RB')], [(u'I', 'PRP')], [(u'sat', 'NN')], [(u'here', 'RB')], [(u'as', 'IN')], [(u'a', 'DT')]]

In [ ]: