In [13]:
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import json
In [14]:
# Load the raw tweet dump and expose it to Spark SQL as the `tweets` temp table.
# `sqlCtx` is not created in this notebook; it is expected to be supplied by the
# PySpark shell/kernel that launched it.
TWEETS_PATH = "../data/tweets.txt"

# Bound to `tweets_df` rather than `input` so the Python builtin `input` is not
# shadowed for the rest of the kernel session.
tweets_df = sqlCtx.jsonFile(TWEETS_PATH)
tweets_df.registerTempTable('tweets')

# The inferred schema contains a `_corrupt_record` column, i.e. at least one
# line of the file did not parse as a JSON object.
tweets_df.printSchema()
root
|-- _corrupt_record: string (nullable = true)
|-- contributors: string (nullable = true)
|-- coordinates: struct (nullable = true)
| |-- coordinates: array (nullable = true)
| | |-- element: double (containsNull = true)
| |-- type: string (nullable = true)
|-- created_at: string (nullable = true)
|-- delete: struct (nullable = true)
| |-- status: struct (nullable = true)
| | |-- id: long (nullable = true)
| | |-- id_str: string (nullable = true)
| | |-- user_id: long (nullable = true)
| | |-- user_id_str: string (nullable = true)
| |-- timestamp_ms: string (nullable = true)
|-- entities: struct (nullable = true)
| |-- hashtags: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- indices: array (nullable = true)
| | | | |-- element: long (containsNull = true)
| | | |-- text: string (nullable = true)
| |-- media: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- display_url: string (nullable = true)
| | | |-- expanded_url: string (nullable = true)
| | | |-- id: long (nullable = true)
| | | |-- id_str: string (nullable = true)
| | | |-- indices: array (nullable = true)
| | | | |-- element: long (containsNull = true)
| | | |-- media_url: string (nullable = true)
| | | |-- media_url_https: string (nullable = true)
| | | |-- sizes: struct (nullable = true)
| | | | |-- large: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | | |-- medium: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | | |-- small: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | | |-- thumb: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | |-- source_status_id: long (nullable = true)
| | | |-- source_status_id_str: string (nullable = true)
| | | |-- type: string (nullable = true)
| | | |-- url: string (nullable = true)
| |-- symbols: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- trends: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- urls: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- display_url: string (nullable = true)
| | | |-- expanded_url: string (nullable = true)
| | | |-- indices: array (nullable = true)
| | | | |-- element: long (containsNull = true)
| | | |-- url: string (nullable = true)
| |-- user_mentions: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- id: long (nullable = true)
| | | |-- id_str: string (nullable = true)
| | | |-- indices: array (nullable = true)
| | | | |-- element: long (containsNull = true)
| | | |-- name: string (nullable = true)
| | | |-- screen_name: string (nullable = true)
|-- extended_entities: struct (nullable = true)
| |-- media: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- display_url: string (nullable = true)
| | | |-- expanded_url: string (nullable = true)
| | | |-- id: long (nullable = true)
| | | |-- id_str: string (nullable = true)
| | | |-- indices: array (nullable = true)
| | | | |-- element: long (containsNull = true)
| | | |-- media_url: string (nullable = true)
| | | |-- media_url_https: string (nullable = true)
| | | |-- sizes: struct (nullable = true)
| | | | |-- large: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | | |-- medium: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | | |-- small: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | | |-- thumb: struct (nullable = true)
| | | | | |-- h: long (nullable = true)
| | | | | |-- resize: string (nullable = true)
| | | | | |-- w: long (nullable = true)
| | | |-- source_status_id: long (nullable = true)
| | | |-- source_status_id_str: string (nullable = true)
| | | |-- type: string (nullable = true)
| | | |-- url: string (nullable = true)
|-- favorite_count: long (nullable = true)
|-- favorited: boolean (nullable = true)
|-- filter_level: string (nullable = true)
|-- geo: struct (nullable = true)
| |-- coordinates: array (nullable = true)
| | |-- element: double (containsNull = true)
| |-- type: string (nullable = true)
|-- id: long (nullable = true)
|-- id_str: string (nullable = true)
|-- in_reply_to_screen_name: string (nullable = true)
|-- in_reply_to_status_id: long (nullable = true)
|-- in_reply_to_status_id_str: string (nullable = true)
|-- in_reply_to_user_id: long (nullable = true)
|-- in_reply_to_user_id_str: string (nullable = true)
|-- lang: string (nullable = true)
|-- place: struct (nullable = true)
| |-- bounding_box: struct (nullable = true)
| | |-- coordinates: array (nullable = true)
| | | |-- element: array (containsNull = true)
| | | | |-- element: array (containsNull = true)
| | | | | |-- element: double (containsNull = true)
| | |-- type: string (nullable = true)
| |-- country: string (nullable = true)
| |-- country_code: string (nullable = true)
| |-- full_name: string (nullable = true)
| |-- id: string (nullable = true)
| |-- name: string (nullable = true)
| |-- place_type: string (nullable = true)
| |-- url: string (nullable = true)
|-- possibly_sensitive: boolean (nullable = true)
|-- retweet_count: long (nullable = true)
|-- retweeted: boolean (nullable = true)
|-- retweeted_status: struct (nullable = true)
| |-- contributors: string (nullable = true)
| |-- coordinates: struct (nullable = true)
| | |-- coordinates: array (nullable = true)
| | | |-- element: double (containsNull = true)
| | |-- type: string (nullable = true)
| |-- created_at: string (nullable = true)
| |-- entities: struct (nullable = true)
| | |-- hashtags: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- indices: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- text: string (nullable = true)
| | |-- media: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- display_url: string (nullable = true)
| | | | |-- expanded_url: string (nullable = true)
| | | | |-- id: long (nullable = true)
| | | | |-- id_str: string (nullable = true)
| | | | |-- indices: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- media_url: string (nullable = true)
| | | | |-- media_url_https: string (nullable = true)
| | | | |-- sizes: struct (nullable = true)
| | | | | |-- large: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- medium: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- small: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- thumb: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | |-- source_status_id: long (nullable = true)
| | | | |-- source_status_id_str: string (nullable = true)
| | | | |-- type: string (nullable = true)
| | | | |-- url: string (nullable = true)
| | |-- symbols: array (nullable = true)
| | | |-- element: string (containsNull = true)
| | |-- trends: array (nullable = true)
| | | |-- element: string (containsNull = true)
| | |-- urls: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- display_url: string (nullable = true)
| | | | |-- expanded_url: string (nullable = true)
| | | | |-- indices: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- url: string (nullable = true)
| | |-- user_mentions: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- id: long (nullable = true)
| | | | |-- id_str: string (nullable = true)
| | | | |-- indices: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- name: string (nullable = true)
| | | | |-- screen_name: string (nullable = true)
| |-- extended_entities: struct (nullable = true)
| | |-- media: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- display_url: string (nullable = true)
| | | | |-- expanded_url: string (nullable = true)
| | | | |-- id: long (nullable = true)
| | | | |-- id_str: string (nullable = true)
| | | | |-- indices: array (nullable = true)
| | | | | |-- element: long (containsNull = true)
| | | | |-- media_url: string (nullable = true)
| | | | |-- media_url_https: string (nullable = true)
| | | | |-- sizes: struct (nullable = true)
| | | | | |-- large: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- medium: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- small: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | | |-- thumb: struct (nullable = true)
| | | | | | |-- h: long (nullable = true)
| | | | | | |-- resize: string (nullable = true)
| | | | | | |-- w: long (nullable = true)
| | | | |-- source_status_id: long (nullable = true)
| | | | |-- source_status_id_str: string (nullable = true)
| | | | |-- type: string (nullable = true)
| | | | |-- url: string (nullable = true)
| |-- favorite_count: long (nullable = true)
| |-- favorited: boolean (nullable = true)
| |-- filter_level: string (nullable = true)
| |-- geo: struct (nullable = true)
| | |-- coordinates: array (nullable = true)
| | | |-- element: double (containsNull = true)
| | |-- type: string (nullable = true)
| |-- id: long (nullable = true)
| |-- id_str: string (nullable = true)
| |-- in_reply_to_screen_name: string (nullable = true)
| |-- in_reply_to_status_id: long (nullable = true)
| |-- in_reply_to_status_id_str: string (nullable = true)
| |-- in_reply_to_user_id: long (nullable = true)
| |-- in_reply_to_user_id_str: string (nullable = true)
| |-- lang: string (nullable = true)
| |-- place: struct (nullable = true)
| | |-- bounding_box: struct (nullable = true)
| | | |-- coordinates: array (nullable = true)
| | | | |-- element: array (containsNull = true)
| | | | | |-- element: array (containsNull = true)
| | | | | | |-- element: double (containsNull = true)
| | | |-- type: string (nullable = true)
| | |-- country: string (nullable = true)
| | |-- country_code: string (nullable = true)
| | |-- full_name: string (nullable = true)
| | |-- id: string (nullable = true)
| | |-- name: string (nullable = true)
| | |-- place_type: string (nullable = true)
| | |-- url: string (nullable = true)
| |-- possibly_sensitive: boolean (nullable = true)
| |-- retweet_count: long (nullable = true)
| |-- retweeted: boolean (nullable = true)
| |-- scopes: struct (nullable = true)
| | |-- followers: boolean (nullable = true)
| |-- source: string (nullable = true)
| |-- text: string (nullable = true)
| |-- truncated: boolean (nullable = true)
| |-- user: struct (nullable = true)
| | |-- contributors_enabled: boolean (nullable = true)
| | |-- created_at: string (nullable = true)
| | |-- default_profile: boolean (nullable = true)
| | |-- default_profile_image: boolean (nullable = true)
| | |-- description: string (nullable = true)
| | |-- favourites_count: long (nullable = true)
| | |-- follow_request_sent: string (nullable = true)
| | |-- followers_count: long (nullable = true)
| | |-- following: string (nullable = true)
| | |-- friends_count: long (nullable = true)
| | |-- geo_enabled: boolean (nullable = true)
| | |-- id: long (nullable = true)
| | |-- id_str: string (nullable = true)
| | |-- is_translator: boolean (nullable = true)
| | |-- lang: string (nullable = true)
| | |-- listed_count: long (nullable = true)
| | |-- location: string (nullable = true)
| | |-- name: string (nullable = true)
| | |-- notifications: string (nullable = true)
| | |-- profile_background_color: string (nullable = true)
| | |-- profile_background_image_url: string (nullable = true)
| | |-- profile_background_image_url_https: string (nullable = true)
| | |-- profile_background_tile: boolean (nullable = true)
| | |-- profile_banner_url: string (nullable = true)
| | |-- profile_image_url: string (nullable = true)
| | |-- profile_image_url_https: string (nullable = true)
| | |-- profile_link_color: string (nullable = true)
| | |-- profile_sidebar_border_color: string (nullable = true)
| | |-- profile_sidebar_fill_color: string (nullable = true)
| | |-- profile_text_color: string (nullable = true)
| | |-- profile_use_background_image: boolean (nullable = true)
| | |-- protected: boolean (nullable = true)
| | |-- screen_name: string (nullable = true)
| | |-- statuses_count: long (nullable = true)
| | |-- time_zone: string (nullable = true)
| | |-- url: string (nullable = true)
| | |-- utc_offset: long (nullable = true)
| | |-- verified: boolean (nullable = true)
|-- source: string (nullable = true)
|-- text: string (nullable = true)
|-- timestamp_ms: string (nullable = true)
|-- truncated: boolean (nullable = true)
|-- user: struct (nullable = true)
| |-- contributors_enabled: boolean (nullable = true)
| |-- created_at: string (nullable = true)
| |-- default_profile: boolean (nullable = true)
| |-- default_profile_image: boolean (nullable = true)
| |-- description: string (nullable = true)
| |-- favourites_count: long (nullable = true)
| |-- follow_request_sent: string (nullable = true)
| |-- followers_count: long (nullable = true)
| |-- following: string (nullable = true)
| |-- friends_count: long (nullable = true)
| |-- geo_enabled: boolean (nullable = true)
| |-- id: long (nullable = true)
| |-- id_str: string (nullable = true)
| |-- is_translator: boolean (nullable = true)
| |-- lang: string (nullable = true)
| |-- listed_count: long (nullable = true)
| |-- location: string (nullable = true)
| |-- name: string (nullable = true)
| |-- notifications: string (nullable = true)
| |-- profile_background_color: string (nullable = true)
| |-- profile_background_image_url: string (nullable = true)
| |-- profile_background_image_url_https: string (nullable = true)
| |-- profile_background_tile: boolean (nullable = true)
| |-- profile_banner_url: string (nullable = true)
| |-- profile_image_url: string (nullable = true)
| |-- profile_image_url_https: string (nullable = true)
| |-- profile_link_color: string (nullable = true)
| |-- profile_sidebar_border_color: string (nullable = true)
| |-- profile_sidebar_fill_color: string (nullable = true)
| |-- profile_text_color: string (nullable = true)
| |-- profile_use_background_image: boolean (nullable = true)
| |-- protected: boolean (nullable = true)
| |-- screen_name: string (nullable = true)
| |-- statuses_count: long (nullable = true)
| |-- time_zone: string (nullable = true)
| |-- url: string (nullable = true)
| |-- utc_offset: long (nullable = true)
| |-- verified: boolean (nullable = true)
In [18]:
def _count_items(items):
    """Return the number of elements in `items`, or 0 when it is None or empty."""
    if not items:
        return 0
    return len(items)


# Expose the helper to Spark SQL under the name `item_len`, returning an integer.
sqlCtx.registerFunction("item_len", _count_items, IntegerType())
In [24]:
# Number of rows in the `tweets` table for each value of user.lang.
lang_count_query = "SELECT user.lang, COUNT(*) FROM tweets GROUP BY user.lang"
lang_counts = sqlCtx.sql(lang_count_query)

# Materialise the grouped result as a list of Row objects and display it.
user_lang = lang_counts.collect()
user_lang
Out[24]:
[Row(lang=u'el', _c1=3),
Row(lang=u'en', _c1=2145),
Row(lang=u'ro', _c1=1),
Row(lang=u'es', _c1=505),
Row(lang=u'ru', _c1=184),
Row(lang=u'fa', _c1=1),
Row(lang=u'fi', _c1=3),
Row(lang=u'fr', _c1=148),
Row(lang=u'sv', _c1=13),
Row(lang=u'th', _c1=31),
Row(lang=u'tr', _c1=200),
Row(lang=u'nl', _c1=17),
Row(lang=u'ar', _c1=333),
Row(lang=u'no', _c1=2),
Row(lang=u'he', _c1=1),
Row(lang=u'hi', _c1=3),
Row(lang=u'en-GB', _c1=3),
Row(lang=u'id', _c1=75),
Row(lang=u'ca', _c1=3),
Row(lang=u'en-gb', _c1=54),
Row(lang=u'it', _c1=46),
Row(lang=u'Select Language...', _c1=1),
Row(lang=u'pl', _c1=8),
Row(lang=u'ja', _c1=562),
Row(lang=u'cs', _c1=1),
Row(lang=u'zh-tw', _c1=2),
Row(lang=u'pt', _c1=225),
Row(lang=u'da', _c1=4),
Row(lang=None, _c1=610),
Row(lang=u'de', _c1=24),
Row(lang=u'ko', _c1=50)]
In [26]:
# Sample of up to 10 tweets that carry at least one hashtag and whose author
# has user.lang = 'en'. `item_len` is the UDF registered on sqlCtx earlier in
# this notebook.
# NOTE(review): the filter is on user.lang, not the top-level `lang` column, so
# non-English tweet text can still be returned - confirm this is intended.
hashtag_tweets_query = (
    "SELECT user.name, favorite_count, entities.hashtags, text,created_at, place "
    "FROM tweets "
    "where item_len(entities.hashtags) > 0 AND user.lang = 'en' "
    "LIMIT 10"
)
res = sqlCtx.sql(hashtag_tweets_query)
res.collect()
Out[26]:
[Row(name=u"~Harry's Kryptonite~", favorite_count=0, hashtags=[Row(indices=[96, 109], text=u'WeAreAllZayn'), Row(indices=[110, 134], text=u'WeAreAllZaynFollowParty')], text=u'RT @YesItsNaeNae: "Why are directioners pretending to be one direction?So immature"\n\nFandom rn\n\n#WeAreAllZayn\n#WeAreAllZaynFollowParty http\u2026', created_at=u'Sun Dec 21 17:28:43 +0000 2014', place=None),
Row(name=u'\u0639\u0627\u062f\u0644 \u0623\u062d\u0645\u062f', favorite_count=0, hashtags=[Row(indices=[13, 26], text=u'\u0642\u0631\u0648\u0628_\u0627\u0644\u0640\u0632\u0639\u064a\u0645'), Row(indices=[133, 140], text=u'FOLLOW'), Row(indices=[139, 140], text=u'RT'), Row(indices=[139, 140], text=u'FF')], text=u'RT @KSABDO: \U0001f499#\u0642\u0631\u0648\u0628_\u0627\u0644\u0640\u0632\u0639\u064a\u0645\U0001f499\n\n@KSABDO\U0001f448 @ss__aaaa\U0001f448 @zainsaudi\U0001f448 @sak5511\U0001f448 @m199z\U0001f448 @rima9027\U0001f448 @Hoda121232\U0001f448 @mtaeb58\U0001f448 @alameerah9991\U0001f448\n\n\u2714 #FOLLO\u2026', created_at=u'Sun Dec 21 17:28:44 +0000 2014', place=None),
Row(name=u'RE/MAX Tattersall', favorite_count=0, hashtags=[Row(indices=[40, 48], text=u'listing'), Row(indices=[67, 73], text=u'Aiken'), Row(indices=[74, 77], text=u'SC'), Row(indices=[101, 112], text=u'realestate')], text=u'Dick Salsitz would love to show you the #listing at 000 Belle Mead #Aiken #SC http://t.co/ZsQfHnjWwg #realestate', created_at=u'Sun Dec 21 17:28:44 +0000 2014', place=None),
Row(name=u'Jack McNeill', favorite_count=0, hashtags=[Row(indices=[14, 18], text=u'LFC'), Row(indices=[23, 27], text=u'AFC'), Row(indices=[120, 132], text=u'bbcfootball')], text=u'RT @BBCSport: #LFC 1-2 #AFC: Olivier Giroud gives Arsenal the lead with a well-worked team goal \nhttp://t.co/p4sSclNwan #bbcfootball', created_at=u'Sun Dec 21 17:28:44 +0000 2014', place=None),
Row(name=u'Jim Waddle', favorite_count=0, hashtags=[Row(indices=[41, 56], text=u'accountability'), Row(indices=[80, 91], text=u'discipline')], text=u'RT @LeadersServe: You\u2019ll need to rebrand #accountability to create a culture of #discipline.', created_at=u'Sun Dec 21 17:28:44 +0000 2014', place=None),
Row(name=u'Black Angel', favorite_count=0, hashtags=[Row(indices=[124, 129], text=u'NYPD'), Row(indices=[130, 143], text=u'ICantBreathe'), Row(indices=[143, 144], text=u'BlackLivesMatter')], text=u'RT @MardiHolland: http://t.co/SlQuTrFOxs this is crazy... how does 1 support LEO & threaten 2 kill ppl @ the same time? #NYPD #ICantBreathe\u2026', created_at=u'Sun Dec 21 17:28:44 +0000 2014', place=None),
Row(name=u'vijayendran', favorite_count=0, hashtags=[Row(indices=[0, 36], text=u'JILLA_KATHTHIBlockbusterYearOfVIJAY')], text=u'#JILLA_KATHTHIBlockbusterYearOfVIJAY real super star', created_at=u'Sun Dec 21 17:28:45 +0000 2014', place=None),
Row(name=u'Political Pirate', favorite_count=0, hashtags=[Row(indices=[86, 99], text=u'chuckschumer'), Row(indices=[100, 115], text=u'presidentobama')], text=u"'CR Omnibus' Vote: Good Dems, Bad Dems, Dems Who Are In-Bet http://t.co/9TCOoJyxBt\n #chuckschumer #presidentobama", created_at=u'Sun Dec 21 17:28:45 +0000 2014', place=None),
Row(name=u'Paul', favorite_count=0, hashtags=[Row(indices=[120, 127], text=u'ImAMan')], text=u'I know jack about cars, so popping the hood and replacing some headlight lamps is practically an engine rebuild for me. #ImAMan', created_at=u'Sun Dec 21 17:28:45 +0000 2014', place=None),
Row(name=u'\u0645\u062d\u0645\u0648\u062f \u0628\u0646 \u0645\u0627\u062c\u062f', favorite_count=0, hashtags=[Row(indices=[118, 130], text=u'\u064a\u0627\u0633\u0631_\u0627\u0644\u0639\u062a\u064a\u0642')], text=u'RT @alateeqy: \u0644\u0648 \u0623\u0646 \u0643\u0644 \u0645\u0646\u062a\u0642\u062f \u0644\u0644\u0633\u0624\u0644\u064a\u0646 \u0648\u0627\u0644\u0645\u062c\u062a\u0645\u0639 \u0628\u062f\u0623 \u0628\u0625\u0635\u0644\u0627\u062d \u0628\u064a\u062a\u0647 \u0648\u0623\u0628\u0646\u0627\u0626\u0647\u060c \u0644\u0643\u0627\u0646 \u0630\u0644\u0643 \u0623\u0641\u0636\u0644 \u0646\u0642\u062f\u060c \u0641\u0640(\u062d\u064a\u062b \u062a\u0643\u0648\u0646\u0648\u0627 \u064a\u0648\u0644\u0649 \u0639\u0644\u064a\u0643\u0645).\n\n#\u064a\u0627\u0633\u0631_\u0627\u0644\u0639\u062a\u064a\u0642\nhttp://t\u2026', created_at=u'Sun Dec 21 17:28:46 +0000 2014', place=None)]
In [ ]:
In [ ]:
Content source: anantasty/spark-examples
Similar notebooks: