In [28]:
%%bigquery udf --module extract_tokens
// Classifies a token list by which candidate it mentions.
// Each token's content is lower-cased and compared against two fixed word
// lists. Returns 'trump' when only Trump words occur, 'clinton' when only
// Clinton words occur, and 'common' when both or neither occur.
// Every token is visited, so a token without string content throws.
function extract_topic(tokens){
  var TRUMP_WORDS = ["trump", "realdonaldtrump", "donald"];
  var CLINTON_WORDS = ["hillary", "clinton", "hillaryclinton"];
  var mentionsTrump = false;
  var mentionsClinton = false;

  tokens.forEach(function(entry){
    var word = entry.content.toLowerCase();
    if (TRUMP_WORDS.indexOf(word) !== -1) {
      mentionsTrump = true;
    } else if (CLINTON_WORDS.indexOf(word) !== -1) {
      mentionsClinton = true;
    }
  });

  // Both mentioned or neither mentioned: no single candidate owns the tweet.
  if (mentionsTrump === mentionsClinton) {
    return 'common';
  }
  return mentionsTrump ? 'trump' : 'clinton';
}
/**
* @param {{tweet_object: string, polarity: float, magnitude: float, syntax:string }} r
* @param function({{topic : string, type: string, content:string}}) emitFn
*/
function(r, emitFn) {
try{
var tweet = JSON.parse(r.tweet_object);
var tokens = JSON.parse(r.syntax);
tokens.forEach(function(token){
if(token.content !== "&" && token.content!== "#")
emitFn({topic : topic, type : token.partOfSpeech, content : token.content.toLowerCase()})
})
}
catch(e){
}
}
In [29]:
%%sql --module words
part_of_speech="NOUN"
total = 10;
SELECT
  content,
  COUNT(*) AS count
FROM extract_tokens(
  SELECT *
  FROM [in-full-gear:Dataset1.debates_tweets]
  WHERE NOT REGEXP_MATCH(tweet_object, "GoogleJsonResponseException")
)
WHERE type = $part_of_speech
GROUP BY content
ORDER BY count DESC
LIMIT $total
In [30]:
import datalab.bigquery as bq
# Build the `words` query with the extract_tokens UDF, selecting nouns and
# passing total=12 (the module itself declares total = 10).
nouns = bq.Query(words, udf=extract_tokens, part_of_speech="NOUN", total=12);
In [31]:
%%chart bars --data nouns
# Display options for the bar chart drawn from the `nouns` query.
title: Most common nouns used in tweets about the debate
height: 600
width: 900
Out[31]:
In [32]:
%%sql --module adjectives_by_topic
part_of_speech="ADJ"
total = 10;
topic = "common"
SELECT
  content,
  COUNT(*) AS count
FROM extract_tokens(
  SELECT *
  FROM [in-full-gear:Dataset1.debates_tweets]
  WHERE NOT REGEXP_MATCH(tweet_object, "GoogleJsonResponseException")
)
WHERE type = $part_of_speech AND topic = $topic
GROUP BY content
ORDER BY count DESC
LIMIT $total
In [33]:
# Build the adjectives_by_topic query for topic "trump"; part_of_speech and
# total are not passed here, so presumably the module's declared values apply.
trump_adj = bq.Query(adjectives_by_topic, udf=extract_tokens, topic="trump");
In [39]:
%%chart pie --data trump_adj
# Display options for the pie chart drawn from the `trump_adj` query.
title: Most common adjectives used in tweets about Donald Trump
height: 600
width: 900
# Custom colour override, currently disabled:
#colors: ["#E91D0E"]
Out[39]:
In [36]:
# Build the adjectives_by_topic query for topic "clinton"; part_of_speech and
# total are not passed here, so presumably the module's declared values apply.
clinton_adj = bq.Query(adjectives_by_topic, udf=extract_tokens, topic="clinton");
In [40]:
%%chart pie --data clinton_adj
# Display options for the pie chart drawn from the `clinton_adj` query.
title: Most common adjectives used in tweets about Hillary Clinton
height: 600
width: 900
Out[40]:
In [49]:
%%bigquery udf --module extract_sentiment
// Returns the part of a place name that follows its first comma, with
// surrounding whitespace trimmed (for example "Seattle, WA" gives "WA").
// When the name has no comma, the whole trimmed name is returned.
function getStateName(placeName){
  var commaAt = placeName.indexOf(',');
  var afterComma = placeName.slice(commaAt + 1);
  return afterComma.trim();
}
// Classifies a token list by which candidate it mentions.
// Each token's content is lower-cased and compared against two fixed word
// lists. Returns 'trump' when only Trump words occur, 'clinton' when only
// Clinton words occur, and 'common' when both or neither occur.
// Every token is visited, so a token without string content throws.
function extract_topic(tokens){
  var TRUMP_WORDS = ["trump", "realdonaldtrump", "donald"];
  var CLINTON_WORDS = ["hillary", "clinton", "hillaryclinton"];
  var mentionsTrump = false;
  var mentionsClinton = false;

  tokens.forEach(function(entry){
    var word = entry.content.toLowerCase();
    if (TRUMP_WORDS.indexOf(word) !== -1) {
      mentionsTrump = true;
    } else if (CLINTON_WORDS.indexOf(word) !== -1) {
      mentionsClinton = true;
    }
  });

  // Both mentioned or neither mentioned: no single candidate owns the tweet.
  if (mentionsTrump === mentionsClinton) {
    return 'common';
  }
  return mentionsTrump ? 'trump' : 'clinton';
}
/**
* @param {{tweet_object: string, polarity: float, magnitude: float, syntax:string }} r
* @param function({{place:string, topic: string, polarity:float, magnitude:float}}) emitFn
*/
function(r, emitFn) {
try{
var tweet = JSON.parse(r.tweet_object);
var tokens = JSON.parse(r.syntax);
var topic = extract_topic(tokens);
//emitFn({topic : topic, polarity: r.polarity, magnitude: r.magnitude, text : tweet.text});
if(tweet && tweet.place && tweet.place.place_type==='city')
emitFn({topic:topic, place : getStateName(tweet.place.full_name), polarity: r.polarity, magnitude: r.magnitude});
}
catch(e){
}
}
In [50]:
%%sql --module sentiment
topic = "trump"
SELECT
  polarity,
  magnitude
FROM extract_sentiment(
  SELECT *
  FROM [in-full-gear:Dataset1.debates_tweets]
  WHERE NOT REGEXP_MATCH(tweet_object, "GoogleJsonResponseException")
)
WHERE topic = $topic
In [51]:
# Build the `sentiment` query with the extract_sentiment UDF for topic "trump".
trump_sentiments = bq.Query(sentiment, udf=extract_sentiment, topic="trump");
In [58]:
%%chart scatter --data trump_sentiments
# Display options for the scatter chart drawn from the `trump_sentiments` query.
title: Distribution of sentiment for tweets about Trump
height: 400
width: 900
legend: none
# Axis titles are nested under their axis so they do not collide with the
# chart-level title key.
hAxis:
  title: polarity
vAxis:
  title: magnitude
# Hex colours need the leading '#'; it was missing here.
colors: ["#E91D0E"]
Out[58]:
In [55]:
# Build the `sentiment` query with the extract_sentiment UDF for topic "clinton".
clinton_sentiments = bq.Query(sentiment, udf=extract_sentiment, topic="clinton");
In [56]:
%%chart scatter --data clinton_sentiments
# Display options for the scatter chart drawn from the `clinton_sentiments` query.
title: Distribution of sentiment for tweets about Clinton
height: 400
width: 900
legend: none
# Axis titles are nested under their axis so they do not collide with the
# chart-level title key.
hAxis:
  title: polarity
vAxis:
  title: magnitude
colors: ["#232066"]
Out[56]: