In [1]:
# import requests, StringIO, pandas as pd, json, re
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
In [2]:
#reference the tweet data from dashDB, 3 tables are pulled into dataframes
tweetsdf = sqlContext.read.format('jdbc').options(url='jdbc:db2://awh-yp-small02.services.dal.bluemix.net:50000/BLUDB', user='dash111118', password='XXXXXXXXXX', dbtable='dash111118.HYATT_TWEETS').load()
sentimentdf = sqlContext.read.format('jdbc').options(url='jdbc:db2://awh-yp-small02.services.dal.bluemix.net:50000/BLUDB', user='dash111118', password='XXXXXXXXXX', dbtable='dash111118.HYATT_SENTIMENTS').load()
locationdf = sqlContext.read.format('jdbc').options(url='jdbc:db2://awh-yp-small02.services.dal.bluemix.net:50000/BLUDB', user='dash111118', password='XXXXXXXXXX', dbtable='dash111118.HYATT_LOCATIONS').load()
In [3]:
#import packages, instantiate dataframe, make cloudant connection, load cloudant data into dataframe
import requests, StringIO, pandas as pd, json, re
#This connectects to a Cloudant repository containing dummy geographic data of Hyatt locations
hyattlocations_df = sqlContext.read.format("com.cloudant.spark").\
option("cloudant.host","ajlevine.cloudant.com").\
option("cloudant.username", "ajlevine").\
option("cloudant.password","XXXXXXXXXX").\
load("hyattlocations")
In [4]:
#test data for posterity
tweetsdf.printSchema()
In [5]:
hyattlocations_df.printSchema()
In [6]:
#convert tweet table to Pandas frame
tweetPD = tweetsdf.toPandas()
tweetPD['MESSAGE_ID'] = tweetPD['MESSAGE_ID'].map(lambda x: x.lstrip('tag:search.twitter.com,'))
tweetPD = tweetPD.set_index(tweetPD["MESSAGE_ID"])
tweetPD.drop(['MESSAGE_ID'], axis=1, inplace=True)
tweetPD.head(3)
Out[6]:
In [7]:
#convert Sentiment table to Pandas frame
sentimentPD = sentimentdf.toPandas()
sentimentPD['MESSAGE_ID'] = sentimentPD['MESSAGE_ID'].map(lambda x: x.lstrip('tag:search.twitter.com,'))
sentimentPD = sentimentPD.set_index(sentimentPD["MESSAGE_ID"])
sentimentPD.drop(['MESSAGE_ID'], axis=1, inplace=True)
sentimentPD.head(3)
Out[7]:
In [8]:
#convert location table to Pandas frame
locationdf = locationdf.filter(locationdf['USER_LOCATION'] != "None")
locationPD = locationdf.toPandas()
#strip out unnecessary text
locationPD['MESSAGE_ID'] = locationPD['MESSAGE_ID'].map(lambda x: x.lstrip('tag:search.twitter.com,'))
locationPD['USER_LOCATION'] = locationPD['USER_LOCATION'].map(lambda x: x.lstrip('POINT (').rstrip(')'))
#set message ID as the index
locationPD = locationPD.set_index(locationPD["MESSAGE_ID"])
locationPD.drop(['MESSAGE_ID'], axis=1, inplace=True)
locationPD.drop(['MESSAGE_LOCATION'], axis=1, inplace=True)
locationPD.head(3)
Out[8]:
In [9]:
#convert location data from cloudant to Pandas frame
hyattlocationsPD = hyattlocations_df.toPandas()
In [10]:
hyattlocationsPD.head(3)
Out[10]:
In [11]:
#split user location column into latitude and longitude columns
foo = lambda x: pd.Series([i for i in reversed(x.split(' '))])
rev = locationPD['USER_LOCATION'].apply(foo)
rev.rename(columns={0:'Latitude',1:'Longitude'},inplace=True)
rev = rev[['Latitude','Longitude']]
locationPD = locationPD.join(rev, on=None, how='inner')
locationPD.drop(['USER_LOCATION'], axis=1, inplace=True)
#join sentiment table with tweet table on ID column
sentimentJoin = tweetPD.join(sentimentPD, on=None, how='inner')
sentimentJoin.head(3)
Out[11]:
In [12]:
#delete any records that contain Hyatt in the name
#This strips out any tweets originating from Hyatt, and any tweets by someone who's name contains "Hyatt" that might skew the results
sentimentJoin = sentimentJoin[~sentimentJoin.USER_DISPLAY_NAME.str.contains('hyatt')]
sentimentJoin = sentimentJoin[~sentimentJoin.USER_DISPLAY_NAME.str.contains('Hyatt')]
sentimentJoin = sentimentJoin.join(locationPD, on=None, how='inner')
In [13]:
#drop some unused columns
del sentimentJoin['MESSAGE_FAVORITES_COUNT']
del sentimentJoin['MESSAGE_INREPLYTO_URL']
del sentimentJoin['MESSAGE_URL']
del sentimentJoin['MESSAGE_LANGUAGE']
del sentimentJoin['MESSAGE_GENERATOR_DISPLAY_NAME']
del sentimentJoin['USER_STATUSES_COUNT']
del sentimentJoin['USER_SUMMARY']
del sentimentJoin['MESSAGE_ACTION']
del sentimentJoin['MESSAGE_COUNTRY_CODE']
del sentimentJoin['USER_LISTED_COUNT']
del sentimentJoin['USER_SUB_REGION']
del sentimentJoin['USER_REGISTER_TIME']
del sentimentJoin['MESSAGE_RETWEET_COUNT']
del sentimentJoin['USER_GENDER']
del sentimentJoin['USER_FAVORITES_COUNT']
del sentimentJoin['USER_IMAGE_URL']
del sentimentJoin['USER_URL']
del sentimentJoin['MESSAGE_COUNTRY']
del sentimentJoin['MESSAGE_LOCATION_DISPLAY_NAME']
del sentimentJoin['MESSAGE_LOCATION']
del sentimentJoin['USER_FOLLOWERS_COUNT']
del sentimentJoin['USER_FRIENDS_COUNT']
del sentimentJoin['USER_COUNTRY_CODE']
In [14]:
#install Seaborn statistical visualization tool
!pip install --user seaborn
In [15]:
%matplotlib inline
import matplotlib.pyplot as plt
# matplotlib.patches allows us create colored patches, we can use for legends in plots
import matplotlib.patches as mpatches
# seaborn also builds on matplotlib and adds graphical features and new plot types
import seaborn as sns
In [16]:
#cast data into float type
sentimentJoin['Latitude'] = sentimentJoin['Latitude'].astype(float)
sentimentJoin['Longitude'] = sentimentJoin['Longitude'].astype(float)
hyattlocationsPD['latitude'] = hyattlocationsPD['latitude'].astype(float)
hyattlocationsPD['longitude'] = hyattlocationsPD['longitude'].astype(float)
In [17]:
#adjust settings
sns.set_style("darkgrid")
plt.figure(figsize=(15,10))
#create scatterplots
plt.scatter(sentimentJoin.Longitude, sentimentJoin.Latitude, alpha=.9, s=4, color='darkseagreen')
plt.scatter(hyattlocationsPD.longitude, hyattlocationsPD.latitude, alpha=.9, s=4, color='purple')
#adjust more settings
plt.title('Locations of Tweets Across the World', size=25)
plt.ylim((-45,65))
plt.xlim((-150,150))
plt.xlabel('Longitude',size=2)
plt.ylabel('Latitude',size=2)
plt.show()
In [18]:
#adjust settings
sns.set_style("darkgrid")
plt.figure(figsize=(15,10))
#create scatterplots
plt.scatter(sentimentJoin.Longitude, sentimentJoin.Latitude, alpha=.9, s=4, color='darkseagreen')
plt.scatter(hyattlocationsPD.longitude, hyattlocationsPD.latitude, alpha=.9, s=4, color='purple')
#adjust more settings
plt.title('Locations of Tweets in America', size=25)
plt.ylim((20,55))
plt.xlim((-150,-50))
plt.xlabel('Longitude',size=20)
plt.ylabel('Latitude',size=20)
plt.show()
In [19]:
#break up into positive and negative tweets
positiveTweets = sentimentJoin[sentimentJoin.SENTIMENT_POLARITY.str.contains('POSITIVE')]
negativeTweets = sentimentJoin[sentimentJoin.SENTIMENT_POLARITY.str.contains('NEGATIVE')]
plt.figure(figsize=(15,10), dpi=0.1)
#create scatterplots
plt.scatter(negativeTweets.Longitude, negativeTweets.Latitude, color='red', s=20, marker ='.')
plt.scatter(positiveTweets.Longitude, positiveTweets.Latitude, s=10, color='green', marker ='.')
plt.scatter(hyattlocationsPD.longitude, hyattlocationsPD.latitude, alpha=.9, s=1, color='purple')
#create legend
red_patch = mpatches.Patch(color='red', label='Negative Tweets')
green_patch = mpatches.Patch(color='green', label='Positive Tweets')
#adjust more settings
plt.title('Positive and Negative Tweets', size=20)
plt.ylim((20,55))
plt.xlim((-150,-50))
plt.xlabel('Longitude',size=20)
plt.ylabel('Latitude',size=20)
plt.show()
In [20]:
#break up into positive and negative tweets
positiveTweets = sentimentJoin[sentimentJoin.SENTIMENT_POLARITY.str.contains('POSITIVE')]
negativeTweets = sentimentJoin[sentimentJoin.SENTIMENT_POLARITY.str.contains('NEGATIVE')]
plt.figure(figsize=(15,10), dpi=0.1)
#create scatterplots
plt.scatter(negativeTweets.Longitude, negativeTweets.Latitude, color='red', s=20, marker ='.')
plt.scatter(positiveTweets.Longitude, positiveTweets.Latitude, s=20, color='green', marker ='.')
plt.scatter(hyattlocationsPD.longitude, hyattlocationsPD.latitude, alpha=.9, s=10, color='purple')
#create legend
red_patch = mpatches.Patch(color='red', label='Negative Tweets')
green_patch = mpatches.Patch(color='green', label='Positive Tweets')
#adjust more settings
plt.title('Positive and Negative Tweets on the East Coast', size=20)
plt.ylim((37,42))
plt.xlim((-80,-65))
plt.xlabel('Longitude',size=20)
plt.ylabel('Latitude',size=20)
plt.show()
In [21]:
!pip install --user tweepy
In [22]:
#method to send private twitter message to defined user.
import tweepy
def get_api(cfg):
auth = tweepy.OAuthHandler(cfg['consumer_key'], cfg['consumer_secret'])
auth.set_access_token(cfg['access_token'], cfg['access_token_secret'])
return tweepy.API(auth)
def sendDM(user,tweet):
# Fill in the values noted in previous step here
cfg = {
"consumer_key" : "XXXXXXXXXXXXXXXXXXXXXXX",
"consumer_secret" : "XXXXXXXXXXXXXXXXXXXXXXX",
"access_token" : "XXXXXXXXXXXXXXXXXXXXXXX",
"access_token_secret" : "XXXXXXXXXXXXXXXXXXXXXXX"
}
api = get_api(cfg)
status = api.send_direct_message(screen_name=user,text=tweet)
In [23]:
sendDM("ajlevine4e","We're Sorry you had a bad experience at a Hyatt Brand Hotel, Please take 10% off your next stay by using checkout code WERESORRY upon your next booking. For additional help, please contact us 24hours/day at 555-555-5555")
In [ ]: