In [256]:
import requests
import urllib
import warnings
import pandas as pd
import datetime
import os,glob
import csv
import io
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
# Constant: output location used by the cells and helpers in this notebook.
def MY_FILE():
    """Return the relative output directory, including its trailing slash."""
    output_dir = "Data/Sunday-01-Box-Office/"
    return output_dir
In [257]:
# Make sure the output directory exists, then remove the output of any earlier
# run so that files opened in append mode start empty.
# exist_ok=True replaces the separate os.path.exists() check, so there is no
# window between testing for the directory and creating it.
os.makedirs(MY_FILE(), exist_ok=True)
for stale_output in glob.glob(MY_FILE() + "OP_*"):
    os.remove(stale_output)
In [258]:
def collect_autosuggestions(source, tld, lang, query, timeout=10):
    """Fetch the autocomplete suggestions a search engine offers for ``query``.

    Parameters
    ----------
    source : str
        Either "google" or "bing".
    tld : str
        "Top level domain": any of the 2-letter country codes where Google
        operates, see https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2 .
        For Bing it is sent as the ``cc`` parameter.
    lang : str
        Language of the returned suggestions, a two letter code from
        https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes .
        Only sent to Google; for Bing the language is controlled by ``tld``,
        so ``lang`` has no effect on its own.
    query : str
        The text that should be autocompleted.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    The second element of the JSON response
    (presumably the list of suggestion strings -- verify against the API).

    Raises
    ------
    ValueError
        If ``source`` is neither "google" nor "bing".
    requests.HTTPError
        If the server answers with an error status.
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that
    # the ``urllib.parse`` submodule has been loaded.
    from urllib.parse import urlencode

    if source == "google":
        # Some info on this api: http://shreyaschand.com/blog/2013/01/03/google-autocomplete-api/
        url = 'http://www.google.' + tld + '/complete/search?&client=firefox&%s' % urlencode(
            {'q': query.encode('utf-8'), 'hl': lang})
    elif source == "bing":
        url = 'http://api.bing.com/osjson.aspx?%s' % urlencode(
            {'query': query.encode('utf-8'), 'cc': tld})
    else:
        # Fail with a clear message instead of an UnboundLocalError on ``url``.
        raise ValueError('source must be "google" or "bing", got {!r}'.format(source))

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()[1]
In [259]:
# Load the box-office list. Because explicit column names are supplied, every
# line of the file (including its first one) is read as data.
movies_List = pd.read_csv(
    "Data/BoxOffice.csv",
    names=["Ranking", 'Last Week Ranking', 'Movie Title', 'Movie Rating'],
    encoding="ISO-8859-1",
    low_memory=False,
)
movies_List
Out[259]:
In [260]:
def _complete_suggestions_frame(suggestions, engine, search_term, timestamp):
    """Build the table of one engine's suggestions for a full-title query."""
    frame = pd.DataFrame({"suggestion": suggestions})
    frame["Movie Name"] = search_term
    frame["Search Enginee"] = engine
    frame["datetime"] = timestamp
    # 1-based position of the suggestion in the engine's list.
    frame["Order Number"] = frame.index + 1
    return frame


def collect_complete_Suggestions(search_term):
    """Append Google's and Bing's suggestions for ``search_term`` to the combined CSV.

    Queries both engines (tld "com", language "en") and appends one row per
    suggestion, without a header, to ``OP_Movie_Search_Suggestions.csv`` in the
    directory returned by ``MY_FILE()``.

    Parameters
    ----------
    search_term : str
        The full text to look up, e.g. a movie title.

    Returns
    -------
    None
    """
    # One timestamp for the whole call so both engines' rows agree.
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    suggestions_google = collect_autosuggestions("google", "com", "en", search_term)
    suggestions_bing = collect_autosuggestions("bing", "com", "en", search_term)

    suggestions_google_df = _complete_suggestions_frame(
        suggestions_google, "Google", search_term, timestamp)
    # Only the first ten Bing suggestions are kept; Google's list is used as returned.
    suggestions_bing_df = _complete_suggestions_frame(
        suggestions_bing, "Bing", search_term, timestamp).head(10)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    suggestions_df = pd.concat([suggestions_google_df, suggestions_bing_df], ignore_index=True)
    suggestions_df = suggestions_df[
        ["Order Number", 'Search Enginee', "Movie Name", "suggestion", "datetime"]]
    suggestions_df.to_csv(MY_FILE() + 'OP_Movie_Search_Suggestions.csv',
                          mode='a', header=False, index=False)
In [261]:
def _detail_suggestions_frame(suggestions, engine, movie_name, sub_term, timestamp):
    """Build the table of one engine's suggestions for one typed prefix."""
    frame = pd.DataFrame({"suggestion": suggestions})
    frame["Movie Name"] = movie_name
    frame["Sub Term"] = sub_term
    frame["Search Enginee"] = engine
    frame["datetime"] = timestamp
    # 1-based position of the suggestion in the engine's list.
    frame["Order Number"] = frame.index + 1
    frame["Character_Typed"] = len(sub_term)
    return frame


def collect_detail_Suggestions(movie_name):
    """Record what Google and Bing suggest while ``movie_name`` is being typed.

    Creates ``OP_<movie_name>.csv`` in the directory returned by ``MY_FILE()``,
    writes its header row, and then, for every prefix of the title that is at
    least four characters long and shorter than the full title, appends one row
    per suggestion returned by each engine. Rows are appended prefix by prefix,
    so results gathered before a failed request stay in the file.

    Parameters
    ----------
    movie_name : str
        The movie title; it is also used verbatim in the output file name.

    Returns
    -------
    None
    """
    file_name = MY_FILE() + 'OP_' + movie_name + '.csv'
    with open(file_name, 'w+', newline='') as fp:
        csv.writer(fp, delimiter=',').writerow(
            ["Order Number", 'Search Enginee', "Movie Name", "Sub Term", "Suggestion",
             "Date & Time", 'Character Typed'])

    column_order = ["Order Number", 'Search Enginee', "Movie Name", "Sub Term", "suggestion",
                    "datetime", 'Character_Typed']

    # Prefix lengths 4 .. len(movie_name) - 1: the complete title itself is
    # not queried here.
    for typed in range(4, len(movie_name)):
        title = movie_name[:typed]
        suggestions_google = collect_autosuggestions("google", "com", "en", title)
        suggestions_bing = collect_autosuggestions("bing", "com", "en", title)
        # One timestamp per prefix so both engines' rows agree.
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        suggestions_google_df = _detail_suggestions_frame(
            suggestions_google, "Google", movie_name, title, timestamp)
        # Only the first ten Bing suggestions are kept; Google's list is used as returned.
        suggestions_bing_df = _detail_suggestions_frame(
            suggestions_bing, "Bing", movie_name, title, timestamp).head(10)

        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        suggestions_df = pd.concat([suggestions_google_df, suggestions_bing_df], ignore_index=True)
        suggestions_df[column_order].to_csv(file_name, mode='a', header=False, index=False)
In [262]:
def perform_detailed_analysis(movie_name):
    """Count how often each engine offered each suggestion for ``movie_name``.

    Reads ``OP_<movie_name>.csv`` from the directory returned by ``MY_FILE()``,
    counts the rows per (Search Enginee, Suggestion) pair and writes the
    counts, most frequent first, to ``OP_<movie_name>_Summary.csv`` with the
    header ``Search Enginee, Suggestion, Occurrence``.

    Parameters
    ----------
    movie_name : str
        The movie title used in the input and output file names.

    Returns
    -------
    None
    """
    output_prefix = MY_FILE() + 'OP_' + movie_name
    detailed_List = pd.read_csv(output_prefix + '.csv')
    summary_list = (
        detailed_List
        .groupby(['Search Enginee', 'Suggestion'])
        .size()
        .reset_index(name='occurrence')
        # sort_values replaces sort_index(by=...), which is no longer part of
        # pandas; the stable sort keeps tied rows in grouped order.
        .sort_values('occurrence', ascending=False, kind='mergesort')
    )
    # A single write with header aliases replaces the separate header write
    # followed by an append to the same file.
    summary_list.to_csv(output_prefix + '_Summary.csv',
                        header=["Search Enginee", 'Suggestion', "Occurrence"],
                        index=False)
In [263]:
def filter_piracy_terms(movie_name):
    """Keep only the suggestions for ``movie_name`` that contain a piracy-related term.

    Reads ``OP_<movie_name>.csv`` and ``OP_<movie_name>_Summary.csv`` from the
    directory returned by ``MY_FILE()`` and writes the rows whose ``Suggestion``
    matches one of the terms in ``piracy_tag`` to ``OP_<movie_name>_Filtered.csv``
    (ordered by ``Character Typed``, ascending) and
    ``OP_<movie_name>_Summary_Filtered.csv`` respectively. The match is a
    case-sensitive substring search.

    Parameters
    ----------
    movie_name : str
        The movie title used in the input and output file names.

    Returns
    -------
    None
    """
    piracy_tag = "online|full movie|torrent|putlocker|free"
    output_prefix = MY_FILE() + 'OP_' + movie_name

    detail_movie_data = pd.read_csv(output_prefix + '.csv')
    # na=False treats a missing suggestion as "no match".
    detail_matches = detail_movie_data['Suggestion'].str.contains(piracy_tag, na=False)
    # sort_values replaces sort_index(by=...), which is no longer part of
    # pandas. The stable sort keeps rows with the same number of typed
    # characters in file order, so the first row of the output is well defined.
    filtered_List = detail_movie_data[detail_matches].sort_values(
        'Character Typed', ascending=True, kind='mergesort')
    filtered_List.to_csv(
        output_prefix + '_Filtered.csv',
        header=["Order Number", 'Search Enginee', "Movie Name", "Sub Term", "Suggestion",
                "Date & Time", 'Character Typed'],
        index=False)

    summary_movie_data = pd.read_csv(output_prefix + '_Summary.csv')
    summary_matches = summary_movie_data['Suggestion'].str.contains(piracy_tag, na=False)
    Summary_filtered_List = summary_movie_data[summary_matches]
    Summary_filtered_List.to_csv(
        output_prefix + '_Summary_Filtered.csv',
        header=["Search Enginee", 'Suggestion', "Occurrence"],
        index=False)
In [264]:
# Run the whole collection and analysis pipeline for every movie in the list.
# The combined suggestions file is started with its header row first, because
# collect_complete_Suggestions only appends data rows to it.
with open(MY_FILE()+'OP_Movie_Search_Suggestions.csv', 'w+', newline='') as fp:
    csv.writer(fp, delimiter=',').writerow(
        ["Order Number", 'Search Enginee', "Movie Name", "Suggestion", "Date & Time"])

for movie_title in movies_List['Movie Title']:
    print ("Analysis For {} ".format(movie_title))
    collect_complete_Suggestions(movie_title)
    collect_detail_Suggestions(movie_title)
    perform_detailed_analysis(movie_title)
    filter_piracy_terms(movie_title)
print ("Analysis Completed")
In [265]:
# For every movie, derive from its filtered (piracy-related) suggestions:
#   * first occurrence - the first row of the filtered file, which
#     filter_piracy_terms orders by the number of characters typed;
#   * least order      - the row with the smallest position in the suggestion
#     list (ties resolved in favour of fewer typed characters);
#   * the total number of matching suggestions per search engine.
# Rows are collected in plain lists and turned into DataFrames once, instead of
# growing the frames with DataFrame.append (removed in pandas 2.0).
first_occurrence_rows = []
least_order_rows = []
for Movie_Name in movies_List['Movie Title']:
    filtered_data = pd.read_csv(MY_FILE()+'OP_'+Movie_Name+'_Filtered.csv')
    filtered_summary_data = pd.read_csv(MY_FILE()+'OP_'+Movie_Name+'_Summary_Filtered.csv')
    # Series indexed by engine name; an engine without matches is simply absent.
    engine_totals = filtered_summary_data.groupby('Search Enginee')['Occurrence'].sum()

    if filtered_data.empty or engine_totals.empty:
        print ("No Data for the movie:{} ".format(Movie_Name))
        # Numeric zeros (not the string '0') keep the columns numeric for the
        # sums and plots that follow.
        first_hit = {"Character Typed": 0, "Order Number": 0}
        best_hit = first_hit
    else:
        first_hit = filtered_data.iloc[0]
        # sort_values replaces sort_index(by=...), which is no longer part of pandas.
        best_hit = filtered_data.sort_values(
            ['Order Number', 'Character Typed'], kind='mergesort').iloc[0]

    first_occurrence_rows.append({
        "Movie Name": Movie_Name,
        "Character Typed": int(first_hit["Character Typed"]),
        "Order Number": int(first_hit["Order Number"]),
    })
    least_order_rows.append({
        "Movie Name": Movie_Name,
        "Character Typed": int(best_hit["Character Typed"]),
        "Order Number": int(best_hit["Order Number"]),
        "Total Bing Suggestion": int(engine_totals.get('Bing', 0)),
        "Total Google Suggestion": int(engine_totals.get('Google', 0)),
    })

first_occurrence = pd.DataFrame(
    first_occurrence_rows, columns=["Movie Name", "Character Typed", "Order Number"])
least_order = pd.DataFrame(
    least_order_rows,
    columns=["Movie Name", "Character Typed", "Order Number",
             "Total Bing Suggestion", "Total Google Suggestion"])

# The merge suffixes the shared columns: *_x = first occurrence, *_y = least order.
Final_dataframe = first_occurrence.merge(least_order, on='Movie Name')
Final_dataframe["Total Suggestions"] = (
    Final_dataframe["Total Bing Suggestion"] + Final_dataframe["Total Google Suggestion"])
Final_dataframe[["Movie Name", "Character Typed_x", "Order Number_x", "Order Number_y", "Character Typed_y",
                 "Total Bing Suggestion", "Total Google Suggestion", "Total Suggestions"]]
Out[265]:
In [266]:
# Bubble chart of the first occurrence of a filtered suggestion per movie:
# characters typed (x) against position in the suggestion list (y), sized by
# the total number of filtered suggestions.
# Learn about API authentication here: https://plot.ly/pandas/getting-started
# Find your api_key here: https://plot.ly/settings/api
# Cufflinks binds plotly to pandas dataframes in IPython notebook.
#sudo pip install cufflinks
#sudo pip install plotly
import os

import plotly.plotly as py
import cufflinks as cf
import pandas as pd

# Credentials come from the environment so that no API key is stored in the
# notebook. NOTE(review): the key that used to be hardcoded in this cell
# should be treated as leaked and regenerated.
# NOTE(review): when the variables are unset, sign_in is skipped and plotly is
# presumably left to use credentials saved on this machine -- confirm.
plotly_username = os.environ.get('PLOTLY_USERNAME')
plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if plotly_username and plotly_api_key:
    py.sign_in(plotly_username, plotly_api_key)

cf.set_config_file(offline=False, world_readable=True, theme='pearl')
Final_dataframe.iplot(kind='bubble', x='Character Typed_x', y='Order Number_x', size='Total Suggestions',
                      text='Movie Name', xTitle='Number of Characters at first occurrence',
                      yTitle='Position in suggestion list during first occurrence',
                      filename='cufflinks/simple-bubble-chart')
In [ ]:
# Bubble chart of the best-ranked filtered suggestion per movie: characters
# typed when it was seen (x) against its position in the suggestion list (y),
# sized by the total number of filtered suggestions.
# Learn about API authentication here: https://plot.ly/pandas/getting-started
# Find your api_key here: https://plot.ly/settings/api
# Cufflinks binds plotly to pandas dataframes in IPython notebook.
#sudo pip install cufflinks
#sudo pip install plotly
import os

import plotly.plotly as py
import cufflinks as cf
import pandas as pd

# Credentials come from the environment so that no API key is stored in the
# notebook. NOTE(review): the key that used to be hardcoded in this cell
# should be treated as leaked and regenerated.
# NOTE(review): when the variables are unset, sign_in is skipped and plotly is
# presumably left to use credentials saved on this machine -- confirm.
plotly_username = os.environ.get('PLOTLY_USERNAME')
plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if plotly_username and plotly_api_key:
    py.sign_in(plotly_username, plotly_api_key)

cf.set_config_file(offline=False, world_readable=True, theme='pearl')
# A filename of its own: publishing under 'cufflinks/simple-bubble-chart'
# would replace the chart of the same name instead of adding a second one.
Final_dataframe.iplot(kind='bubble', x='Character Typed_y', y='Order Number_y', size='Total Suggestions',
                      text='Movie Name', xTitle='Number of Characters used during top position',
                      yTitle='Top position suggested',
                      filename='cufflinks/top-position-bubble-chart')
In [ ]:
# Copy of the result table with short column names that are valid Python
# identifiers, so each column can be read with attribute access.
Final_plot = Final_dataframe.rename(
    columns={
        'Movie Name': 'movie_name',
        'Character Typed_x': 'char_x',
        'Order Number_x': 'order_x',
        'Character Typed_y': 'char_y',
        'Order Number_y': 'order_y',
    }
)
Final_plot
In [ ]:
# One scatter plot with two marker traces per movie: the (characters, position)
# pair of the *_x columns and the pair of the *_y columns. The x axis is
# logarithmic.
fig = dict(
    data=[
        dict(x=Final_plot.char_x, y=Final_plot.order_x, text=Final_plot.movie_name,
             mode='markers', name='Lowest Character VS Position'),
        dict(x=Final_plot.char_y, y=Final_plot.order_y, text=Final_plot.movie_name,
             mode='markers', name='Highest Position VS Character'),
    ],
    layout=dict(
        xaxis=dict(title='Number of Characters', type='log'),
        yaxis=dict(title="Position"),
    ),
)
py.iplot(fig, filename='cufflinks/multiple-scatter')
def custom_google_search(suggestion, timeout=10):
    """Search the web for ``suggestion`` and return the visible URLs of the hits.

    Parameters
    ----------
    suggestion : str
        The search text, e.g. one autocomplete suggestion.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    list
        The ``visibleUrl`` of every hit in the response; empty when the
        response carries no results. The list is also printed.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that
    # the ``urllib.parse`` submodule has been loaded.
    from urllib.parse import urlencode

    # NOTE(review): this AJAX Search endpoint is believed to have been retired
    # by Google -- confirm it still answers before relying on this cell.
    url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s' % urlencode(
        {'q': suggestion})
    search_response = requests.get(url, timeout=timeout)
    search_response.raise_for_status()
    results = search_response.json()
    # 'responseData' may be null (presumably when the API refuses the request
    # -- verify); treat that as "no hits" instead of failing on None.
    data = results.get('responseData') or {}
    visible_urls = [hit['visibleUrl'] for hit in data.get('results') or []]
    #print ('For more results, see %s' % data['cursor']['moreResultsUrl'])
    print (visible_urls)
    return visible_urls


# Look up every filtered suggestion of every movie. This loop is placed after
# the function definition so that it also works on a fresh kernel.
for Movie_Name in movies_List['Movie Title']:
    summary_data_for_results = pd.read_csv(MY_FILE()+'OP_'+Movie_Name+'_Summary_Filtered.csv')
    for suggestion in summary_data_for_results['Suggestion']:
        custom_google_search(suggestion)