Search Engine Autosuggestions' Impact on Movie Piracy

Final Project for JOUR 479/779: Understanding the impact of search engine autosuggestions on movie piracy


In [9]:
import requests
import urllib
import warnings
import pandas as pd
import datetime
import os,glob
import csv
import io
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Suppress every warning category for the rest of the session. Note that this
# also hides deprecation warnings emitted by the libraries imported above.
warnings.filterwarnings('ignore')

#Constant
def MY_FILE():
    """Return the directory prefix (with trailing slash) under which all output files are written."""
    output_prefix = "Data/May_2_week/"
    return output_prefix

In [10]:
# Make sure the output directory exists, then delete any "OP_*" files left
# behind by a previous run so this run starts from a clean slate.
output_dir = MY_FILE()
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

stale_outputs = glob.glob(output_dir + "OP_*")
for stale_path in stale_outputs:
    os.remove(stale_path)

In [11]:
# ----------------------------------------------------------------------------------------------------------------
# collect_autosuggestions
#
# parameters:
# "source" is either "google" or "bing"
# "tld" stands for "top level domain" and can be any of the 2-letter country codes listed 
#  here where google operates: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
# "lang" is the language of the suggestions returned, should be two letter codes 
# from here: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
# "query" is the query that you would like to see autocompleted
# ----------------------------------------------------------------------------------------------------------------

def collect_autosuggestions(source, tld, lang, query, timeout=10):
    """Fetch the autocomplete suggestions a search engine returns for ``query``.

    Parameters
    ----------
    source : str
        Which engine to ask: "google" or "bing".
    tld : str
        Top level domain / country code. For Google it is spliced into the
        host name; for Bing it is sent as the ``cc`` parameter.
    lang : str
        Language code sent to Google as ``hl``. It is not sent to Bing.
    query : str
        The (partial) search text to autocomplete.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up, so one
        stalled request cannot hang the whole notebook.

    Returns
    -------
    The second element of the JSON payload returned by the endpoint, which
    is where both endpoints place the suggestions.

    Raises
    ------
    ValueError
        If ``source`` is neither "google" nor "bing".
    requests.HTTPError
        If the endpoint answers with an error status code.
    """
    # Imported explicitly: a plain `import urllib` does not guarantee that the
    # `urllib.parse` submodule has been loaded.
    import urllib.parse

    if source == "google":
        # Some info on this api: http://shreyaschand.com/blog/2013/01/03/google-autocomplete-api/
        url = 'http://www.google.' + tld + '/complete/search?&client=firefox&' + urllib.parse.urlencode(
            {'q': query.encode('utf-8'), 'hl': lang})
    elif source == "bing":
        # Note: for Bing the language is controlled by the tld, so the lang parameter will have no effect on its own
        url = 'http://api.bing.com/osjson.aspx?' + urllib.parse.urlencode(
            {'query': query.encode('utf-8'), 'cc': tld})
    else:
        # Previously an unknown source fell through and crashed on the unbound `url`.
        raise ValueError('source must be "google" or "bing", got {!r}'.format(source))

    r = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of an obscure JSON decoding error.
    r.raise_for_status()
    suggestions = r.json()[1]
    return suggestions

In [12]:
# Load the TorrentFreak ranking; the file is read without a header row, so the
# column names are supplied here.
torrentfreak_columns = ["Ranking", 'Last Week Ranking', 'Movie Title', 'Movie Rating']
movies_List = pd.read_csv(
    "Final/TorrentFreak_April_16.csv",
    names=torrentfreak_columns,
    encoding="ISO-8859-1",
    low_memory=False,
)
movies_List


Out[12]:
Ranking Last Week Ranking Movie Title Movie Rating
0 1 -1 Star Wars: The Force Awakens 8.3
1 2 -2 Deadpool 8.6
2 3 -9 Batman v Superman: Dawn of Justice 7.5
3 4 -4 The Revenant 8.2
4 5 -3 Ice Age: The Great Egg-Scapade 6.1
5 6 0 Hail Caesar! 6.8
6 7 -6 Kung Fu Panda 3 8.0
7 8 0 Pandemic 4.6
8 9 -8 The Hateful Eight 8.0
9 10 -5 The 5th Wave 5.4

The top pirated movies according to TorrentFreak


In [13]:
# Collect the suggestions both search engines return for the complete movie name
def collect_complete_Suggestions(search_term):
    """Collect Google and Bing suggestions for a complete movie title and append them to the combined CSV.

    Parameters
    ----------
    search_term : str
        The full movie title to send to both search engines.

    Returns
    -------
    None. Rows are appended, without a header row, to
    ``MY_FILE() + 'OP_Movie_Search_Suggestions.csv'`` in the column order
    Order Number, Search Enginee, Movie Name, suggestion, datetime.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    # Query both engines first (Google, then Bing). The third item is the
    # maximum number of rows to keep; only the Bing list is capped at 10.
    fetched = [
        ("Google", collect_autosuggestions("google", "com", "en", search_term), None),
        ("Bing", collect_autosuggestions("bing", "com", "en", search_term), 10),
    ]

    frames = []
    for engine_label, suggestions, max_rows in fetched:
        frame = pd.DataFrame({"suggestion": suggestions})
        frame["Movie Name"] = search_term
        frame["Search Enginee"] = engine_label
        frame["datetime"] = datetime.datetime.now().strftime(timestamp_format)
        # 1-based position of the suggestion in the engine's list.
        frame["Order Number"] = frame.index + 1
        if max_rows is not None:
            frame = frame.head(max_rows)
        frames.append(frame)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    suggestions_df = pd.concat(frames, ignore_index=True)

    suggestions_df = suggestions_df[["Order Number", 'Search Enginee', "Movie Name", "suggestion", "datetime"]]
    suggestions_df.to_csv(MY_FILE()+'OP_Movie_Search_Suggestions.csv', mode='a', header=False, index=False)
#End of Function

In [14]:
# Collect the suggestions both search engines return for each typed prefix of the movie name
def collect_detail_Suggestions(movie_name):
    """Record the suggestions shown while a movie title is typed character by character.

    For every prefix of ``movie_name`` from 4 characters up to one character
    short of the full title, both engines are queried and the results are
    appended to ``MY_FILE() + 'OP_' + movie_name + '.csv'``. The file is
    (re)created with a header row first.

    Parameters
    ----------
    movie_name : str
        The full movie title; it is also used verbatim in the output file name.

    Returns
    -------
    None.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    output_columns = ["Order Number", 'Search Enginee', "Movie Name", "Sub Term", "suggestion", "datetime",
                      'Character_Typed']

    # Start a fresh file containing only the header row.
    file_name = MY_FILE()+'OP_'+movie_name+'.csv'
    with open(file_name, 'w+', newline='') as fp:
        header_writer = csv.writer(fp, delimiter=',')
        header_writer.writerow(["Order Number", 'Search Enginee', "Movie Name", "Sub Term", "Suggestion",
                                "Date & Time", 'Character Typed'])

    # Prefix lengths 4 .. len(movie_name) - 1, i.e. the same values the
    # original `enumerate(...)` / `if i > 3` loop produced.
    for chars_typed in range(4, len(movie_name)):
        title = movie_name[0:chars_typed]

        # Query both engines first (Google, then Bing). The third item is the
        # maximum number of rows to keep; only the Bing list is capped at 10.
        fetched = [
            ("Google", collect_autosuggestions("google", "com", "en", title), None),
            ("Bing", collect_autosuggestions("bing", "com", "en", title), 10),
        ]

        frames = []
        for engine_label, suggestions, max_rows in fetched:
            frame = pd.DataFrame({"suggestion": suggestions})
            frame["Movie Name"] = movie_name
            frame["Sub Term"] = title
            frame["Search Enginee"] = engine_label
            frame["datetime"] = datetime.datetime.now().strftime(timestamp_format)
            # 1-based position of the suggestion in the engine's list.
            frame["Order Number"] = frame.index + 1
            frame["Character_Typed"] = chars_typed
            if max_rows is not None:
                frame = frame.head(max_rows)
            frames.append(frame)

        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        suggestions_df = pd.concat(frames, ignore_index=True)
        suggestions_df = suggestions_df[output_columns]
        suggestions_df.to_csv(file_name, mode='a', header=False, index=False)
#End of Function

In [15]:
def perform_detailed_analysis(movie_name):
    """Count how often each suggestion appeared per search engine and save the summary.

    Reads ``MY_FILE() + 'OP_' + movie_name + '.csv'``, counts the rows for each
    (Search Enginee, Suggestion) pair, orders the counts from most to least
    frequent and writes them to ``..._Summary.csv`` with the header
    Search Enginee, Suggestion, Occurrence.

    Parameters
    ----------
    movie_name : str
        Movie title used to build the input and output file names.

    Returns
    -------
    None.
    """
    detailed_List = pd.read_csv(MY_FILE()+'OP_'+movie_name+'.csv')
    summary_list = detailed_List.groupby(['Search Enginee', 'Suggestion']).size().reset_index(name='occurrence')
    # sort_values replaces sort_index(by=...), which is no longer available in pandas.
    summary_list = summary_list.sort_values(by='occurrence', ascending=False)

    summary_path = MY_FILE()+'OP_'+movie_name+'_Summary.csv'
    # Write the header row first, then append the data rows without a header.
    with open(summary_path, 'w+', newline='') as fp:
        csv.writer(fp, delimiter=',').writerow(["Search Enginee", 'Suggestion', "Occurrence"])
    summary_list.to_csv(summary_path, mode='a', header=False, index=False)
#End of Function

In [16]:
def filter_piracy_terms(movie_name, piracy_tag="online|full movie|torrent|putlocker|free"):
    """Keep only the suggestions that match piracy-related terms and save them.

    Two files are produced from the movie's detail and summary CSVs:
    ``..._Filtered.csv`` (detail rows ordered by 'Character Typed', ascending)
    and ``..._Summary_Filtered.csv`` (summary rows, original order).

    Parameters
    ----------
    movie_name : str
        Movie title used to build the input and output file names.
    piracy_tag : str, optional
        Regular expression matched against the 'Suggestion' column; the
        default is the original set of terms.

    Returns
    -------
    None.
    """
    base_path = MY_FILE()+'OP_'+movie_name

    detail_movie_data = pd.read_csv(base_path+'.csv')
    # na=False treats missing suggestions as non-matching (same result as the old `== True`).
    detail_matches = detail_movie_data['Suggestion'].str.contains(piracy_tag, na=False)
    # sort_values replaces the removed sort_index(by=...). A stable sort keeps
    # rows with the same 'Character Typed' in their original file order, so the
    # first row of the output is deterministic.
    filtered_List = detail_movie_data[detail_matches].sort_values(
        by='Character Typed', ascending=True, kind='mergesort')

    filtered_path = base_path+'_Filtered.csv'
    with open(filtered_path, 'w+', newline='') as fp:
        csv.writer(fp, delimiter=',').writerow(
            ["Order Number", 'Search Enginee', "Movie Name", "Sub Term", "Suggestion", "Date & Time",
             'Character Typed'])
    filtered_List.to_csv(filtered_path, mode='a', header=False, index=False)

    summary_movie_data = pd.read_csv(base_path+'_Summary.csv')
    summary_matches = summary_movie_data['Suggestion'].str.contains(piracy_tag, na=False)
    Summary_filtered_List = summary_movie_data[summary_matches]

    summary_filtered_path = base_path+'_Summary_Filtered.csv'
    with open(summary_filtered_path, 'w+', newline='') as fp:
        csv.writer(fp, delimiter=',').writerow(["Search Enginee", 'Suggestion', "Occurrence"])
    Summary_filtered_List.to_csv(summary_filtered_path, mode='a', header=False, index=False)
#End of Function

In [17]:
# Run the full pipeline for every movie in the ranking.
# The combined suggestions file is created first with just its header row;
# collect_complete_Suggestions then appends to it for each movie.
combined_header = ["Order Number", 'Search Enginee', "Movie Name", "Suggestion", "Date & Time"]
with open(MY_FILE()+'OP_Movie_Search_Suggestions.csv', 'w+', newline='') as header_file:
    csv.writer(header_file, delimiter=',').writerow(combined_header)

for movie_title in movies_List['Movie Title']:
    print ("Analysis For {} ".format(movie_title))
    collect_complete_Suggestions(movie_title)
    collect_detail_Suggestions(movie_title)
    perform_detailed_analysis(movie_title)
    filter_piracy_terms(movie_title)
    print ("Analysis Completed")


Analysis For Star Wars: The Force Awakens 
Analysis Completed
Analysis For Deadpool 
Analysis Completed
Analysis For Batman v Superman: Dawn of Justice 
Analysis Completed
Analysis For The Revenant 
Analysis Completed
Analysis For Ice Age: The Great Egg-Scapade 
Analysis Completed
Analysis For Hail Caesar! 
Analysis Completed
Analysis For Kung Fu Panda 3 
Analysis Completed
Analysis For Pandemic 
Analysis Completed
Analysis For The Hateful Eight 
Analysis Completed
Analysis For The 5th Wave 
Analysis Completed

In [18]:
# Build one summary row per movie from the filtered (piracy-related) files:
#   * first occurrence -> the first row of the filtered file, which
#     filter_piracy_terms writes ordered by 'Character Typed' ascending
#   * least order      -> the row with the lowest 'Order Number'
#   * totals           -> summed 'Occurrence' per search engine
# Records are gathered in plain lists and turned into DataFrames once, instead
# of growing DataFrames with DataFrame.append (removed in pandas 2.0).
first_occurrence_records = []
least_order_records = []

for Movie_Name in movies_List['Movie Title']:
    filtered_data = pd.read_csv(MY_FILE()+'OP_'+Movie_Name+'_Filtered.csv')
    filtered_summary_data = pd.read_csv(MY_FILE()+'OP_'+Movie_Name+'_Summary_Filtered.csv')
    engine_totals = filtered_summary_data.groupby(by='Search Enginee')['Occurrence'].sum()

    if filtered_data.empty or engine_totals.empty:
        print ("No Data for the movie:{} ".format(Movie_Name))
        # Numeric zeros (not the strings '0') so the columns stay numeric for plotting.
        first_char, first_order = 0, 0
        best_char, best_order = 0, 0
        bing_total, google_total = 0, 0
    else:
        first_char = filtered_data['Character Typed'].iloc[0]
        first_order = filtered_data['Order Number'].iloc[0]

        # sort_values replaces the removed sort_index(by=...); the stable sort
        # keeps the earliest row among equal order numbers.
        by_position = filtered_data.sort_values(by='Order Number', ascending=True, kind='mergesort')
        best_char = by_position['Character Typed'].iloc[0]
        best_order = by_position['Order Number'].iloc[0]

        # An engine with no matching suggestion is absent from the totals -> 0.
        bing_total = engine_totals.get('Bing', 0)
        google_total = engine_totals.get('Google', 0)

    first_occurrence_records.append({"Movie Name": Movie_Name,
                                     "Character Typed": first_char,
                                     "Order Number": first_order})
    least_order_records.append({"Movie Name": Movie_Name,
                                "Character Typed": best_char,
                                "Order Number": best_order,
                                "Total Bing Suggestion": bing_total,
                                "Total Google Suggestion": google_total})

first_occurrence = pd.DataFrame(first_occurrence_records,
                                columns=["Movie Name", "Character Typed", "Order Number"])
least_order = pd.DataFrame(least_order_records,
                           columns=["Movie Name", "Character Typed", "Order Number",
                                    "Total Bing Suggestion", "Total Google Suggestion"])

# The merge suffixes the shared columns: _x = first occurrence, _y = least order.
Final_dataframe = first_occurrence.merge(least_order, on='Movie Name')
Final_dataframe["Total Suggestions"] = Final_dataframe["Total Bing Suggestion"] + Final_dataframe["Total Google Suggestion"]
Final_dataframe[["Movie Name", "Character Typed_x", "Order Number_x", "Order Number_y", "Character Typed_y",
                 "Total Bing Suggestion", "Total Google Suggestion", "Total Suggestions"]]


Out[18]:
Movie Name Character Typed_x Order Number_x Order Number_y Character Typed_y Total Bing Suggestion Total Google Suggestion Total Suggestions
0 Star Wars: The Force Awakens 5 10 2 18 12 35 47
1 Deadpool 4 6 3 5 19 4 23
2 Batman v Superman: Dawn of Justice 4 8 2 24 63 52 115
3 The Revenant 7 5 4 7 9 5 14
4 Ice Age: The Great Egg-Scapade 10 8 4 22 4 22 26
5 Hail Caesar! 6 6 6 6 6 0 6
6 Kung Fu Panda 3 4 5 2 14 35 1 36
7 Pandemic 4 3 3 4 1 0 1
8 The Hateful Eight 7 8 4 9 29 19 48
9 The 5th Wave 5 7 5 8 13 7 20

In [19]:
# Bubble chart: characters typed vs. list position at the FIRST piracy-related
# suggestion, with bubble size = total piracy-related suggestions.
#
# Learn about API authentication here: https://plot.ly/pandas/getting-started
# Find your api_key here: https://plot.ly/settings/api
# Cufflinks binds plotly to pandas dataframes in IPython notebook.
#
# Requirements (install once, outside the notebook): cufflinks, plotly
# NOTE(review): plotly 4+ moved `plotly.plotly` into the separate
# `chart_studio` package; this import needs an older plotly release.

import plotly.plotly as py
import cufflinks as cf

# Credentials are read from the environment instead of being hard-coded in the
# notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before running this cell;
# a KeyError naming the missing variable is raised otherwise. Any key that was
# previously committed in this notebook should be regenerated.
py.sign_in(os.environ['PLOTLY_USERNAME'], os.environ['PLOTLY_API_KEY'])
cf.set_config_file(offline=False, world_readable=True, theme='pearl')
Final_dataframe.iplot(kind='bubble', x='Character Typed_x', y='Order Number_x', size='Total Suggestions',
                      text='Movie Name', xTitle='Number of Characters at first occurrence',
                      yTitle='Position in suggestion list during first occurrence',
                      filename='cufflinks/simple-bubble-chart')


Out[19]:

In [20]:
# Bubble chart: characters typed vs. list position at the BEST (lowest)
# position a piracy-related suggestion reached, with bubble size = total
# piracy-related suggestions.
#
# Learn about API authentication here: https://plot.ly/pandas/getting-started
# Find your api_key here: https://plot.ly/settings/api
# Cufflinks binds plotly to pandas dataframes in IPython notebook.
#
# Requirements (install once, outside the notebook): cufflinks, plotly
# NOTE(review): plotly 4+ moved `plotly.plotly` into the separate
# `chart_studio` package; this import needs an older plotly release.

import plotly.plotly as py
import cufflinks as cf

# Credentials are read from the environment instead of being hard-coded in the
# notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before running this cell;
# a KeyError naming the missing variable is raised otherwise. Any key that was
# previously committed in this notebook should be regenerated.
py.sign_in(os.environ['PLOTLY_USERNAME'], os.environ['PLOTLY_API_KEY'])
cf.set_config_file(offline=False, world_readable=True, theme='pearl')
# A dedicated filename: reusing 'cufflinks/simple-bubble-chart' would overwrite
# the first-occurrence chart stored under that name in the Plotly account.
Final_dataframe.iplot(kind='bubble', x='Character Typed_y', y='Order Number_y', size='Total Suggestions',
                      text='Movie Name', xTitle='Number of Characters used during top position',
                      yTitle='Top position suggested',
                      filename='cufflinks/top-position-bubble-chart')


Out[20]:

In [21]:
# Give the merged columns short, attribute-friendly names for the scatter plot.
short_column_names = {
    'Movie Name': 'movie_name',
    'Character Typed_x': 'char_x',
    'Order Number_x': 'order_x',
    'Character Typed_y': 'char_y',
    'Order Number_y': 'order_y',
}
Final_plot = Final_dataframe.rename(columns=short_column_names)
Final_plot


Out[21]:
movie_name char_x order_x char_y order_y Total Bing Suggestion Total Google Suggestion Total Suggestions
0 Star Wars: The Force Awakens 5 10 18 2 12 35 47
1 Deadpool 4 6 5 3 19 4 23
2 Batman v Superman: Dawn of Justice 4 8 24 2 63 52 115
3 The Revenant 7 5 7 4 9 5 14
4 Ice Age: The Great Egg-Scapade 10 8 22 4 4 22 26
5 Hail Caesar! 6 6 6 6 6 0 6
6 Kung Fu Panda 3 4 5 14 2 35 1 36
7 Pandemic 4 3 4 3 1 0 1
8 The Hateful Eight 7 8 9 4 29 19 48
9 The 5th Wave 5 7 8 5 13 7 20

In [22]:
# Overlay both views in one scatter plot: one trace for the first-occurrence
# columns (char_x / order_x) and one for the best-position columns
# (char_y / order_y). The x axis is logarithmic.
scatter_specs = [
    (Final_plot.char_x, Final_plot.order_x, 'Lowest Character VS Position'),
    (Final_plot.char_y, Final_plot.order_y, 'Highest Position VS Character'),
]
traces = [
    {'x': x_values, 'y': y_values, 'text': Final_plot.movie_name,
     'mode': 'markers', 'name': trace_name}
    for x_values, y_values, trace_name in scatter_specs
]
layout = {
    'xaxis': {'title': 'Number of Characters', 'type': 'log'},
    'yaxis': {'title': "Position"},
}
fig = {'data': traces, 'layout': layout}
py.iplot(fig, filename='cufflinks/multiple-scatter')


Out[22]: