Search Engine Autosuggestions' Impact on Movie Piracy

Final Project for JOUR 479/779: Understanding the impact of search engine autosuggestions on movie piracy



In [9]:

    
import requests
import urllib
import warnings
import pandas as pd
import datetime
import os,glob
import csv
import io
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Suppress every warning for the remainder of the session.
warnings.simplefilter('ignore')


# Constant, exposed as a function so every cell reads the same value.
def MY_FILE():
    """Return the output directory prefix (trailing slash included) used to build CSV paths."""
    output_prefix = "Data/May_2_week/"
    return output_prefix



In [10]:

    
# Make sure the output directory exists, then delete any "OP_*" files
# left behind by an earlier run so this run starts from a clean slate.
output_dir = MY_FILE()
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for stale_path in glob.glob(output_dir+"OP_*"):
    os.remove(stale_path)



In [11]:

    
# ----------------------------------------------------------------------------------------------------------------
# collect_autosuggestions
# ----------------------------------------------------------------------------------------------------------------

def collect_autosuggestions(source, tld, lang, query):
    """Fetch the autocomplete suggestions a search engine offers for ``query``.

    Parameters
    ----------
    source : str
        Either "google" or "bing".
    tld : str
        "Top level domain". For Google it is appended to ``www.google.`` to
        build the host name; for Bing it is sent as the ``cc`` parameter.
        Country codes are listed at
        https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
    lang : str
        Two-letter language code for the suggestions
        (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). It is only
        sent to Google (as ``hl``); for Bing the language is controlled by
        ``tld``, so ``lang`` has no effect there.
    query : str
        The text to autocomplete.

    Returns
    -------
    The second element of the JSON document returned by the endpoint, which
    holds the suggestions.

    Raises
    ------
    ValueError
        If ``source`` is neither "google" nor "bing".
    requests.HTTPError
        If the endpoint answers with an error status code.
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that the
    # ``urllib.parse`` submodule is loaded.
    from urllib.parse import urlencode

    if source == "google":
        # Some info on this api: http://shreyaschand.com/blog/2013/01/03/google-autocomplete-api/
        url = 'http://www.google.'+tld+'/complete/search?&client=firefox&%s' % (
            urlencode({'q': query.encode('utf-8'), 'hl': lang}))
    elif source == "bing":
        url = 'http://api.bing.com/osjson.aspx?%s' % (
            urlencode({'query': query.encode('utf-8'), 'cc': tld}))
    else:
        # Previously an unknown source fell through and crashed later with an
        # UnboundLocalError on ``url``; fail early with a clear message instead.
        raise ValueError(
            'source must be "google" or "bing", got {!r}'.format(source))

    # A timeout keeps one stalled request from hanging the whole collection loop.
    r = requests.get(url, timeout=30)
    # Surface HTTP errors directly rather than as a confusing JSON decode failure.
    r.raise_for_status()
    suggestions = r.json()[1]
    return suggestions



In [12]:

    
# Load the TorrentFreak ranking. Column names are supplied through ``names``,
# so pandas treats the first line of the file as data rather than a header.
ranking_columns = ["Ranking", 'Last Week Ranking','Movie Title','Movie Rating']
movies_List = pd.read_csv(
    "Final/TorrentFreak_April_16.csv",
    names=ranking_columns,
    encoding="ISO-8859-1",
    low_memory=False,
)
movies_List









    Out[12]:






  
    
      
      Ranking
      Last Week Ranking
      Movie Title
      Movie Rating
    
  
  
    
      0
      1
      -1
      Star Wars: The Force Awakens
      8.3
    
    
      1
      2
      -2
      Deadpool
      8.6
    
    
      2
      3
      -9
      Batman v Superman: Dawn of Justice
      7.5
    
    
      3
      4
      -4
      The Revenant
      8.2
    
    
      4
      5
      -3
      Ice Age: The Great Egg-Scapade
      6.1
    
    
      5
      6
      0
      Hail Caesar!
      6.8
    
    
      6
      7
      -6
      Kung Fu Panda 3
      8.0
    
    
      7
      8
      0
      Pandemic
      4.6
    
    
      8
      9
      -8
      The Hateful Eight
      8.0
    
    
      9
      10
      -5
      The 5th Wave
      5.4

The Top Pirated Movies According to TorrentFreak



In [13]:

    
#Method to collect complete suggestions for the movie name
def collect_complete_Suggestions(search_term):
    """Query Google and Bing with the full ``search_term`` and log the suggestions.

    One row per suggestion is appended (without a header) to
    ``OP_Movie_Search_Suggestions.csv`` inside the ``MY_FILE()`` directory,
    with the columns Order Number, Search Enginee, Movie Name, suggestion and
    datetime. Google rows are written before Bing rows.

    Parameters
    ----------
    search_term : str
        The text sent to both engines; also stored in the "Movie Name" column.

    Returns
    -------
    None
    """
    engine_frames = []
    for source, engine_label in (("google", "Google"), ("bing", "Bing")):
        suggestions = collect_autosuggestions(source, "com", "en", search_term)

        engine_df = pd.DataFrame({"suggestion": suggestions})
        engine_df["Movie Name"] = search_term
        engine_df["Search Enginee"] = engine_label
        engine_df["datetime"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # 1-based position of the suggestion in the engine's list.
        engine_df["Order Number"] = engine_df.index + 1

        if engine_label == "Bing":
            # Only the Bing list is capped at its first 10 entries.
            engine_df = engine_df[engine_df.index < 10]

        engine_frames.append(engine_df)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    suggestions_df = pd.concat(engine_frames, ignore_index=True)

    suggestions_df = suggestions_df[["Order Number",'Search Enginee',"Movie Name","suggestion","datetime"]]
    suggestions_df.to_csv(MY_FILE()+'OP_Movie_Search_Suggestions.csv',mode = 'a',header=False,index=False)
#End of Function



In [14]:

    
#Method to collect detail suggestions for the movie name
def collect_detail_Suggestions(movie_name):
    """Record the suggestions offered while ``movie_name`` is typed character by character.

    ``OP_<movie_name>.csv`` in the ``MY_FILE()`` directory is (re)created with a
    header row. Then, for every prefix of ``movie_name`` that is at least 4
    characters long and shorter than the full title, Google and Bing are
    queried and one row per suggestion is appended. The full title itself is
    not queried here.

    Parameters
    ----------
    movie_name : str
        Title whose prefixes are sent to the engines; also used in the
        output file name.

    Returns
    -------
    None
    """
    #Writing Headers
    file_name = MY_FILE()+'OP_'+movie_name+'.csv'
    with open(file_name, 'w+', newline='') as fp:
        a = csv.writer(fp, delimiter=',')
        data = [["Order Number",'Search Enginee', "Movie Name","Sub Term","Suggestion","Date & Time",'Character Typed']]
        a.writerows(data)

    output_columns = ["Order Number",'Search Enginee', "Movie Name","Sub Term","suggestion","datetime"
                      ,'Character_Typed']

    # ``typed`` is the number of characters typed so far, i.e. the prefix length.
    for typed in range(4, len(movie_name)):
        title = movie_name[0:typed]

        engine_frames = []
        for source, engine_label in (("google", "Google"), ("bing", "Bing")):
            suggestions = collect_autosuggestions(source, "com", "en", title)

            engine_df = pd.DataFrame({"suggestion": suggestions})
            engine_df["Movie Name"] = movie_name
            engine_df["Sub Term"] = title
            engine_df["Search Enginee"] = engine_label
            engine_df["datetime"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            # 1-based position of the suggestion in the engine's list.
            engine_df["Order Number"] = engine_df.index + 1
            engine_df["Character_Typed"] = typed

            if engine_label == "Bing":
                # Only the Bing list is capped at its first 10 entries.
                engine_df = engine_df[engine_df.index < 10]

            engine_frames.append(engine_df)

        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        suggestions_df = pd.concat(engine_frames, ignore_index=True)
        suggestions_df = suggestions_df[output_columns]
        suggestions_df.to_csv(file_name,mode = 'a',header=False,index=False)
#End of Function



In [15]:

    
def perform_detailed_analysis(movie_name):
    """Count how often each suggestion appeared per search engine for one movie.

    Reads ``OP_<movie_name>.csv`` from the ``MY_FILE()`` directory, counts the
    rows for every (Search Enginee, Suggestion) pair, sorts the counts in
    descending order and writes them to ``OP_<movie_name>_Summary.csv`` with
    the header ``Search Enginee,Suggestion,Occurrence``.

    Parameters
    ----------
    movie_name : str
        Title used to build the input and output file names.

    Returns
    -------
    None
    """
    detailed_List = pd.read_csv(MY_FILE()+'OP_'+movie_name+'.csv')
    summary_list = (
        detailed_List
        .groupby(['Search Enginee', 'Suggestion'])
        .size()
        .reset_index(name='Occurrence')
        # sort_values replaces sort_index(by=...), which newer pandas no longer accepts.
        .sort_values(by=['Occurrence'], ascending=[False])
    )

    # The column names already match the expected header, so header and data
    # are written in a single pass.
    summary_list.to_csv(MY_FILE()+'OP_'+movie_name+'_Summary.csv', index=False)
#End of Function



In [16]:

    
def filter_piracy_terms(movie_name):
    """Keep only the suggestions that contain a piracy-related term.

    Two files are produced in the ``MY_FILE()`` directory:

    * ``OP_<movie_name>_Filtered.csv`` - rows of ``OP_<movie_name>.csv`` whose
      Suggestion matches the piracy pattern, sorted by "Character Typed"
      ascending.
    * ``OP_<movie_name>_Summary_Filtered.csv`` - rows of
      ``OP_<movie_name>_Summary.csv`` whose Suggestion matches the pattern,
      in their existing order.

    Both outputs keep the header of the file they were derived from.

    Parameters
    ----------
    movie_name : str
        Title used to build the input and output file names.

    Returns
    -------
    None
    """
    base_path = MY_FILE()+'OP_'+movie_name
    # Regular-expression alternation; the match is case-sensitive.
    piracy_tag = "online|full movie|torrent|putlocker|free"

    detail_movie_data = pd.read_csv(base_path+'.csv')
    # na=False treats missing suggestions as non-matching (replaces the "== True" idiom).
    detail_matches = detail_movie_data['Suggestion'].str.contains(piracy_tag, na=False)
    # sort_values replaces sort_index(by=...), which newer pandas no longer accepts.
    # A stable sort keeps rows with the same "Character Typed" in file order,
    # so the first row of the output is reproducible.
    filtered_List = detail_movie_data[detail_matches].sort_values(
        by=['Character Typed'], ascending=[True], kind='mergesort')
    filtered_List.to_csv(base_path+'_Filtered.csv', index=False)

    summary_movie_data = pd.read_csv(base_path+'_Summary.csv')
    summary_matches = summary_movie_data['Suggestion'].str.contains(piracy_tag, na=False)
    Summary_filtered_List = summary_movie_data[summary_matches]
    Summary_filtered_List.to_csv(base_path+'_Summary_Filtered.csv', index=False)
#End of Function



In [17]:

    
# Start the combined suggestions file with its header row, then run the whole
# pipeline (collect -> detail -> summarise -> filter) once per movie title.
header_row = ["Order Number",'Search Enginee',"Movie Name","Suggestion","Date & Time"]
with open(MY_FILE()+'OP_Movie_Search_Suggestions.csv', 'w+', newline='') as header_file:
    csv.writer(header_file, delimiter=',').writerow(header_row)

pipeline_steps = (
    collect_complete_Suggestions,
    collect_detail_Suggestions,
    perform_detailed_analysis,
    filter_piracy_terms,
)

for movie_title in movies_List['Movie Title']:
    print ("Analysis For {} ".format(movie_title))
    for step in pipeline_steps:
        step(movie_title)
    print ("Analysis Completed")









    



Analysis For Star Wars: The Force Awakens 
Analysis Completed
Analysis For Deadpool 
Analysis Completed
Analysis For Batman v Superman: Dawn of Justice 
Analysis Completed
Analysis For The Revenant 
Analysis Completed
Analysis For Ice Age: The Great Egg-Scapade 
Analysis Completed
Analysis For Hail Caesar! 
Analysis Completed
Analysis For Kung Fu Panda 3 
Analysis Completed
Analysis For Pandemic 
Analysis Completed
Analysis For The Hateful Eight 
Analysis Completed
Analysis For The 5th Wave 
Analysis Completed



In [18]:

    
# Build one summary row per movie from the filtered CSV files:
#   * first occurrence - the first row of OP_<movie>_Filtered.csv
#     (that file is written sorted by "Character Typed" ascending),
#   * best position    - the row with the smallest "Order Number",
#   * per-engine totals of piracy-related suggestions.
# Rows are collected in plain lists and turned into DataFrames once, which
# avoids DataFrame.append (removed in pandas 2.0) and assignments on
# head(1) slices.
first_occurrence_rows = []
least_order_rows = []

for Movie_Name in movies_List['Movie Title']:
    filtered_data = pd.read_csv(MY_FILE()+'OP_'+Movie_Name+'_Filtered.csv')
    filtered_summary_data = pd.read_csv(MY_FILE()+'OP_'+Movie_Name+'_Summary_Filtered.csv')

    # Series indexed by engine name; an engine with no matches is simply absent.
    engine_totals = filtered_summary_data.groupby(by=['Search Enginee'])['Occurrence'].sum()

    if filtered_data.empty:
        print ("No Data for the movie:{} ".format(Movie_Name))
        # Numeric zero placeholders keep the columns numeric for plotting.
        first_seen = {"Character Typed": 0, "Order Number": 0}
        best_ranked = first_seen
    else:
        first_seen = filtered_data.iloc[0]
        # Stable sort: ties on "Order Number" keep their order from the file.
        best_ranked = filtered_data.sort_values(
            by=['Order Number'], ascending=[True], kind='mergesort').iloc[0]

    first_occurrence_rows.append({
        "Movie Name": Movie_Name,
        "Character Typed": first_seen["Character Typed"],
        "Order Number": first_seen["Order Number"],
    })
    least_order_rows.append({
        "Movie Name": Movie_Name,
        "Character Typed": best_ranked["Character Typed"],
        "Order Number": best_ranked["Order Number"],
        "Total Bing Suggestion": engine_totals.get('Bing', 0),
        "Total Google Suggestion": engine_totals.get('Google', 0),
    })

first_occurrence = pd.DataFrame(
    first_occurrence_rows, columns=["Movie Name","Character Typed","Order Number"])
least_order = pd.DataFrame(
    least_order_rows, columns=["Movie Name","Character Typed","Order Number","Total Bing Suggestion"
                               ,"Total Google Suggestion"])

# Suffix _x = first occurrence, suffix _y = best (lowest) position.
Final_dataframe = first_occurrence.merge(least_order,on='Movie Name')
Final_dataframe["Total Suggestions"] = Final_dataframe["Total Bing Suggestion"] + Final_dataframe["Total Google Suggestion"]
Final_dataframe[["Movie Name","Character Typed_x","Order Number_x","Order Number_y","Character Typed_y",
                "Total Bing Suggestion","Total Google Suggestion","Total Suggestions"]]









    Out[18]:






  
    
      
      Movie Name
      Character Typed_x
      Order Number_x
      Order Number_y
      Character Typed_y
      Total Bing Suggestion
      Total Google Suggestion
      Total Suggestions
    
  
  
    
      0
      Star Wars: The Force Awakens
      5
      10
      2
      18
      12
      35
      47
    
    
      1
      Deadpool
      4
      6
      3
      5
      19
      4
      23
    
    
      2
      Batman v Superman: Dawn of Justice
      4
      8
      2
      24
      63
      52
      115
    
    
      3
      The Revenant
      7
      5
      4
      7
      9
      5
      14
    
    
      4
      Ice Age: The Great Egg-Scapade
      10
      8
      4
      22
      4
      22
      26
    
    
      5
      Hail Caesar!
      6
      6
      6
      6
      6
      0
      6
    
    
      6
      Kung Fu Panda 3
      4
      5
      2
      14
      35
      1
      36
    
    
      7
      Pandemic
      4
      3
      3
      4
      1
      0
      1
    
    
      8
      The Hateful Eight
      7
      8
      4
      9
      29
      19
      48
    
    
      9
      The 5th Wave
      5
      7
      5
      8
      13
      7
      20



In [19]:

    
# Bubble chart: characters typed vs. list position at the FIRST piracy-related
# suggestion; bubble size is the total number of such suggestions.
#
# Learn about API authentication here: https://plot.ly/pandas/getting-started
# Find your api_key here: https://plot.ly/settings/api
# Cufflinks binds plotly to pandas dataframes in IPython notebook.

#sudo pip install cufflinks
#sudo pip install plotly

import plotly.plotly as py
import cufflinks as cf
import pandas as pd

# Credentials are read from the environment so the API key is never stored in
# the notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before running; a
# missing variable raises KeyError here.
# NOTE(review): the key that used to be hard-coded in this cell should be
# treated as leaked and regenerated.
py.sign_in(os.environ['PLOTLY_USERNAME'], os.environ['PLOTLY_API_KEY'])
cf.set_config_file(offline=False, world_readable=True, theme='pearl')
Final_dataframe.iplot(kind='bubble', x='Character Typed_x', y='Order Number_x', size='Total Suggestions',
                      text='Movie Name', xTitle='Number of Characters at first occurrence',
                      yTitle='Position in suggestion list during first occurrence',
             filename='cufflinks/simple-bubble-chart')









    Out[19]:



In [20]:

    
# Bubble chart: characters typed vs. list position at the BEST (lowest)
# position of a piracy-related suggestion; bubble size is the total number of
# such suggestions.
#
# Learn about API authentication here: https://plot.ly/pandas/getting-started
# Find your api_key here: https://plot.ly/settings/api
# Cufflinks binds plotly to pandas dataframes in IPython notebook.

#sudo pip install cufflinks
#sudo pip install plotly

import plotly.plotly as py
import cufflinks as cf
import pandas as pd

# Credentials are read from the environment so the API key is never stored in
# the notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before running; a
# missing variable raises KeyError here.
# NOTE(review): the key that used to be hard-coded in this cell should be
# treated as leaked and regenerated.
py.sign_in(os.environ['PLOTLY_USERNAME'], os.environ['PLOTLY_API_KEY'])
cf.set_config_file(offline=False, world_readable=True, theme='pearl')
# This chart gets its own filename: it previously reused
# 'cufflinks/simple-bubble-chart', the name the first-occurrence chart is
# uploaded under, so the two uploads collided.
Final_dataframe.iplot(kind='bubble', x='Character Typed_y', y='Order Number_y', size='Total Suggestions',
                      text='Movie Name', xTitle='Number of Characters used during top position',
                      yTitle='Top position suggested',
             filename='cufflinks/top-position-bubble-chart')









    Out[20]:



In [21]:

    
# Give the merged columns short, identifier-friendly names so they can be
# reached with attribute access (e.g. Final_plot.char_x).
short_column_names = {
    'Movie Name': 'movie_name',
    'Character Typed_x': 'char_x',
    'Order Number_x': 'order_x',
    'Character Typed_y': 'char_y',
    'Order Number_y': 'order_y',
}
Final_plot = Final_dataframe.rename(columns=short_column_names)
Final_plot









    Out[21]:






  
    
      
      movie_name
      char_x
      order_x
      char_y
      order_y
      Total Bing Suggestion
      Total Google Suggestion
      Total Suggestions
    
  
  
    
      0
      Star Wars: The Force Awakens
      5
      10
      18
      2
      12
      35
      47
    
    
      1
      Deadpool
      4
      6
      5
      3
      19
      4
      23
    
    
      2
      Batman v Superman: Dawn of Justice
      4
      8
      24
      2
      63
      52
      115
    
    
      3
      The Revenant
      7
      5
      7
      4
      9
      5
      14
    
    
      4
      Ice Age: The Great Egg-Scapade
      10
      8
      22
      4
      4
      22
      26
    
    
      5
      Hail Caesar!
      6
      6
      6
      6
      6
      0
      6
    
    
      6
      Kung Fu Panda 3
      4
      5
      14
      2
      35
      1
      36
    
    
      7
      Pandemic
      4
      3
      4
      3
      1
      0
      1
    
    
      8
      The Hateful Eight
      7
      8
      9
      4
      29
      19
      48
    
    
      9
      The 5th Wave
      5
      7
      8
      5
      13
      7
      20



In [22]:

    
# Overlay both views in one scatter plot: (char_x, order_x) is the first
# occurrence, (char_y, order_y) the best position. The x axis is logarithmic.
hover_labels = Final_plot.movie_name

first_occurrence_trace = dict(
    x=Final_plot.char_x,
    y=Final_plot.order_x,
    text=hover_labels,
    mode='markers',
    name='Lowest Character VS Position',
)
top_position_trace = dict(
    x=Final_plot.char_y,
    y=Final_plot.order_y,
    text=hover_labels,
    mode='markers',
    name='Highest Position VS Character',
)
scatter_layout = dict(
    xaxis=dict(title='Number of Characters', type='log'),
    yaxis=dict(title="Position"),
)

fig = dict(data=[first_occurrence_trace, top_position_trace], layout=scatter_layout)
py.iplot(fig, filename='cufflinks/multiple-scatter')









    Out[22]:

	Ranking	Last Week Ranking	Movie Title	Movie Rating
0	1	-1	Star Wars: The Force Awakens	8.3
1	2	-2	Deadpool	8.6
2	3	-9	Batman v Superman: Dawn of Justice	7.5
3	4	-4	The Revenant	8.2
4	5	-3	Ice Age: The Great Egg-Scapade	6.1
5	6	0	Hail Caesar!	6.8
6	7	-6	Kung Fu Panda 3	8.0
7	8	0	Pandemic	4.6
8	9	-8	The Hateful Eight	8.0
9	10	-5	The 5th Wave	5.4