Search Engine Autosuggestions' Impact on Movie Piracy

Final Project for JOUR 479/779: Understanding the impact of search engine autosuggestions on movie piracy

import requests
import urllib
import warnings
import pandas as pd
import datetime
import os,glob
import csv
import io
import numpy as np
import matplotlib
import matplotlib.pyplot as plt


def MY_FILE():
    """Return the relative directory (with trailing slash) used for this run's data files."""
    data_directory = "Data/May_2_week/"
    return data_directory

# NOTE(review): the bodies of the two statements below were missing from the
# source (the bare `if` / `for` headers do not parse). They are reconstructed
# from context -- confirm against the original notebook.

# Make sure the output directory exists before any result file is written.
if not os.path.exists(MY_FILE()):
    os.makedirs(MY_FILE())
# Drop the "OP_*" output files of a previous run: later steps append to these
# files, so stale content would otherwise be mixed into the new results.
for filename in glob.glob(MY_FILE()+"OP_*"):
    if os.path.isfile(filename):
        os.remove(filename)

# ----------------------------------------------------------------------------------------------------------------
# collect_autosuggestions
# parameters:
# "source" is either "google" or "bing"
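# "tld" stands for "top level domain" and selects the country-specific Google domain
#  to query (e.g. "com", as used below); the link to the list of domains was lost
# "lang" is the language of the suggestions returned; it should be a two-letter
#  language code (e.g. "en", as used below); the link to the list of codes was lost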
# "query" is the query that you would like to see autocompleted
# ----------------------------------------------------------------------------------------------------------------

def collect_autosuggestions(source, tld, lang, query):
    """Fetch the autocomplete suggestions a search engine returns for a query.

    Args:
        source: "google" or "bing".
        tld: Domain suffix / country code. It selects the Google host
            ("www.google.<tld>") and is sent to Bing as the "cc" parameter.
        lang: Language code sent to Google as "hl". It is not sent to Bing,
            where the language is controlled by ``tld``.
        query: The text to be autocompleted.

    Returns:
        The second element of the JSON response, which holds the
        suggestions for ``query``.

    Raises:
        ValueError: If ``source`` is neither "google" nor "bing".
        requests.RequestException: If the request fails, times out or
            returns an HTTP error status.
    """
    # Imported here so that urllib.parse is guaranteed to be loaded; a bare
    # "import urllib" does not import the submodule by itself.
    from urllib.parse import urlencode

    # NOTE(review): the endpoint URLs were stripped from the source (the Bing
    # URL was an empty string, which makes the "%" formatting raise
    # TypeError). The hosts below are the commonly used suggestion endpoints
    # -- confirm against the original notebook.
    if source == "google":
        url = ('http://www.google.' + tld + '/complete/search?&client=firefox&'
               + urlencode({'q': query.encode('utf-8'), 'hl': lang}))
    elif source == "bing":
        # Note: for Bing the language is controlled by the tld, so the lang
        # parameter will have no effect on its own
        url = ('http://api.bing.com/osjson.aspx?'
               + urlencode({'query': query.encode('utf-8'), 'cc': tld}))
    else:
        # Previously an unknown source fell through and failed later with an
        # unbound "url" variable.
        raise ValueError(
            'source must be "google" or "bing", got {!r}'.format(source))

    # A timeout keeps one stalled request from hanging the whole collection.
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    suggestions = r.json()[1]
    return suggestions

# Load the TorrentFreak ranking. Column names are supplied explicitly, so
# every row of the file is read as data.
movies_List = pd.read_csv(
    "Final/TorrentFreak_April_16.csv",
    names=["Ranking", "Last Week Ranking", "Movie Title", "Movie Rating"],
    encoding="ISO-8859-1",
    low_memory=False,
)

Ranking Last Week Ranking Movie Title Movie Rating
0 1 -1 Star Wars: The Force Awakens 8.3
1 2 -2 Deadpool 8.6
2 3 -9 Batman v Superman: Dawn of Justice 7.5
3 4 -4 The Revenant 8.2
4 5 -3 Ice Age: The Great Egg-Scapade 6.1
5 6 0 Hail Caesar! 6.8
6 7 -6 Kung Fu Panda 3 8.0
7 8 0 Pandemic 4.6
8 9 -8 The Hateful Eight 8.0
9 10 -5 The 5th Wave 5.4

The Top Pirated Movies According to TorrentFreak

#Method to collect complete suggestions for the movie name
def collect_complete_Suggestions(search_term):
    """Record the Google and Bing suggestions for a complete movie title.

    Queries both engines for ``search_term`` and appends one row per
    suggestion to ``OP_Movie_Search_Suggestions.csv`` in the data directory.
    Rows are written without a header, in the column order
    Order Number, Search Enginee, Movie Name, suggestion, datetime.

    Args:
        search_term: The movie title to look up.
    """
    frames = []
    for source, engine_label in (("google", "Google"), ("bing", "Bing")):
        suggestions = collect_autosuggestions(source, "com", "en", search_term)
        frame = pd.DataFrame({"suggestion": suggestions})
        frame["Movie Name"] = search_term
        frame["Search Enginee"] = engine_label
        # The strftime call had lost its datetime argument in the source;
        # the time of collection is what is recorded.
        frame["datetime"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # 1-based position of the suggestion in the engine's list.
        frame["Order Number"] = frame.index + 1
        if source == "bing":
            # Only the first ten Bing suggestions are kept.
            frame = frame[frame.index < 10]
        frames.append(frame)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    suggestions_df = pd.concat(frames, ignore_index=True)
    suggestions_df = suggestions_df[["Order Number",'Search Enginee',"Movie Name","suggestion","datetime"]]
    suggestions_df.to_csv(MY_FILE()+'OP_Movie_Search_Suggestions.csv',mode = 'a',header=False,index=False)
#End of Function

#Method to collect detail suggestions for the movie name
def collect_detail_Suggestions(movie_name):
    """Record the suggestions shown while a movie title is typed out.

    For every prefix of ``movie_name`` that is at least four characters long
    and shorter than the full title, both engines are queried. All
    suggestions are written, with a header row, to ``OP_<movie_name>.csv``
    in the data directory, replacing any existing file.

    Args:
        movie_name: The movie title whose prefixes are queried.
    """
    file_name = MY_FILE()+'OP_'+movie_name+'.csv'
    columns = ["Order Number",'Search Enginee', "Movie Name","Sub Term","Suggestion","Date & Time",'Character Typed']

    frames = []
    # Prefixes of length 4 .. len(movie_name) - 1, as in the original loop;
    # the complete title is handled by collect_complete_Suggestions.
    for i in range(4, len(movie_name)):
        # The source referenced an undefined name ("title_Name") here.
        title = movie_name[0:i]
        for source, engine_label in (("google", "Google"), ("bing", "Bing")):
            suggestions = collect_autosuggestions(source, "com", "en", title)
            frame = pd.DataFrame({"Suggestion": suggestions})
            frame["Movie Name"] = movie_name
            frame["Sub Term"] = title
            frame["Search Enginee"] = engine_label
            frame["Date & Time"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            # 1-based position of the suggestion in the engine's list.
            frame["Order Number"] = frame.index + 1
            # NOTE(review): the header declares a "Character Typed" column but
            # the assignment was missing from the source; the prefix length is
            # assumed -- confirm against the original notebook.
            frame["Character Typed"] = i
            if source == "bing":
                # Only the first ten Bing suggestions are kept.
                frame = frame[frame.index < 10]
            frames.append(frame)

    if frames:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        suggestions_df = pd.concat(frames, ignore_index=True)[columns]
    else:
        # Titles of four characters or fewer yield no prefixes; still write
        # the header so that later steps can read the file.
        suggestions_df = pd.DataFrame(columns=columns)
    # The header is written with the data; the source built the header list
    # but never wrote it, which left the file without column names.
    suggestions_df.to_csv(file_name, index=False)
#End of Function

def perform_detailed_analysis(movie_name):
    """Count how often each suggestion appeared per search engine.

    Reads ``OP_<movie_name>.csv`` from the data directory, counts the rows
    for every (Search Enginee, Suggestion) pair and writes the counts, most
    frequent first, to ``OP_<movie_name>_Summary.csv`` with the header
    Search Enginee, Suggestion, Occurrence.

    Args:
        movie_name: The movie title whose detail file is summarised.
    """
    base_name = MY_FILE()+'OP_'+movie_name
    detailed_List = pd.read_csv(base_name+'.csv')
    # The count column is named "Occurrence" so that the written header
    # matches the column name later steps read from this file.
    summary_list = (detailed_List
                    .groupby(['Search Enginee', 'Suggestion'])
                    .size()
                    .reset_index(name='Occurrence'))
    # sort_values replaces sort_index(by=...), which was removed from pandas.
    summary_list = summary_list.sort_values(by=['Occurrence'], ascending=[False])
    # Header and rows are written together; the source built a header list
    # but never wrote it to the file.
    summary_list.to_csv(base_name+'_Summary.csv', index=False)
#End of Function

def filter_piracy_terms(movie_name):
    """Keep only the suggestions that contain a piracy-related term.

    Filters both ``OP_<movie_name>.csv`` and ``OP_<movie_name>_Summary.csv``
    on the "Suggestion" column and writes the matches to
    ``OP_<movie_name>_Filtered.csv`` (ordered by "Character Typed") and
    ``OP_<movie_name>_Summary_Filtered.csv``. Both outputs carry the column
    names of their input file as header.

    Args:
        movie_name: The movie title whose files are filtered.
    """
    base_name = MY_FILE()+'OP_'+movie_name
    # Regular-expression alternation; the match is case-sensitive.
    piracy_tag = "online|full movie|torrent|putlocker|free"

    detail_movie_data = pd.read_csv(base_name+'.csv')
    # na=False treats missing suggestions as non-matches (the source used
    # "== True" for the same purpose).
    detail_matches = detail_movie_data['Suggestion'].str.contains(piracy_tag, na=False)
    # sort_values replaces sort_index(by=...), which was removed from pandas.
    # A stable sort keeps the file order among rows with equal
    # "Character Typed".
    filtered_List = detail_movie_data[detail_matches].sort_values(
        by=['Character Typed'], ascending=[True], kind='mergesort')
    # Header and rows are written together; the source built header lists
    # but never wrote them to the files.
    filtered_List.to_csv(base_name+'_Filtered.csv', index=False)

    summary_movie_data = pd.read_csv(base_name+'_Summary.csv')
    summary_matches = summary_movie_data['Suggestion'].str.contains(piracy_tag, na=False)
    Summary_filtered_List = summary_movie_data[summary_matches]
    Summary_filtered_List.to_csv(base_name+'_Summary_Filtered.csv', index=False)
#End of Function

#Looping through each movie to analyse search engine suggestions
#Writing Headers
# The source built the header list but never wrote it, so the file was only
# truncated; the header row is now actually written.
with open(MY_FILE()+'OP_Movie_Search_Suggestions.csv', 'w', newline='') as fp:
    a = csv.writer(fp, delimiter=',')
    a.writerow(["Order Number",'Search Enginee',"Movie Name","Suggestion","Date & Time"])
# Run the full pipeline for every movie: collect the suggestions for the
# complete title and for its prefixes, then summarise and filter them.
for index, row in movies_List.iterrows():
    print ("Analysis For {} ".format(row['Movie Title']))
    collect_complete_Suggestions(row['Movie Title'])
    collect_detail_Suggestions(row['Movie Title'])
    perform_detailed_analysis(row['Movie Title'])
    filter_piracy_terms(row['Movie Title'])
    print ("Analysis Completed")

# Build one row per movie that combines
#   * the first row of the filtered file (suffix "_x" after the merge),
#   * the filtered row with the smallest "Order Number" (suffix "_y"),
#   * the total number of filtered suggestions per search engine.
#
# NOTE(review): the branch structure of this cell was damaged in the source
# (missing "else:" lines, contradictory assignments). The per-engine totals
# are reconstructed as "sum of Occurrence for that engine, 0 if the engine
# has no filtered suggestion" -- confirm against the original notebook.
first_occurrence_rows = []
least_order_rows = []
least_order_columns = ["Movie Name","Character Typed","Order Number","Total Bing Suggestion"
                       ,"Total Google Suggestion"]

for index, row in movies_List.iterrows():
    Movie_Name = row['Movie Title']
    filtered_data = pd.read_csv(MY_FILE()+'OP_'+Movie_Name+'_Filtered.csv')
    filtered_summary_data = pd.read_csv(MY_FILE()+'OP_'+Movie_Name+'_Summary_Filtered.csv')
    if filtered_data.empty or filtered_summary_data.empty:
        # Nothing matched the filter for this movie, so it contributes no row.
        print ("No Data for the movie:{} ".format(Movie_Name))
        continue

    # First row of the filtered file as stored on disk.
    first_occurrence_rows.append(filtered_data.head(1))

    # Row with the smallest "Order Number". sort_values replaces
    # sort_index(by=...), which was removed from pandas; the stable sort keeps
    # file order among ties. copy() avoids assigning into a slice of
    # filtered_data.
    first_row = filtered_data.sort_values(
        by=['Order Number'], ascending=[True], kind='mergesort').head(1).copy()

    # Total filtered suggestions per engine; an engine without any is 0.
    totals = filtered_summary_data.groupby(by=['Search Enginee'])['Occurrence'].sum()
    first_row["Total Bing Suggestion"] = totals.get('Bing', 0)
    first_row["Total Google Suggestion"] = totals.get('Google', 0)
    least_order_rows.append(first_row[least_order_columns])

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. The
# explicit column lists keep the frames usable when no movie produced a row.
if first_occurrence_rows:
    first_occurrence = pd.concat(first_occurrence_rows, ignore_index=True)
else:
    first_occurrence = pd.DataFrame(columns=["Movie Name","Character Typed","Order Number"])
if least_order_rows:
    least_order = pd.concat(least_order_rows, ignore_index=True)
else:
    least_order = pd.DataFrame(columns=least_order_columns)

first_occurrence = first_occurrence[["Movie Name","Character Typed","Order Number"]]
Final_dataframe = first_occurrence.merge(least_order,on='Movie Name')
Final_dataframe["Total Suggestions"] = Final_dataframe["Total Bing Suggestion"] + Final_dataframe["Total Google Suggestion"]
# Bare expression: displays the table when run as a notebook cell.
Final_dataframe[["Movie Name","Character Typed_x","Order Number_x","Order Number_y","Character Typed_y",
                "Total Bing Suggestion","Total Google Suggestion","Total Suggestions"]]

Movie Name Character Typed_x Order Number_x Order Number_y Character Typed_y Total Bing Suggestion Total Google Suggestion Total Suggestions
0 Star Wars: The Force Awakens 5 10 2 18 12 35 47
1 Deadpool 4 6 3 5 19 4 23
2 Batman v Superman: Dawn of Justice 4 8 2 24 63 52 115
3 The Revenant 7 5 4 7 9 5 14
4 Ice Age: The Great Egg-Scapade 10 8 4 22 4 22 26
5 Hail Caesar! 6 6 6 6 6 0 6
6 Kung Fu Panda 3 4 5 2 14 35 1 36
7 Pandemic 4 3 3 4 1 0 1
8 The Hateful Eight 7 8 4 9 29 19 48
9 The 5th Wave 5 7 5 8 13 7 20

# Learn about API authentication here:
# Find your api_key here:
# Cufflinks binds plotly to pandas dataframes in IPython notebook. Read more

#sudo pip install cufflinks
#sudo pip install plotly

import plotly.plotly as py
import cufflinks as cf
import pandas as pd

# SECURITY: the API key used to be hard-coded here. It is now read from the
# PLOTLY_API_KEY environment variable; the key that was committed should be
# revoked and replaced.
py.sign_in('rameshb', os.environ['PLOTLY_API_KEY'])
cf.set_config_file(offline=False, world_readable=True, theme='pearl')
# Bubble chart of each movie's first row in its filtered file: characters
# typed (x) against position in the suggestion list (y), with the bubble
# sized by the total number of filtered suggestions.
# NOTE(review): the end of this call was missing from the source (the
# parenthesis was never closed); any further arguments are unknown.
Final_dataframe.iplot(kind='bubble', x='Character Typed_x', y='Order Number_x', size='Total Suggestions',
                      text='Movie Name', xTitle='Number of Characters at first occurrence',
                      yTitle='Position in suggestion list during first occurrence')


# Learn about API authentication here:
# Find your api_key here:
# Cufflinks binds plotly to pandas dataframes in IPython notebook. Read more

#sudo pip install cufflinks
#sudo pip install plotly

import plotly.plotly as py
import cufflinks as cf
import pandas as pd

# SECURITY: the API key used to be hard-coded here. It is now read from the
# PLOTLY_API_KEY environment variable; the key that was committed should be
# revoked and replaced.
py.sign_in('rameshb', os.environ['PLOTLY_API_KEY'])
cf.set_config_file(offline=False, world_readable=True, theme='pearl')
# Bubble chart of each movie's filtered row with the smallest order number:
# characters typed (x) against that position (y), with the bubble sized by
# the total number of filtered suggestions.
# NOTE(review): the end of this call was missing from the source (the
# parenthesis was never closed); any further arguments are unknown.
Final_dataframe.iplot(kind='bubble', x='Character Typed_y', y='Order Number_y', size='Total Suggestions',
                      text='Movie Name', xTitle='Number of Characters used during top position',
                      yTitle='Top position suggested')


# Copy of the merged table with short, attribute-friendly column names so
# that the columns can be read as Final_plot.<name>.
Final_plot = Final_dataframe.rename(
    columns={
        'Movie Name': 'movie_name',
        'Character Typed_x': 'char_x',
        'Order Number_x': 'order_x',
        'Character Typed_y': 'char_y',
        'Order Number_y': 'order_y',
    }
)

movie_name char_x order_x char_y order_y Total Bing Suggestion Total Google Suggestion Total Suggestions
0 Star Wars: The Force Awakens 5 10 18 2 12 35 47
1 Deadpool 4 6 5 3 19 4 23
2 Batman v Superman: Dawn of Justice 4 8 24 2 63 52 115
3 The Revenant 7 5 7 4 9 5 14
4 Ice Age: The Great Egg-Scapade 10 8 22 4 4 22 26
5 Hail Caesar! 6 6 6 6 6 0 6
6 Kung Fu Panda 3 4 5 14 2 35 1 36
7 Pandemic 4 3 4 3 1 0 1
8 The Hateful Eight 7 8 9 4 29 19 48
9 The 5th Wave 5 7 8 5 13 7 20

# Scatter plot with two series per movie: the "_x" pair and the "_y" pair of
# characters typed / position columns. The x axis is logarithmic.
# The source never closed the 'data' list, the 'layout' dict or the figure
# dict, so the cell did not parse; the brackets are restored here.
fig = {
    'data': [
        {'x': Final_plot.char_x, 'y': Final_plot.order_x, 'text': Final_plot.movie_name,
         'mode': 'markers', 'name': 'Lowest Character VS Position'},
        {'x': Final_plot.char_y, 'y': Final_plot.order_y, 'text': Final_plot.movie_name,
         'mode': 'markers', 'name': 'Highest Position VS Character'},
    ],
    'layout': {
        'xaxis': {'title': 'Number of Characters', 'type': 'log'},
        'yaxis': {'title': "Position"},
    },
}
py.iplot(fig, filename='cufflinks/multiple-scatter')
