In [1]:
import csv
import pandas as pd
from webcrawl import WebCrawl
Reading filtered dataset
In [2]:
movie = pd.read_csv('datasetWithoutBoxOffice.csv')
movie.head(3)
Out[2]:
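The head() preview did not survive the export; a quick check (only a sketch, the exact column names depend on the filtered dataset) confirms the layout the crawl loop below relies on, with the TMDB ID in the first column and the IMDB ID in the second.

print(movie.shape)
print(movie.columns.tolist())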
Lists to hold each movie's IMDB ID and its Box Office value
In [3]:
# listWithBoxOffice[0] holds IMDB IDs, listWithBoxOffice[1] the matching Box Office values
listWithBoxOffice = [[], []]
Function to extract Box Office by Web Crawling
In [4]:
def extract_boxOffice(tmdbid, imdbid):
    # Extract Box Office by crawling the IMDB page using the IMDB ID
    boxOffice = WebCrawl().extractBoxOfficeByIMDB(imdbid)
    if boxOffice == 'N/A':
        # If IMDB reports 'N/A', fall back to crawling the TMDB page using the TMDB ID
        boxOffice = WebCrawl().extractBoxOfficeByTMDB(str(tmdbid))
    if boxOffice != 'N/A':
        listWithBoxOffice[0].append(imdbid)
        listWithBoxOffice[1].append(boxOffice)
    else:
        listWithBoxOffice[0].append(imdbid)
        # Append NaN for the Box Office so the column keeps a uniform data type (float)
        listWithBoxOffice[1].append(float('nan'))
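To exercise extract_boxOffice without network access, a hypothetical stand-in for the WebCrawl class (same class and method names as the import above, but always returning 'N/A') is enough to trace the NaN fallback path. This stub is only a sketch, not the real webcrawl module.

# Hypothetical offline stand-in for webcrawl.WebCrawl, for testing the fallback logic only
class _WebCrawlStub:
    def extractBoxOfficeByIMDB(self, imdbid):
        return 'N/A'   # pretend the IMDB page had no usable box office
    def extractBoxOfficeByTMDB(self, tmdbid):
        return 'N/A'   # pretend the TMDB page had none either

# WebCrawl = _WebCrawlStub                # uncomment to run without the real crawler
# extract_boxOffice(603, 'tt0133093')     # example IDs; with the stub this appends the ID and NaN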
Optional (but suggested): adjust the slice in the loop below to crawl the dataset in smaller chunks, since crawling a page for every movie takes a long time.
In [6]:
for movieID in movie.values[:]:
    # parameters: TMDB ID, IMDB ID
    extract_boxOffice(movieID[0], movieID[1])
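Crawling one page per movie is slow and can fail on individual rows, so, as the optional note above suggests, it can help to work through the dataset in slices and keep going when a single lookup raises. The batch size and the try/except below are one possible way to do that, not part of the original notebook.

# Sketch: process the rows in batches and skip rows whose crawl fails
batch_size = 500                                           # arbitrary; tune as needed
for start in range(0, len(movie), batch_size):
    for movieID in movie.values[start:start + batch_size]:
        try:
            extract_boxOffice(movieID[0], movieID[1])      # TMDB ID, IMDB ID
        except Exception as err:
            print('skipping', movieID[1], err)             # crawl failed for this movie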
Creating a CSV file for all entries
In [ ]:
with open('boxoffice.csv', 'w', newline='') as csvfile:  # newline='' avoids blank rows on Windows
    fieldnames = ['IMDB ID', 'BOX OFFICE']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for i in range(len(listWithBoxOffice[0])):
        writer.writerow({'IMDB ID': listWithBoxOffice[0][i], 'BOX OFFICE': listWithBoxOffice[1][i]})
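Since pandas is already imported, the same file can be written in a single call; this is just an equivalent alternative to the csv.DictWriter block above.

# Equivalent write using pandas instead of csv.DictWriter
pd.DataFrame({'IMDB ID': listWithBoxOffice[0],
              'BOX OFFICE': listWithBoxOffice[1]}).to_csv('boxoffice.csv', index=False)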
Reading the CSVs into dataframes
In [ ]:
boxoffice = pd.read_csv('boxoffice.csv')
datasetWithoutBoxOffice = pd.read_csv('datasetWithoutBoxOffice.csv')
Merge using IMDB ID as key
In [ ]:
result = datasetWithoutBoxOffice.merge(boxoffice, on='IMDB ID', how='inner')
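An inner join drops any movie whose IMDB ID never made it into boxoffice.csv, so a quick sanity check on the row counts (and on how many box-office values are still NaN) is worth adding; switch to how='left' if every movie should be kept. This check is an addition, not part of the original notebook.

# Sanity check on the merge result
print(len(datasetWithoutBoxOffice), len(boxoffice), len(result))       # rows before/after the join
print(result['BOX OFFICE'].isna().sum(), 'movies still have no box office value')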
Converting the DataFrame to CSV
In [ ]:
result.to_csv('datasetWithBoxoffice.csv', index=False)