In [8]:
import csv
import pandas as pd
from webcrawl import WebCrawl
Reading the filtered dataset (movies without Box Office data)
In [9]:
# Load the filtered movie dataset (entries missing box-office figures).
movie = pd.read_csv('datasetWithoutBoxOffice.csv')
# Preview the first few rows to confirm the expected columns.
movie.head(n=3)
Out[9]:
Parallel lists to hold the extracted IMDB IDs and their Box Office values
In [11]:
listWithBoxOffice = [[],[]]
Function to extract Box Office by Web Crawling
In [12]:
def extract_boxOffice(tmdbid, imdbid):
    """Crawl the box-office figure for one movie and record it.

    Tries the IMDB page first (via the IMDB ID); if that yields 'N/A',
    falls back to the TMDB page (via the TMDB ID).  Appends the result to
    the module-level ``listWithBoxOffice`` parallel lists: index 0 gets
    the IMDB ID, index 1 gets the box-office value, or NaN when neither
    source had a figure.
    """
    # Extract box office by crawling the IMDB page using the IMDB ID.
    boxOffice = WebCrawl().extractBoxOfficeByIMDB(imdbid)
    if boxOffice == 'N/A':
        # If 'N/A', fall back to crawling the TMDB page using the TMDB ID.
        boxOffice = WebCrawl().extractBoxOfficeByTMDB(str(tmdbid))
    # The IMDB ID is recorded in both outcomes, so hoist it out of the
    # branches (the original duplicated this append in both arms).
    listWithBoxOffice[0].append(imdbid)
    if boxOffice != 'N/A':
        listWithBoxOffice[1].append(boxOffice)
    else:
        # NaN keeps the box-office column a uniform (float) dtype.
        listWithBoxOffice[1].append(float('nan'))
Optional (but suggested): limit the crawl to the first 1000 movies
In [18]:
# Crawl box-office data for the first 1000 rows of the dataset.
# Each row's first two columns are the TMDB ID and the IMDB ID.
for row in movie.values[:1000]:
    extract_boxOffice(row[0], row[1])
Creating a csv file with 1000 entries
In [ ]:
# Write the collected IDs and box-office values out as a CSV file.
# newline='' is required by the csv module: without it, csv.writer emits
# an extra blank row after every record on Windows.
with open('movie_with_boxoffice_1000.csv', 'w', newline='') as csvfile:
    fieldnames = ['IMDB ID', 'BOX OFFICE']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    # The two parallel lists share an index (appended in lockstep by
    # extract_boxOffice), so iterate them together with zip.
    for imdb_id, box_office in zip(listWithBoxOffice[0], listWithBoxOffice[1]):
        writer.writerow({'IMDB ID': imdb_id, 'BOX OFFICE': box_office})
In [ ]: