In [13]:
import wikipedia
import re
import numpy as np
import pandas as pd

In [14]:
# Load the movie list from disk, decoding the CSV as ISO-8859-1.
df = pd.read_csv(
    'movie.csv',
    encoding="ISO-8859-1",
)

In [15]:
# Preview the first five rows of the movie list.
df.head(n=5)


Out[15]:
title year
0 Dead Awake 2016
1 Krish Trish and Baltiboy - Battle of Wits 2013
2 Krish Trish and Baltiboy - Best Friends Forever 2016
3 Krish Trish and Baltiboy - Comics of India 2012
4 Krish Trish and Baltiboy - Oversmartness Never... 2017

In [ ]:
# Look up every movie title on Wikipedia and collect the text of each
# matching film article. Three parallel lists are filled, one entry per
# matched article:
#   original_titles - the title as it appears in `df`
#   titles          - the matched Wikipedia article title
#   content_list    - the article text ('' when the page could not be loaded)
# `data` is created empty here; a later cell attaches the lists as columns.
data = pd.DataFrame()
titles = []
original_titles = []
content_list = []
# .loc slicing is label-inclusive: this walks index labels 0 through 1000.
for title in df.loc[0:1000, 'title']:
    # A missing title cannot be escaped or searched for; skip it.
    if pd.isna(title):
        continue

    # Accept a search hit that starts with "<title> (<optional year> film)"
    # or that is exactly "<title>".
    escaped_title = re.escape(title)
    regex = re.compile(escaped_title + r' \(\d*\s*film\)|' + escaped_title + r'$')

    search_list = wikipedia.search(title + ' film')

    # Keep the matched portion of every hit. A plain comprehension copes with
    # an empty result list, whereas np.vectorize raises on size-0 input.
    match_list = [
        found.group(0)
        for found in map(regex.match, search_list)
        if found is not None
    ]

    titles.extend(match_list)
    for name in match_list:
        try:
            page = wikipedia.page(name)
        except Exception:
            # Best effort: record an empty article so the three lists stay
            # aligned. `Exception` (not a bare except) keeps Ctrl-C working.
            content_list.append('')
            continue
        content_list.append(page.content)
    original_titles.extend([title] * len(match_list))

In [17]:
# Assemble the scraped results into one frame, one row per matched article.
# Building the frame in a single call (instead of adding columns to a
# previously created empty frame) makes this cell safe to re-run.
data = pd.DataFrame(
    {
        'title': original_titles,
        'wiki_title': titles,
        'content': content_list,
    },
    columns=['title', 'wiki_title', 'content'],
)

In [19]:
# Preview the first five scraped rows.
data.head(n=5)


Out[19]:
title wiki_title content
0 Dead Awake Dead Awake (2016 film) Dead Awake is a 2016 American supernatural psy...
1 Dead Awake Dead Awake (2010 film) Dead Awake is a 2010 mystery film starring Nic...
2 Dead Awake Dead Awake (2001 film) Dead Awake is a 2001 mystery film starring Ste...
3 Dead Awake Dead Awake
4 Mighty Raju Rio Calling Mighty Raju Rio Calling Mighty Raju: Rio Calling is an Indian animated...

In [20]:
#data.to_csv('reviews_wiki_0_1000_v2.csv')

In [ ]: