In [13]:
import wikipedia
import re
import numpy as np
import pandas as pd
In [14]:
# read in list of movies
df = pd.read_csv('movie.csv', encoding = "ISO-8859-1")
In [15]:
df.head()
Out[15]:
In [ ]:
data = pd.DataFrame()
titles = []
original_titles = []
content_list = []
#for title in title_list:
# first ~1000 titles (.loc slicing includes the end label, so rows 0-1000)
for title in df.loc[0:1000, 'title']:
    # accept either "<title> (<year> film)" or the bare title as a search hit
    regex = re.compile(re.escape(title) + r' \(\d*\s*film\)|' + re.escape(title) + r'$')
    regex_match = np.vectorize(regex.match)
    try:
        search_list = wikipedia.search(title + ' film')
        movie_list = regex_match(search_list)
    except (IndexError, ValueError):
        # empty or unusable search results make np.vectorize fail; skip this title
        continue
    match_list = []
    for i in range(movie_list.shape[0]):
        if movie_list[i] is not None:
            match_list.append(movie_list[i].group(0))
    titles.extend(match_list)
    for name in match_list:
        try:
            page = wikipedia.page(name)
        except Exception:
            # disambiguation or missing page: keep the row but leave content empty
            content_list.append('')
            continue
        content = page.content
        content_list.append(content)
    original_titles.extend(np.repeat(title, len(match_list)))
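To illustrate what the regex above keeps, here is a minimal sketch using made-up search results (the candidate titles are hypothetical, not taken from movie.csv): only the bare title or a "<title> (<year> film)" variant survives the filter.
In [ ]:
# Illustrative only: hypothetical search results showing which ones the regex accepts.
title = 'Heat'
regex = re.compile(re.escape(title) + r' \(\d*\s*film\)|' + re.escape(title) + r'$')
candidates = ['Heat', 'Heat (1995 film)', 'Heat wave', 'Body Heat']
print([c for c in candidates if regex.match(c)])  # -> ['Heat', 'Heat (1995 film)']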
In [17]:
data['title'] = original_titles
data['wiki_title'] = titles
data['content'] = content_list
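The column assignments above only work if the three lists came out the same length; the loop guarantees this because every matched name contributes exactly one (possibly empty) content entry. A quick sanity check, purely illustrative, that could run before building the frame:
In [ ]:
# Sanity check (illustrative): the three lists must be equally long,
# otherwise the column assignments raise a ValueError.
assert len(original_titles) == len(titles) == len(content_list)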
In [19]:
data.head()
Out[19]:
In [20]:
#data.to_csv('reviews_wiki_0_1000_v2.csv')
In [ ]: