In [4]:
import re
In [3]:
# Sample titles, one per shape the title formatter has to handle.
s1 = "The Lazarus Project (2008)"                  # article + title + year
s2 = "The Cry (Il Grido) (1957)"                   # article + title + subtitle + year
s3 = "A Hell of a Day (Reines d'un jour) (2001)"   # article "A" + title + subtitle + year
In [79]:
def format(original_title_with_leading_article):
    """Move a leading English article behind the main title and quote the result.

    Recognised input shapes (the year is always four digits in parentheses):

        "The Lazarus Project (2008)"  ->  '"Lazarus Project, The (2008)"'
        "The Cry (Il Grido) (1957)"   ->  '"Cry, The (Il Grido) (1957)"'

    Args:
        original_title_with_leading_article: title string, optionally followed
            by one parenthesised subtitle, then the parenthesised year.

    Returns:
        The re-ordered title wrapped in double quotes when the string has one
        of the shapes above AND its main title starts with "The", "A" or "An"
        as a whole word. Otherwise the input is returned unchanged (unquoted).

    Note:
        The name shadows the builtin ``format``; it is kept so existing calls
        keep working.
    """
    text = original_title_with_leading_article
    articles = ("The", "A", "An")

    # Main title, optional "(subtitle)", then "(YYYY)".
    # e.g. "The Lazarus Project (2008)" or
    #      "A Hell of a Day (Reines d'un jour) (2001)"
    pat_title_parts = r'^([^\(]+) (?:(\([^\(]+\)) )?(\(\d{4}\))$'
    # The article must be a whole word followed by a space. A bare prefix test
    # would treat "Avatar" or "Theater" as starting with an article.
    pat_leading_article = r'^(' + "|".join(articles) + r') (.+)$'

    parts = re.match(pat_title_parts, text)
    if not parts:
        # Not a recognised "<title> [(subtitle)] (year)" shape.
        return text
    main_title, subtitle, year = parts.groups()

    article_match = re.match(pat_leading_article, main_title)
    if not article_match:
        # No leading article (or nothing after it): leave the title alone.
        return text
    article, title_no_article = article_match.groups()

    pieces = [title_no_article + ", " + article]
    if subtitle is not None:
        pieces.append(subtitle)
    pieces.append(year)
    return '"' + " ".join(pieces) + '"'
In [81]:
# Subtitle case (s2 has a parenthesised subtitle before the year);
# expected result: '"Cry, The (Il Grido) (1957)"'
format(s2)
Out[81]:
In [ ]:
In [ ]: