In [44]:
import re
import requests
import bs4
import pandas as pd

from bs4 import BeautifulSoup
from collections import namedtuple

In [12]:
class movie:
    def __init__(self, params):
        assert isinstance(params, dict)
        self.cover = params.get('cover')
        self.id = params.get('id')
        self.is_new = params.get('is_new')
        self.playable = params.get('playable')
        self.rate = params.get('rate')
        self.title = params.get('title')
        self.url = params.get('url')
    @property
    def show(self):
        print('title : %s' % (self.title))
        print('rate : %s' % (self.rate))
        print('id : %s' % (self.id))
    def check_url(self):
        url_pattern = re.compile(r'''\b((ftp|https?)://[-\w]+(\.\w[-\w]*)+|(?i:[a-z0-9](?:[-a-z0-9]*[a-z0-9])?\.)+(?-i:com\b|edu\b|biz\b|gov\b|in(?:t|fo)\b|mil\b|net\b|org\b|[a-z][a-z]\b))(:\d+)?(/[^.!,?;"'<>()\[\]{}\s\x7F-\xFF]*(?:[.!,?]+[^.!,?;"'<>()\[\]{}\s\x7F-\xFF]+)*)?''')
        return url_pattern.match(self.url)
    def get_detail(self):
        assert self.check_url(), 'Wrong url format : {}'.format(self.url)
        headers = {'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0'}
        response = requests.get(self.url, headers=headers)
        self.detail = response.text
    @property
    def show_detail(self):
        print(self.detail)
    def extract_movie(self):
        home_url = 'https://movie.douban.com'
        movie_pattern = re.compile('/subject/[0-9]+?/')
        movie_index = movie_pattern.findall(self.detail)
        self.movies_url = [home_url + x for x in movie_index]
        return self.movies_url

In [62]:
class movie_detail:
    def __init__(self, url):
        self.soup = self.get_movie_detail(url)
        assert isinstance(self.soup, bs4.BeautifulSoup)
        people_info = self.soup.find_all('div', attrs={'id':'info'})[0]

        # 标题
        try:
            self.title = self.soup.title.string.strip()
        except:
            self.title = 'none'

        # 导演
        try:
            director_temp = people_info.find_all('a', attrs={'rel':'v:directedBy'})[0]
            director_name = director_temp.string
            director_url = self.wrap_url(director_temp['href'])
            self.director = {director_name:director_url}
        except:
            self.director = {'none':'none'}

        # 编剧
        try:
            pattern_composer = re.compile('''<a href="(/celebrity/[0-9]+?/)">(.+?)</a>''')
            composer_temp = pattern_composer.findall(str(people_info))
            self.composer = {}
            for (x, y) in composer_temp:
                self.composer[y] = self.wrap_url(x)
        except:
            self.composer = {'none':'none'}

        # 主演
        try:
            actors_temp = people_info.find_all('a', attrs={'rel':'v:starring'})
            self.actors = {}
            for actor in actors_temp:
                name = actor.string
                url = actor['href']
                self.actors[name] = self.wrap_url(url)
        except:
            self.actors = {'none':'none'}

        # 电影类型
        try:
            movie_class_temp = people_info.find_all('span', attrs={'property':'v:genre'})
            self.movie_class = ','.join([x.string for x in movie_class_temp])
        except:
            self.movie_class = 'none'

        # 制片国家/地区
        try:
            pattern_place = re.compile('<span class="pl">制片国家/地区:</span>(.+?)<br/>')
            self.place = pattern_place.findall(str(people_info))[0].strip()
        except:
            self.place = 'none'

        # 语言
        try:
            pattern_language = re.compile('<span class="pl">语言:</span>(.+?)<br/>')
            self.language = pattern_language.findall(str(people_info))[0].strip()
        except:
            self.language = 'none'

        # 上映时间
        try:
            self.time = people_info.find_all('span', attrs={'property':'v:initialReleaseDate'})[0].string.strip()
        except:
            self.time = 'none'

        # 片长
        try:
            self.length = people_info.find_all('span', attrs={'property':'v:runtime'})[0].string.strip()
        except:
            self.length = 'none'

        # 别名
        try:
            pattern_alias = re.compile('<span class="pl">又名:</span>(.+?)<br/>')
            self.alias = pattern_alias.findall(str(people_info))[0].strip()
        except:
            self.alias = 'none'

        # imdb链接
        try:
            self.IMDB_url = people_info.find_all('a', attrs={'rel':'nofollow'})[0]['href']
        except:
            self.IMDB_url = 'none'
    
    def get_movie_detail(self, url):
        headers = {'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0'}
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        return soup
    
    def wrap_url(self, url):
        return 'https://movie.douban.com' + url

    @property
    def get_title(self):
        return self.title
    
    @property
    def get_director(self):
        return ','.join(self.director.keys())
    
    @property
    def get_composer(self):
        return ','.join(self.composer.keys())
    
    @property
    def get_actors(self):
        return ','.join(self.actors.keys())
    
    @property
    def get_movie_class(self):
        return self.movie_class
    
    @property
    def get_place(self):
        return self.place
    
    @property
    def get_language(self):
        return self.language
    
    @property
    def get_time(self):
        return self.time
    
    @property
    def get_length(self):
        return self.length
    
    @property
    def get_alias(self):
        return self.alias
    
    @property
    def get_IMDB(self):
        return self.IMDB_url
    
    @property
    def show(self):
        print('title : %s' % self.title)
        print('alias : %s' % self.alias)
        print('director : %s' % list(self.director.keys())[0])
        print('language : %s' % self.language)


url = 'https://movie.douban.com/subject/26270502/'
# a = movie_detail(movie_urls[0])
# print(a.title)
# print(a.director)
# print(a.composer)
# print(a.actors)
# print(a.movie_class)
# print(a.place)
# print(a.language)
# print(a.time)
# print(a.length)
# print(a.alias)
# print(a.IMDB_url)
# a.get_actors
# ', '.join(a.actors.keys())

In [63]:
movie_urls = ['https://movie.douban.com/subject/26270502/', 'https://movie.douban.com/subject/26828215/', 'https://movie.douban.com/subject/26709254/']

In [93]:
list_index = []
list_title = []
list_director = []
list_composer = []
list_actors = []
list_movie_class = []
list_place = []
list_language = []
list_time = []
list_length = []
list_alias = []
list_IMDBurl = []
index_pattern = re.compile('https://movie.douban.com/subject/([0-9]+?)/')


for step, url in enumerate(movie_urls):
    movie_index = index_pattern.findall(url)
    list_index.append(movie_index)
    movie_data = movie_detail(url)
    list_title.append(movie_data.get_title)
    list_director.append(movie_data.get_director)
    list_composer.append(movie_data.get_composer)
    list_actors.append(movie_data.get_actors)
    list_movie_class.append(movie_data.get_movie_class)
    list_place.append(movie_data.get_place)
    list_language.append(movie_data.get_language)
    list_time.append(movie_data.get_time)
    list_length.append(movie_data.get_length)
    list_alias.append(movie_data.get_alias)
    list_IMDBurl.append(movie_data.get_IMDB)
    if (step+1) % 10==0:
        print('Current step : %d, fetch movie: %s' % (step, movie_data.title))

In [94]:
result_table = pd.DataFrame({
        'id':list_index,
        'title':list_title,
        'director':list_director,
        'composer':list_composer, 
        'actors':list_actors,
        'movie_class':list_movie_class,
        'place':list_place,
        'language':list_language,
        'time':list_time,
        'length':list_length,
        'alias':list_alias,
        'IMDB':list_IMDBurl},columns = ['id', 'title', 'alias', 'director', 'composer',
                                       'actors', 'movie_class', 'place', 'language', 'time', 
                                       'length', 'IMDB'])

In [95]:
result_table


Out[95]:
id title alias director composer actors movie_class place language time length IMDB
0 [26270502] 绣春刀II:修罗战场 (豆瓣) 绣春刀2:修罗战场 / 绣春刀:修罗场 / 绣春刀前传 / Brotherhood of B... 路阳 禹扬,路阳,陈舒 杨轶,金士杰,袁文康,刘峰超,辛芷蕾,马赫,周一围,武强,张震,陈齐威,刘端端,张译,姜晓冲... 剧情,动作,武侠,古装 中国大陆 汉语普通话 2017-07-19(中国大陆) 120分钟 http://www.imdb.com/title/tt7055592
1 [26828215] 人在驴途 (豆瓣) Following the Donkey 杨琳 张亚光 张亚光,李若嘉,何苗,候凯文,邵峰,黄成麟 喜剧 中国大陆 汉语普通话 2016-07-09(中国大陆) 101分钟 none
2 [26709254] 斗地主传奇之双王之王 (豆瓣) 斗地主传奇 殷博 金梓壑,武凌,温超,周凯文,秦沛,曾江,梁家仁,黄一飞 剧情,喜剧 中国大陆 汉语普通话 2016-03(中国大陆) 85分钟 none

In [39]:
from pickle import dump,load
dump(a, open('temp.pickle', 'wb'))
b = load(open('temp.pickle', 'rb'))