import requests
from bs4 import BeautifulSoup
import re, json
from tqdm import tqdm
import traceback

class Crawler():
    """Scrape Douban movie list pages and the detail page of each movie.

    ``get_movie_list`` extracts ``(name, url)`` pairs from a list page and
    ``get_movie_detail`` downloads one detail page, parses it and appends the
    result as a single JSON line to ``output_path``.
    """

    def __init__(self, timeout=10, output_path='douban_top250_movies.json'):
        """Create a crawler.

        Args:
            timeout: Seconds passed to ``requests.get`` as the timeout, so a
                stalled connection cannot block the crawl forever.
            output_path: File that ``get_movie_detail`` appends JSON lines to.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.68'
        }
        self.timeout = timeout
        self.output_path = output_path

    def get_movie_list(self, url):
        """Return the ``(name, url)`` pairs found on the list page at *url*.

        Anchors that lack a ``<span>`` title or an ``href`` are skipped.
        """
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        movie_list = []
        for anchor in soup.select('#content .grid_view .item .hd a'):
            title = anchor.select_one('span')
            href = anchor.get('href')
            if title is None or not href:
                continue
            movie_list.append((title.text, href))
        return movie_list

    def _parse_id(self, url):
        """Return the path segment that follows ``subject/`` in *url*.

        Raises:
            ValueError: If *url* contains no ``subject/<id>/`` part.
        """
        obj = re.search(r'subject/(.*?)/', url)
        if obj is None:
            raise ValueError('cannot find a subject id in url: %r' % (url,))
        return obj.group(1)

    def _parse_summary(self, info):
        """Return the summary text with every run of 2+ whitespace removed.

        *info* is the parsed page (it must support ``select``). When several
        spans match, the second-to-last one is used; with a single match that
        one is used; with no match an empty string is returned.
        """
        spans = info.select('#link-report-intra span')
        if not spans:
            return ''
        summary = spans[-2] if len(spans) >= 2 else spans[0]
        return re.sub(r'\s{2,}', '', summary.text)

    def _field_html(self, info, label):
        """Return the raw HTML between the ``label:`` span and the next ``<br/>``, or None."""
        obj = re.search('<span class="pl">%s:</span>(.*?)<br/>' % re.escape(label), info)
        return obj.group(1) if obj else None

    def _parse_people(self, info, label):
        """Return ``(href, text)`` for each link in the ``attrs`` span after *label*."""
        obj = re.search(
            '<span class="pl">%s</span>: <span class="attrs">(.*?)</span>' % re.escape(label),
            info,
        )
        if not obj:
            return []
        fragment = BeautifulSoup(obj.group(1), 'html.parser')
        return [(a.attrs['href'], a.text) for a in fragment.select('a')]

    def _parse_span_texts(self, info, label):
        """Return the text of every ``<span>`` inside the *label* field."""
        html = self._field_html(info, label)
        if html is None:
            return []
        fragment = BeautifulSoup(html, 'html.parser')
        return [span.text for span in fragment.select('span')]

    def _parse_slash_list(self, info, label):
        """Return the ``/``-separated, stripped, non-empty items of the *label* field."""
        html = self._field_html(info, label)
        if html is None:
            return []
        # Drop blank pieces so an empty field yields [] just like a missing one.
        return [part.strip() for part in html.split('/') if part.strip()]

    def _parse_directors(self, info):
        """Return ``(href, name)`` tuples from the 导演 (directors) field."""
        return self._parse_people(info, '导演')

    def _parse_writers(self, info):
        """Return ``(href, name)`` tuples from the 编剧 (writers) field."""
        return self._parse_people(info, '编剧')

    def _parse_actors(self, info):
        """Return ``(href, name)`` tuples from the 主演 (actors) field."""
        return self._parse_people(info, '主演')

    def _parse_genres(self, info):
        """Return the span texts of the 类型 (genres) field."""
        return self._parse_span_texts(info, '类型')

    def _parse_countries(self, info):
        """Return the items of the 制片国家/地区 (countries/regions) field."""
        return self._parse_slash_list(info, '制片国家/地区')

    def _parse_languages(self, info):
        """Return the items of the 语言 (languages) field."""
        return self._parse_slash_list(info, '语言')

    def _parse_pubdates(self, info):
        """Return the span texts of the 上映日期 (release dates) field."""
        return self._parse_span_texts(info, '上映日期')

    def _parse_durations(self, info):
        """Return the span texts of the 片长 (durations) field."""
        return self._parse_span_texts(info, '片长')

    def _parse_other_names(self, info):
        """Return the items of the 又名 (other names) field."""
        return self._parse_slash_list(info, '又名')

    def _parse_imdb(self, info):
        """Return the stripped content of the IMDb field, or '' when absent."""
        html = self._field_html(info, 'IMDb')
        return html.strip() if html is not None else ''

    def get_movie_detail(self, detail):
        """Download, parse and persist one movie.

        Args:
            detail: A ``(name, url)`` pair as produced by ``get_movie_list``.

        The parsed movie is appended as one JSON line to ``self.output_path``.
        Any ordinary exception is reported (the failing *detail* plus the
        traceback) and swallowed so the surrounding crawl continues;
        ``KeyboardInterrupt`` and ``SystemExit`` are allowed to propagate.
        """
        try:
            name, url = detail
            # Collected data
            movie = {}
            movie['name'] = name
            movie['url'] = url
            movie['id'] = self._parse_id(url)
            # Fetch the page source
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            # Fail with a clear HTTP error instead of an IndexError further down.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Read values straight from nodes
            movie['img'] = soup.select('#mainpic img')[0].attrs['src']
            movie['rating_num'] = soup.select('#interest_sectl .rating_num')[0].text
            movie['summary'] = self._parse_summary(soup)
            # The regex-based parsers below work on the string form of the
            # '#info' selection result.
            info = str(soup.select('#info'))
            movie['directors'] = self._parse_directors(info)
            movie['writers'] = self._parse_writers(info)
            movie['actors'] = self._parse_actors(info)
            movie['genres'] = self._parse_genres(info)
            movie['countries'] = self._parse_countries(info)
            movie['languages'] = self._parse_languages(info)
            movie['pubdates'] = self._parse_pubdates(info)
            movie['durations'] = self._parse_durations(info)
            movie['other_names'] = self._parse_other_names(info)
            movie['imdb'] = self._parse_imdb(info)

            # Append to the output file; explicit UTF-8 because the JSON is
            # written with ensure_ascii=False and contains non-ASCII text.
            with open(self.output_path, 'a', encoding='utf-8') as file:
                file.write(json.dumps(movie, ensure_ascii=False) + '\n')
        except Exception:
            # Best-effort crawl: report which entry failed and keep going.
            print(detail)
            traceback.print_exc()


if __name__ == '__main__':
    # Walk the ten list pages (25 entries per page) and scrape every movie
    # found on each of them, showing one progress bar per page.
    spider = Crawler()
    for i, offset in enumerate(range(0, 250, 25)):
        page_url = 'https://movie.douban.com/top250?start=%s&filter=' % offset
        entries = spider.get_movie_list(page_url)
        progress = tqdm(entries, desc=f'第{i}页')
        for entry in progress:
            spider.get_movie_detail(entry)
            