scrapy pipeline下载视频

cooolr 于 2023-05-08 发布
  1. 在 settings.py 中启用自定义媒体管道
# Register the pipeline by the dotted path of the project module that defines
# it. "myproject" is a placeholder: replace it with your Scrapy project's
# package name (the class lives in your pipelines.py, not inside scrapy).
ITEM_PIPELINES = {'myproject.pipelines.MyVideosPipeline': 1}
  2. 在 settings.py 中设置 FILES_STORE
# Root directory under which a FilesPipeline subclass saves downloaded files.
# NOTE(review): an ImagesPipeline subclass reads IMAGES_STORE instead, so set
# that as well if the image pipeline is enabled.
FILES_STORE = '/path/to/valid/dir'
  3. 在 pipelines.py 中定义 MyVideosPipeline
import scrapy
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.images import ImagesPipeline

class MyVideosPipeline(FilesPipeline):
    """Download every URL in an item's ``video_urls`` and name it after the title.

    The first video of an item is stored as ``<title>.mp4``; further videos of
    the same item get an index suffix (``<title>_1.mp4``, ``<title>_2.mp4``,
    ...) so they do not overwrite each other. The stored paths are written to
    the item's ``video_paths`` field, and items without any successfully
    downloaded video are dropped.

    NOTE: two different items that share a title still map to the same file
    name; make titles unique upstream if that can happen.
    """

    # Path separators and characters commonly rejected by filesystems. Titles
    # come from scraped pages, so they must not be able to point outside the
    # storage directory or produce an unwritable name.
    _UNSAFE_CHARS = '\\/:*?"<>|'

    @classmethod
    def _safe_name(cls, title):
        """Return *title* as text with unsafe filename characters replaced by ``_``."""
        return "".join(
            "_" if ch in cls._UNSAFE_CHARS or ord(ch) < 32 else ch
            for ch in str(title)
        ).strip()

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path (relative to the store root) for *request*.

        The default name is a hash of the URL; this override uses the item's
        ``title`` instead. When no usable title is available (no item, missing
        or blank title) the default hashed path is used rather than failing.
        """
        title = ItemAdapter(item).get("title") if item is not None else None
        name = self._safe_name(title) if title is not None else ""
        if not name:
            return super().file_path(request, response=response, info=info, item=item)
        # ``video_index`` is set by get_media_requests; index 0 keeps the
        # plain ``<title>.mp4`` name so single-video items are unaffected.
        index = request.meta.get("video_index", 0)
        suffix = f"_{index}" if index else ""
        return f"{name}{suffix}.mp4"

    def get_media_requests(self, item, info):
        """Yield one download request per entry in the item's ``video_urls``.

        Each request carries its position in ``meta['video_index']`` so that
        file_path can give every video of the item a distinct name. An item
        without ``video_urls`` yields nothing and is later dropped by
        item_completed.
        """
        video_urls = ItemAdapter(item).get("video_urls") or ()
        for index, video_url in enumerate(video_urls):
            yield scrapy.Request(video_url, meta={"video_index": index})

    def item_completed(self, results, item, info):
        """Record the stored paths on the item once all its downloads finished.

        Args:
            results: iterable of ``(success, info_or_failure)`` pairs, one per
                request yielded by get_media_requests.
            item: the item being processed.
            info: pipeline bookkeeping object passed through by the caller.

        Returns:
            The item, with ``video_paths`` set to the list of stored paths.

        Raises:
            DropItem: if none of the downloads succeeded.
        """
        video_paths = [x['path'] for ok, x in results if ok]
        if not video_paths:
            raise DropItem("Item contains no videos")
        adapter = ItemAdapter(item)
        adapter['video_paths'] = video_paths
        return item
class MyImagesPipeline(ImagesPipeline):
    """Download an item's ``image_url`` and store it as ``<title>.jpg``.

    The stored path is written to the item's ``image_path`` field; items whose
    image could not be downloaded are dropped.
    """

    # Path separators and characters commonly rejected by filesystems. Titles
    # come from scraped pages, so they must not be able to point outside the
    # storage directory or produce an unwritable name.
    _UNSAFE_CHARS = '\\/:*?"<>|'

    @classmethod
    def _safe_name(cls, title):
        """Return *title* as text with unsafe filename characters replaced by ``_``."""
        return "".join(
            "_" if ch in cls._UNSAFE_CHARS or ord(ch) < 32 else ch
            for ch in str(title)
        ).strip()

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path (relative to the store root) for *request*.

        The default name is a hash of the URL; this override uses the name
        passed in ``request.meta['filename']`` instead. ``item`` is accepted as
        a keyword-only argument to match the signature used by the sibling
        ``MyVideosPipeline.file_path``. When the request carries no usable
        filename, the default hashed path is used rather than failing.
        """
        raw_name = request.meta.get('filename')
        name = self._safe_name(raw_name) if raw_name is not None else ""
        if not name:
            return super().file_path(request, response=response, info=info, item=item)
        return name + '.jpg'

    def get_media_requests(self, item, info):
        """Yield the download request for the item's ``image_url``.

        The item's ``title`` travels in ``meta['filename']`` so file_path can
        name the stored file after it. An item without ``image_url`` yields
        nothing and is later dropped by item_completed.
        """
        adapter = ItemAdapter(item)
        image_url = adapter.get('image_url')
        if image_url:
            yield scrapy.Request(url=image_url, meta={"filename": adapter.get("title")})

    def item_completed(self, results, item, info):
        """Record the stored path on the item once its download finished.

        Args:
            results: iterable of ``(success, info_or_failure)`` pairs, one per
                request yielded by get_media_requests.
            item: the item being processed.
            info: pipeline bookkeeping object passed through by the caller.

        Returns:
            The item, with ``image_path`` set to the first stored path.

        Raises:
            DropItem: if the download did not succeed.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no image")
        # ItemAdapter keeps field assignment working for any supported item
        # type, matching how MyVideosPipeline writes its result.
        ItemAdapter(item)['image_path'] = image_paths[0]
        return item