scrapy pipeline下载图片

cooolr 于 2022-05-06 发布
  1. settings启用自定义媒体管道
# The key must be the dotted import path of the module that actually defines
# MyImagesPipeline (the project's own pipelines module, not Scrapy's package).
# Replace ``myproject`` with the real project package name.
ITEM_PIPELINES = {'myproject.pipelines.MyImagesPipeline': 1}
  2. settings设置IMAGES_STORE
# Storage location setting used by the images pipeline; point it at an
# existing, writable directory.
IMAGES_STORE = "/path/to/valid/dir"
  3. pipelines定义MyImagesPipeline
import scrapy
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that names downloaded files after the item's title.

    Reads the ``title`` and ``image_urls`` fields of each item and writes the
    resulting storage paths back to its ``image_paths`` field.
    """

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the image fetched by ``request``.

        By default the file name is a hash of the URL; this override uses the
        item's ``title`` instead. The first image of an item is stored as
        ``<title>.jpg``; further images of the same item get a ``_<index>``
        suffix so they do not overwrite one another.
        """
        if item is None:
            # ``item`` is optional in the signature; without it there is no
            # title to use, so keep the parent class's naming.
            return super().file_path(
                request, response=response, info=info, item=item
            )
        title = ItemAdapter(item)["title"]
        # ``image_index`` is set by get_media_requests(); requests built
        # elsewhere are treated as the first (unsuffixed) image.
        index = request.meta.get("image_index", 0)
        suffix = f"_{index}" if index else ""
        return title + suffix + ".jpg"

    def get_media_requests(self, item, info):
        """Yield one download request per URL in the item's ``image_urls``.

        Each request carries its position in the list as
        ``meta['image_index']`` so that file_path() can give every image of
        the item a distinct file name.
        """
        for index, image_url in enumerate(ItemAdapter(item)["image_urls"]):
            yield scrapy.Request(image_url, meta={"image_index": index})

    def item_completed(self, results, item, info):
        """Store the paths of the successful downloads on the item.

        ``results`` is iterated as ``(ok, info_dict)`` pairs; the ``path`` of
        every pair whose ``ok`` is truthy is collected.

        Raises:
            DropItem: if none of the item's downloads succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        adapter = ItemAdapter(item)
        adapter['image_paths'] = image_paths
        return item
class MyImagesPipeline(ImagesPipeline):
    """Images pipeline for items holding a single ``image_url``.

    The desired file name travels with the download request in
    ``meta['filename']`` and the resulting storage path is written back to
    the item's ``image_path`` field.

    NOTE: this class has the same name as the class defined directly above
    it in this file; if both are kept in one module, this definition replaces
    the earlier one.
    """

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the image fetched by ``request``.

        By default the file name is a hash of the URL; this override uses the
        name passed in ``request.meta['filename']`` instead. ``item`` is
        accepted as a keyword-only argument so the method matches the
        signature Scrapy documents for ``file_path``.
        """
        filename = request.meta.get('filename')
        if filename is None:
            # Request was not built by get_media_requests(), so no name was
            # supplied; keep the parent class's naming rather than failing
            # with a KeyError.
            return super().file_path(
                request, response=response, info=info, item=item
            )
        return filename + '.jpg'

    def get_media_requests(self, item, info):
        """Yield the download request for the item's ``image_url``.

        The item's ``title`` is attached as ``meta['filename']`` for
        file_path() to use as the file name.
        """
        adapter = ItemAdapter(item)
        yield scrapy.Request(
            url=adapter['image_url'],
            meta={"filename": adapter["title"]},
        )

    def item_completed(self, results, item, info):
        """Callback run once the item's download has finished.

        ``results`` is iterated as ``(ok, info_dict)`` pairs; the ``path`` of
        the first pair whose ``ok`` is truthy is stored in ``image_path``.

        Raises:
            DropItem: if no download succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no image")
        ItemAdapter(item)['image_path'] = image_paths[0]
        return item