- settings启用自定义媒体管道
# Register the custom pipeline by its dotted import path. The class is defined
# in the project's own pipelines module, not inside the scrapy package, so the
# path must start with the project package (replace `myproject` with yours).
ITEM_PIPELINES = {'myproject.pipelines.MyImagesPipeline': 1}
- settings设置IMAGES_STORE
# Root directory for downloaded images; point this at a real, valid directory.
IMAGES_STORE = "/path/to/valid/dir"
- pipelines定义MyImagesPipeline
import scrapy
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that names downloaded files after the item's title.

    Reads the URLs to download from the item's ``image_urls`` field and writes
    the resulting storage paths back to its ``image_paths`` field.
    """

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the image fetched by ``request``.

        The default file name is a hash of the URL; this override uses the
        item's ``title`` instead. The first image of an item is stored as
        ``<title>.jpg``; any further image gets its position appended
        (``<title>_1.jpg``, ``<title>_2.jpg``, ...) so that several images of
        one item do not overwrite each other.
        """
        if item is None:
            # No item to take a title from: keep the default naming.
            return super().file_path(
                request, response=response, info=info, item=item
            )
        title = ItemAdapter(item)["title"]
        # Position is attached by get_media_requests; requests created
        # elsewhere carry no position and are treated as the first image.
        position = request.meta.get("image_index", 0)
        suffix = f"_{position}" if position else ""
        return title + suffix + ".jpg"

    def get_media_requests(self, item, info):
        """Yield one download request per URL in the item's ``image_urls``.

        Each request records the URL's position in ``meta["image_index"]`` so
        that ``file_path`` can give every image a distinct name.
        """
        image_urls = ItemAdapter(item).get("image_urls", [])
        for position, image_url in enumerate(image_urls):
            yield scrapy.Request(image_url, meta={"image_index": position})

    def item_completed(self, results, item, info):
        """Store the paths of the successful downloads on the item.

        ``results`` is a list of ``(ok, value)`` pairs; for successful
        downloads ``value`` is a dict containing the stored ``path``.

        Raises:
            DropItem: if none of the item's images was downloaded.
        """
        image_paths = [value["path"] for ok, value in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        adapter = ItemAdapter(item)
        adapter["image_paths"] = image_paths
        return item
class MyImagesPipeline(ImagesPipeline):
    """Images pipeline for items with a single image, named after the title.

    Reads the URL from the item's ``image_url`` field, passes the item's
    ``title`` to the download request as the file name, and writes the stored
    path back to the item's ``image_path`` field.
    """

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the image fetched by ``request``.

        The default file name is a hash of the URL; this override uses the
        name carried in ``request.meta["filename"]`` instead. ``item`` is
        accepted because the media pipeline passes it as a keyword argument;
        it is not needed here since the name travels with the request.
        """
        filename = request.meta['filename'] + '.jpg'
        return filename

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image URL.

        The item's title is attached as ``meta["filename"]`` so that
        ``file_path`` can name the stored file after it.
        """
        adapter = ItemAdapter(item)
        yield scrapy.Request(
            url=adapter['image_url'],
            meta={"filename": adapter["title"]},
        )

    def item_completed(self, results, item, info):
        """Callback run once the item's download has finished.

        ``results`` is a list of ``(ok, value)`` pairs; for successful
        downloads ``value`` is a dict containing the stored ``path``. The
        first successful path is saved to the item's ``image_path`` field.

        Raises:
            DropItem: if the image could not be downloaded.
        """
        image_paths = [value['path'] for ok, value in results if ok]
        if not image_paths:
            raise DropItem("Item contains no image")
        ItemAdapter(item)['image_path'] = image_paths[0]
        return item