- 在 settings.py 中启用自定义媒体管道
# Enable the custom media pipeline. The key is the import path of the pipeline
# class inside your own project, not inside the scrapy package; replace
# "myproject" with the real package name. The value is the pipeline order.
ITEM_PIPELINES = {'myproject.pipelines.MyVideosPipeline': 1}
- 在 settings.py 中设置 FILES_STORE(下载文件的存储目录;若使用 MyImagesPipeline,还需设置 IMAGES_STORE)
# Storage directory setting for downloaded files. The value below is a
# placeholder; replace it with a real, writable directory.
FILES_STORE = '/path/to/valid/dir'
- 在 pipelines.py 中定义 MyVideosPipeline 和 MyImagesPipeline
import scrapy
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.images import ImagesPipeline
class MyVideosPipeline(FilesPipeline):
    """Files pipeline that stores downloaded videos under the item's title.

    Reads the ``title`` and ``video_urls`` fields of the item and writes the
    stored paths back into its ``video_paths`` field.
    """

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path ``<title>.mp4`` for ``request``.

        The default file name is a hash of the URL; this override uses the
        item's ``title`` instead. When an item carries several video URLs,
        every video after the first gets an ``_<index>`` suffix so the files
        do not overwrite each other.

        Falls back to the parent implementation when no item is supplied.
        """
        if item is None:
            # Without an item there is no title to build a name from.
            return super().file_path(request, response=response, info=info)

        # NOTE(review): the title is used verbatim as a relative path, so any
        # path separators in it end up in the stored path. Sanitize titles
        # upstream if they come from untrusted pages.
        title = ItemAdapter(item)['title']
        index = request.meta.get('video_index', 0)
        suffix = f'_{index}' if index else ''
        return title + suffix + '.mp4'

    def get_media_requests(self, item, info):
        """Yield one download request per URL in the item's ``video_urls``.

        Each request carries its position in ``meta['video_index']`` so that
        ``file_path`` can give every video of the same item a distinct name.
        """
        video_urls = ItemAdapter(item)['video_urls']
        for index, video_url in enumerate(video_urls):
            yield scrapy.Request(video_url, meta={'video_index': index})

    def item_completed(self, results, item, info):
        """Record the stored paths of the successful downloads on the item.

        ``results`` is iterated as ``(ok, data)`` pairs; for successful ones
        ``data['path']`` is collected into the item's ``video_paths`` field.

        Raises:
            DropItem: if none of the downloads succeeded.
        """
        video_paths = [x['path'] for ok, x in results if ok]
        if not video_paths:
            raise DropItem("Item contains no videos")
        adapter = ItemAdapter(item)
        adapter['video_paths'] = video_paths
        return item
class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that stores the downloaded image under the item's title.

    Reads the ``title`` and ``image_url`` fields of the item and writes the
    stored path back into its ``image_path`` field.
    """

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path ``<filename>.jpg`` for ``request``.

        The default file name is a hash of the URL; this override uses the
        name that ``get_media_requests`` placed in ``request.meta['filename']``.
        ``item`` is accepted as a keyword-only argument so the signature
        matches ``MyVideosPipeline.file_path``; it is not used here.
        """
        return request.meta['filename'] + '.jpg'

    def get_media_requests(self, item, info):
        """Yield the download request for the item's ``image_url``.

        The item's ``title`` travels in ``meta['filename']`` so that
        ``file_path`` can name the stored file after it.
        """
        adapter = ItemAdapter(item)
        yield scrapy.Request(
            url=adapter['image_url'],
            meta={"filename": adapter["title"]},
        )

    def item_completed(self, results, item, info):
        """Record the stored path of the downloaded image on the item.

        ``results`` is iterated as ``(ok, data)`` pairs; the ``path`` of the
        first successful one is written to the item's ``image_path`` field.

        Raises:
            DropItem: if no download succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no image")
        ItemAdapter(item)['image_path'] = image_paths[0]
        return item