小说爬虫

cooolr 于 2022-09-21 发布

看了好几年的小说网站,没有小广告,文章也齐全,备份一下爬虫代码,需要代理。

pip install requests beautifulsoup4
# -*- coding: utf-8 -*-

import re
import sys
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote

proxies = {}

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
}

def search_book(book):
    """Search the site for *book* and return the redirect URL of the match.

    The site's search endpoint expects a GBK-encoded form body and answers a
    matching title with a 302 redirect to the book's page.

    :param book: book title to search for.
    :return: the ``Location`` header of the redirect, or ``None`` when the
        site found nothing (no redirect issued).
    """
    url = "https://www.69shu.com/modules/article/search.php"
    data = {"searchkey": book, "searchtype": "all"}
    # Build the form body by hand so the values are percent-encoded as GBK;
    # letting requests encode the dict would produce UTF-8 and miss matches.
    form = "&".join(k + "=" + quote(v, encoding='gbk') for k, v in data.items())
    # Use a per-request copy instead of mutating the module-level `headers`
    # dict: the original add-then-pop left `content-type` behind forever if
    # requests.post raised, corrupting every later GET request.
    post_headers = {**headers, 'content-type': 'application/x-www-form-urlencoded'}
    r = requests.post(url, data=form, headers=post_headers, proxies=proxies,
                      allow_redirects=False)
    return r.headers.get("location")

def get_node_list(book_link):
    """Fetch the book's chapter index page and scrape the chapter list.

    :param book_link: URL of the chapter index page.
    :return: list of ``(chapter_url, chapter_title)`` tuples in page order.
    """
    response = requests.get(book_link, headers=headers, proxies=proxies)
    # The site serves GBK; force it so r.text decodes the titles correctly.
    response.encoding = "gbk"
    return re.findall('<li .*?><a href="(.*?)">(.*?)</a></li>', response.text)

def get_node_text(node_link):
    """Download a single chapter page and return its plain-text content.

    :param node_link: URL of one chapter page.
    :return: chapter body as plain text, paragraphs separated by blank lines.
    """
    page = requests.get(node_link, headers=headers, proxies=proxies)
    # Pages are served as GBK; set it explicitly before reading .text.
    page.encoding = "gbk"
    # The chapter body sits between the right-hand toolbar div and the
    # bottom-ad div; grab the first (only) such span.
    body = re.findall('<div id="txtright">.*?</div>(.*?)<div class="bottom-ad">',
                      page.text, re.S)[0]
    # Turn the HTML paragraph-indent entity into a tab before stripping tags.
    body = body.replace('&emsp;', '\t')
    parsed = BeautifulSoup(body, "html.parser")
    # Double every newline so paragraphs are visually separated in the .txt.
    return "\n\n".join(parsed.text.split("\n"))

def main(book):
    """Search for *book*, download every chapter, and save them to ``<book>.txt``.

    Exits the process with a message when the title is not found. Progress is
    written in place on stdout (backspaced away between chapters).

    :param book: book title; also used as the output file name stem.
    """
    book_link = search_book(book)
    if not book_link:
        print("查无此书,如确认书名无误,便是没有收录此书。")
        sys.exit()
    # The search redirect points at /txt/NNN.htm; the chapter index lives at /NNN/.
    book_link = book_link.replace("/txt", "").replace(".htm", "/")
    node_list = get_node_list(book_link)
    total = len(node_list)
    # `with` guarantees the file is closed even when a chapter download raises;
    # the original left the handle open on any mid-loop exception.
    with open(book + ".txt", "w", encoding="utf-8") as f:
        for index, (node_link, node_name) in enumerate(node_list):
            # Integer math avoids the float-rounding drift of
            # int(round(index/total, 2)*100) and a ZeroDivisionError when the
            # scraped chapter list is empty.
            percent = index * 100 // total if total else 100
            log_text = "下载进度: %d%%, %s" % (percent, node_name)
            sys.stdout.write(log_text)
            sys.stdout.flush()
            node_text = get_node_text(node_link)
            # Pad chapters apart with blank lines; flush so a crash or Ctrl-C
            # loses at most the current chapter.
            f.write(node_text + "\n" * 10)
            f.flush()
            # Backspace over the progress line so the next one overwrites it.
            sys.stdout.write("\x08" * len(log_text))
    print("下载完成:", book + ".txt")

# Script entry point: downloads the hard-coded title "灵境行者" when run directly.
if __name__ == "__main__":
    main("灵境行者")