这是我看了好几年的小说网站:没有小广告,文章也齐全。在此备份一下爬虫代码,运行时需要配置代理(填入代码中的 proxies)。
- 安装依赖
pip install requests beautifulsoup4
# -*- coding: utf-8 -*-
import re
import sys
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote
# Proxy mapping handed to every requests call; empty by default.
proxies = {}

# Request headers shared by all requests made in this script.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/105.0.0.0 Safari/537.36"
    ),
}
def search_book(book):
    """Search the site for *book* and return the redirect target, if any.

    The search form is POSTed with redirects disabled, and the value of the
    response's ``Location`` header is returned to the caller.

    Args:
        book: Title to search for (a ``str``).

    Returns:
        The ``Location`` header value of the search response, or ``None``
        when the response carries no such header.
    """
    url = "https://www.69shu.com/modules/article/search.php"
    data = {"searchkey": book, "searchtype": "all"}
    # The body is assembled by hand so that values are percent-encoded from
    # their GBK bytes (presumably what the site expects -- confirm).
    form = "&".join(
        key + "=" + quote(value, encoding="gbk") for key, value in data.items()
    )
    # Work on a per-request copy: the shared module-level ``headers`` dict is
    # never touched, so a failing POST cannot leave a stray content-type
    # behind for later GET requests.
    post_headers = dict(headers)
    post_headers["content-type"] = "application/x-www-form-urlencoded"
    r = requests.post(
        url,
        data=form,
        headers=post_headers,
        proxies=proxies,
        allow_redirects=False,
    )
    return r.headers.get("location")
def get_node_list(book_link):
    """Fetch *book_link* and pull out the linked list items it contains.

    The response is decoded as GBK and scanned with a regular expression
    for ``<li ...><a href="...">...</a></li>`` entries.

    Args:
        book_link: URL of the page listing the book's nodes
            (presumably its chapters).

    Returns:
        A list of ``(href, link_text)`` tuples in page order.
    """
    response = requests.get(book_link, headers=headers, proxies=proxies)
    response.encoding = "gbk"
    item_pattern = '<li .*?><a href="(.*?)">(.*?)</a></li>'
    return re.findall(item_pattern, response.text)
def get_node_text(node_link):
    """Download one node page and return its body as plain text.

    The page is decoded as GBK. The markup lying between the end of the
    ``txtright`` div and the start of the ``bottom-ad`` div is extracted,
    stripped of HTML tags, and re-joined with a blank line between lines.

    Args:
        node_link: URL of the node page to download.

    Returns:
        The extracted text, with every ``"\\n"`` expanded to ``"\\n\\n"``.

    Raises:
        IndexError: If the page does not contain the expected markers.
    """
    response = requests.get(node_link, headers=headers, proxies=proxies)
    response.encoding = "gbk"
    body_pattern = '<div id="txtright">.*?</div>(.*?)<div class="bottom-ad">'
    matches = re.findall(body_pattern, response.text, re.S)
    # Only the first match is used; indexing fails when nothing matched.
    fragment = matches[0].replace(' ', '\t')
    # Tag stripping is delegated to BeautifulSoup instead of replacing
    # <br /> sequences by hand.
    plain = BeautifulSoup(fragment, "html.parser").text
    return "\n\n".join(plain.split("\n"))
def main(book):
    """Download every node of *book* into ``<book>.txt``.

    Looks the book up via ``search_book``, rewrites the returned link into
    the node-list URL, then fetches each node in order and appends its text
    to the output file while printing a progress line to stdout.

    Args:
        book: Title of the book; also used as the output file's base name.

    Raises:
        SystemExit: If the search returns no link for *book*.
    """
    book_link = search_book(book)
    if not book_link:
        print("查无此书,如确认书名无误,便是没有收录此书。")
        sys.exit()
    # Turn the link returned by the search into the node-list URL by
    # dropping "/txt" and swapping the ".htm" suffix for "/".
    book_link = book_link.replace("/txt", "").replace(".htm", "/")
    node_list = get_node_list(book_link)
    total = len(node_list)
    out_name = book + ".txt"
    # The context manager closes the file even when a download raises
    # part-way through the loop.
    with open(out_name, "w", encoding="utf-8") as f:
        for index, (node_link, node_name) in enumerate(node_list):
            # Scale before rounding: round(x, 2) * 100 can land just below
            # an integer (0.29 * 100 == 28.999...), which int() would then
            # truncate to the wrong percentage.
            percent = round(index * 100 / total)
            log_text = "下载进度: %d%s, %s" % (percent, "%", node_name)
            sys.stdout.write(log_text)
            sys.stdout.flush()
            node_text = get_node_text(node_link)
            # Ten newlines separate consecutive nodes in the output file.
            f.write(node_text + "\n" * 10)
            f.flush()
            # Backspace over the progress line so the next one replaces it.
            sys.stdout.write("\x08" * len(log_text))
    print("下载完成:", out_name)
if __name__ == "__main__":
    # Use the first command-line argument as the book title when one is
    # given; otherwise fall back to the built-in default title.
    main(sys.argv[1] if len(sys.argv) > 1 else "灵境行者")