安装
pip install newspaper3k
直接访问URL
from newspaper import Article
# Let newspaper download the page itself, then extract the article from it.
url = 'http://news.ifeng.com/c/8304wtslEL9'
news = Article(url=url, language='zh')  # language='zh': treat the page as Chinese
news.download()  # must run before parse(); retrieves the page for this URL
news.parse()     # fills in the extracted fields such as news.text
print(news.text)
指定HTML
import requests
from newspaper import Article
# Fetch the page ourselves so that newspaper only has to parse the HTML.
# timeout: requests waits indefinitely by default, so a stalled server would
# otherwise hang the script.
response = requests.get('http://news.ifeng.com/c/8304wtslEL9', timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page as the article.
response.raise_for_status()
# requests falls back to ISO-8859-1 when the Content-Type header carries no
# charset, which garbles Chinese text; in that case use the detected encoding.
if (response.encoding or '').lower() == 'iso-8859-1':
    response.encoding = response.apparent_encoding
body = response.text
news = Article(url='', language='zh')  # empty URL: the HTML is supplied below
news.set_html(body)
news.parse()
print(news.text)
保留HTML结构
import requests
from newspaper import Article
# Fetch the page ourselves, then ask newspaper to keep the article's HTML
# markup in addition to the plain text.
# timeout: requests waits indefinitely by default, so a stalled server would
# otherwise hang the script.
response = requests.get('http://news.ifeng.com/c/8304wtslEL9', timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page as the article.
response.raise_for_status()
# requests falls back to ISO-8859-1 when the Content-Type header carries no
# charset, which garbles Chinese text; in that case use the detected encoding.
if (response.encoding or '').lower() == 'iso-8859-1':
    response.encoding = response.apparent_encoding
body = response.text
# keep_article_html=True makes parse() populate news.article_html.
news = Article(url='', language='zh', keep_article_html=True)
news.set_html(body)
news.parse()
print(news.article_html)
解码中文乱码字符（article_html 中被转义成 HTML 实体的中文，如 &#20013;）
import html
import requests
from newspaper import Article
# Fetch the page ourselves, keep the article's HTML markup, and decode the
# HTML character references in it back into readable characters.
# timeout: requests waits indefinitely by default, so a stalled server would
# otherwise hang the script.
response = requests.get('http://news.ifeng.com/c/8304wtslEL9', timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page as the article.
response.raise_for_status()
# requests falls back to ISO-8859-1 when the Content-Type header carries no
# charset, which garbles Chinese text; in that case use the detected encoding.
if (response.encoding or '').lower() == 'iso-8859-1':
    response.encoding = response.apparent_encoding
body = response.text
# keep_article_html=True makes parse() populate news.article_html.
news = Article(url='', language='zh', keep_article_html=True)
news.set_html(body)
news.parse()
# html.unescape converts character references (e.g. &#20013;) back into the
# characters they stand for, so the Chinese text prints readably.
article = html.unescape(news.article_html)
print(article)