我们把它升级成 专业批量下载版,支持:
- ✅ 多个 URL 一次粘贴(每行一个)
- ✅ 自动识别文章标题作为文件名
- ✅ 自动清理非法字符
- ✅ 批量打包成 ZIP 下载
- ✅ 稳定 Playwright 渲染
- ✅ 内存生成,无临时文件
一、安装依赖
pip install flask playwright beautifulsoup4 reportlab
playwright install
二、完整升级版 Flask(批量 + 自动文件名 + ZIP)
保存为:
app.py
✅ 完整代码
import asyncio
import io
import re
import zipfile
from xml.sax.saxutils import escape

from flask import Flask, render_template_string, request, send_file
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.pagesizes import letter
from reportlab.lib.units import inch
app = Flask(__name__)
# =========================
# 抓取文章
# =========================
# Boilerplate/ad markers; a paragraph containing any of these (case-insensitive)
# is dropped. Module-level so the tuple is not rebuilt per paragraph.
AD_KEYWORDS = (
    "advertisement",
    "sponsored",
    "read more",
    "subscribe",
    "sign up",
    "related",
    "continue reading",
)


async def fetch_article(url):
    """Render *url* in headless Chromium and extract the article title and body.

    Parameters:
        url: Article page URL (expected to contain an ``<article>`` element).

    Returns:
        ``(title, content)`` — ``title`` falls back to ``"No Title Found"``
        when no ``<h1>`` exists; ``content`` is the ad-filtered paragraphs
        joined by blank lines (empty string when no ``<article>`` is found).

    Raises:
        playwright TimeoutError if the page or its ``article`` selector does
        not appear within 60 s.
    """
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url, timeout=60000)
        # Wait until client-side JS has rendered the article body.
        await page.wait_for_selector("article")
        html = await page.content()
        await browser.close()

    soup = BeautifulSoup(html, "html.parser")

    title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else "No Title Found"

    article = soup.find("article")
    paragraphs = []
    if article:
        # Loop variable renamed from `p`, which shadowed the playwright handle.
        for para in article.find_all("p"):
            text = para.get_text(strip=True)
            if not text:
                continue
            lowered = text.lower()
            if any(keyword in lowered for keyword in AD_KEYWORDS):
                continue
            paragraphs.append(text)

    content = clean_text("\n\n".join(paragraphs))
    return title, content
# Pre-compiled once: matches a blank (possibly whitespace-only) line run.
_BLANK_RUNS = re.compile(r'\n\s*\n')


def clean_text(text):
    """Collapse runs of blank lines into one blank line and trim both ends."""
    return _BLANK_RUNS.sub('\n\n', text).strip()
# =========================
# 生成 PDF
# =========================
def generate_pdf(title, content):
    """Render *title* plus blank-line-separated paragraphs into an in-memory PDF.

    Parameters:
        title: Article headline, placed in Heading1 style.
        content: Body text; paragraphs are separated by ``"\\n\\n"``.

    Returns:
        io.BytesIO positioned at offset 0, containing the finished PDF.
    """
    buffer = io.BytesIO()
    doc = SimpleDocTemplate(buffer, pagesize=letter)
    styles = getSampleStyleSheet()
    body_style = styles["BodyText"]

    # reportlab's Paragraph parses its text as mini-XML markup, so raw
    # '&', '<', '>' in scraped article text would raise a parse error —
    # escape everything before handing it over.
    elements = [
        Paragraph(escape(title), styles["Heading1"]),
        Spacer(1, 0.5 * inch),
    ]
    for paragraph in content.split("\n\n"):
        elements.append(Paragraph(escape(paragraph), body_style))
        elements.append(Spacer(1, 0.2 * inch))

    doc.build(elements)
    buffer.seek(0)
    return buffer
# =========================
# HTML 页面
# =========================
# Single-page UI: a textarea taking one article URL per line, POSTed back
# to "/" where the batch download is produced. Rendered via
# render_template_string, so this stays an inline constant.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
<title>MSN 批量下载器</title>
</head>
<body style="font-family: Arial; text-align: center; margin-top: 50px;">
<h1>MSN 文章批量下载 PDF</h1>
<p>每行一个链接</p>
<form method="post">
<textarea name="urls" rows="10"
style="width: 700px; padding: 10px;"
placeholder="每行粘贴一个文章链接"
required></textarea>
<br><br>
<button type="submit" style="padding: 10px 20px;">
批量下载
</button>
</form>
</body>
</html>
"""
@app.route("/", methods=["GET", "POST"])
def index():
    """Serve the URL form on GET; on POST, fetch every URL and return a ZIP of PDFs.

    Fetching is best-effort: a URL that fails to render or yields no content
    is skipped (logged to stdout) rather than aborting the whole batch.
    """
    if request.method == "GET":
        return render_template_string(HTML_TEMPLATE)

    # .get() may return None if the field is missing — default to "".
    urls_text = request.form.get("urls", "")
    # splitlines() handles both "\n" and the "\r\n" browsers submit.
    urls = [u.strip() for u in urls_text.splitlines() if u.strip()]

    zip_buffer = io.BytesIO()
    used_names = set()
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zipf:
        for url in urls:
            try:
                title, content = asyncio.run(fetch_article(url))
                if not content:
                    continue
                pdf_buffer = generate_pdf(title, content)
                # Strip characters illegal on common filesystems, cap length.
                safe_title = re.sub(r'[\\/*?:"<>|]', "", title).strip()[:80]
                if not safe_title:
                    safe_title = "article"  # avoid a bare ".pdf" entry name
                # Disambiguate duplicate titles so ZIP entries don't collide.
                filename = safe_title + ".pdf"
                counter = 2
                while filename in used_names:
                    filename = f"{safe_title} ({counter}).pdf"
                    counter += 1
                used_names.add(filename)
                zipf.writestr(filename, pdf_buffer.read())
            except Exception as e:
                # Best-effort batch: one bad URL must not abort the run.
                print("抓取失败:", url, e)

    zip_buffer.seek(0)
    return send_file(
        zip_buffer,
        as_attachment=True,
        download_name="msn_articles.zip",
        mimetype="application/zip"
    )
# Dev entry point: Flask's built-in debug server — not for production use.
if __name__ == "__main__":
    app.run(debug=True)
三、运行方式
python app.py
打开浏览器:
http://127.0.0.1:5000
四、批量流程
粘贴多个URL
↓
逐个 Playwright 渲染
↓
提取标题 + 正文
↓
生成独立 PDF
↓
打包 ZIP
↓
浏览器下载
五、当前版本能力
| 功能 | 状态 |
|---|---|
| 单篇下载 | ✅ |
| 批量下载 | ✅ |
| 自动标题文件名 | ✅ |
| 去广告 | ✅ |
| ZIP 打包 | ✅ |
| 内存处理 | ✅ |
你想往“企业级采集系统”方向升级吗 😎