20260302 100639 Msn文章下载工具

20260302_100639_MSN文章下载工具.md

我们把它升级成 专业批量下载版,支持:

  • ✅ 多个 URL 一次粘贴(每行一个)
  • ✅ 自动识别文章标题作为文件名
  • ✅ 自动清理非法字符
  • ✅ 批量打包成 ZIP 下载
  • ✅ 稳定 Playwright 渲染
  • ✅ 内存生成,无临时文件

一、安装依赖

pip install flask playwright beautifulsoup4 reportlab
playwright install

二、完整升级版 Flask(批量 + 自动文件名 + ZIP)

保存为:

app.py

✅ 完整代码

import asyncio
import io
import re
import zipfile
from xml.sax.saxutils import escape

from bs4 import BeautifulSoup
from flask import Flask, render_template_string, request, send_file
from playwright.async_api import async_playwright
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer

app = Flask(__name__)

# =========================
# Fetch article
# =========================

# Boilerplate/ad phrases: any paragraph containing one of these is dropped.
_AD_KEYWORDS = (
    "advertisement",
    "sponsored",
    "read more",
    "subscribe",
    "sign up",
    "related",
    "continue reading",
)


async def fetch_article(url):
    """Render *url* in headless Chromium and extract (title, body text).

    Args:
        url: article URL; the page is assumed to contain an <article> tag
            once client-side rendering finishes.

    Returns:
        tuple[str, str]: the first <h1> text (or a placeholder) and the
        cleaned body text — empty string when no <article> was found.
    """
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, timeout=60000)
            # MSN renders the article client-side; wait until it appears.
            await page.wait_for_selector("article")
            html = await page.content()
        finally:
            # Close the browser even when navigation/extraction raises,
            # so a failed URL does not leave a Chromium instance behind.
            await browser.close()

    soup = BeautifulSoup(html, "html.parser")

    title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else "No Title Found"

    article = soup.find("article")
    paragraphs = []
    if article:
        for node in article.find_all("p"):
            text = node.get_text(strip=True)
            if not text:
                continue
            # Lowercase once per paragraph, not once per keyword.
            lowered = text.lower()
            if any(keyword in lowered for keyword in _AD_KEYWORDS):
                continue
            paragraphs.append(text)

    return title, clean_text("\n\n".join(paragraphs))


def clean_text(text):
    """Collapse runs of blank/whitespace-only lines into a single blank
    line and strip leading/trailing whitespace."""
    collapsed = re.sub(r'\n\s*\n', '\n\n', text)
    return collapsed.strip()


# =========================
# Generate PDF
# =========================
def generate_pdf(title, content):
    """Render *title* and *content* into a PDF held entirely in memory.

    Args:
        title: article headline, rendered as a Heading1 at the top.
        content: body text with paragraphs separated by blank lines.

    Returns:
        io.BytesIO positioned at offset 0, ready for ZIP/HTTP streaming.
    """
    buffer = io.BytesIO()
    doc = SimpleDocTemplate(buffer, pagesize=letter)

    styles = getSampleStyleSheet()
    title_style = styles["Heading1"]
    body_style = styles["BodyText"]

    # reportlab's Paragraph parses its text as mini-XML markup, so raw
    # "&", "<", ">" in scraped text would raise a parse error — escape
    # everything that came from the page before handing it over.
    elements = [
        Paragraph(escape(title), title_style),
        Spacer(1, 0.5 * inch),
    ]

    for paragraph in content.split("\n\n"):
        elements.append(Paragraph(escape(paragraph), body_style))
        elements.append(Spacer(1, 0.2 * inch))

    doc.build(elements)
    buffer.seek(0)
    return buffer


# =========================
# HTML page
# =========================
# Minimal single-page UI: a <textarea> accepting one article URL per line,
# POSTed back to "/" where index() handles the batch download. Served via
# render_template_string, so no template directory is needed.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
    <title>MSN 批量下载器</title>
</head>
<body style="font-family: Arial; text-align: center; margin-top: 50px;">
    <h1>MSN 文章批量下载 PDF</h1>
    <p>每行一个链接</p>
    <form method="post">
        <textarea name="urls" rows="10"
                  style="width: 700px; padding: 10px;"
                  placeholder="每行粘贴一个文章链接"
                  required></textarea>
        <br><br>
        <button type="submit" style="padding: 10px 20px;">
            批量下载
        </button>
    </form>
</body>
</html>
"""


@app.route("/", methods=["GET", "POST"])
def index():
    """Serve the URL form (GET) or batch-download the pasted URLs (POST).

    On POST, every non-blank line of the "urls" field is fetched, rendered
    to a PDF and packed into an in-memory ZIP streamed back to the browser
    as "msn_articles.zip". A URL that fails is logged and skipped so one
    bad link does not abort the whole batch.
    """
    if request.method != "POST":
        return render_template_string(HTML_TEMPLATE)

    # .get(..., "") avoids AttributeError when the field is missing;
    # splitlines() copes with \r\n submitted by Windows browsers.
    urls_text = request.form.get("urls", "")
    urls = [line.strip() for line in urls_text.splitlines() if line.strip()]

    zip_buffer = io.BytesIO()
    used_names = set()

    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zipf:
        for i, url in enumerate(urls, start=1):
            try:
                title, content = asyncio.run(fetch_article(url))

                if not content:
                    continue

                pdf_buffer = generate_pdf(title, content)

                # Strip characters illegal in file names and cap length.
                safe_title = re.sub(r'[\\/*?:"<>|]', "", title).strip()[:80]
                if not safe_title:
                    # Title sanitized down to nothing — use a fallback so
                    # we never emit an entry literally named ".pdf".
                    safe_title = f"article_{i}"

                # De-duplicate: two articles with the same title must not
                # produce colliding entry names inside the archive.
                filename = safe_title + ".pdf"
                suffix = 2
                while filename in used_names:
                    filename = f"{safe_title}_{suffix}.pdf"
                    suffix += 1
                used_names.add(filename)

                zipf.writestr(filename, pdf_buffer.read())

            except Exception as e:
                # Best-effort batch: report the failure and move on.
                print("抓取失败:", url, e)

    zip_buffer.seek(0)

    return send_file(
        zip_buffer,
        as_attachment=True,
        download_name="msn_articles.zip",
        mimetype="application/zip",
    )


if __name__ == "__main__":
    # Development server only: debug=True enables the auto-reloader and the
    # interactive debugger — never expose this configuration to the internet.
    app.run(debug=True)

三、运行方式

python app.py

打开浏览器:

http://127.0.0.1:5000

四、批量流程

粘贴多个URL
        ↓
逐个 Playwright 渲染
        ↓
提取标题 + 正文
        ↓
生成独立 PDF
        ↓
打包 ZIP
        ↓
浏览器下载

五、当前版本能力

功能 | 状态
单篇下载 | ✅
批量下载 | ✅
自动标题文件名 | ✅
去广告 | ✅
ZIP 打包 | ✅
内存处理 | ✅

你想往“企业级采集系统”方向升级吗 😎