Batch-Exporting Specified Articles from a Website with a Python Script
1. Exporting all published articles
First, install Python 3.8 or later on your machine.
Check that the installation succeeded:
pip3 --version
Then install the dependencies:
pip3 install requests beautifulsoup4 markdownify
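If you want to confirm that the three packages import correctly before going further, an optional one-line sanity check (use python instead of python3 on Windows) is:
python3 -c "import requests, bs4, markdownify; print('dependencies ok')"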
Create a new script; any file name works, here it is csdn_downloader.py.
The script content is as follows:
# -*- coding: utf-8 -*-
import re
import requests
import time
import hashlib
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from urllib.parse import urlparse
from pathlib import Path

# ================== Configuration ==================
CSDN_USERNAME = "qq_33417321"     # <-- change to the CSDN username you want to download
SAVE_DIR = Path("csdn_articles")  # root directory for saved articles (cross-platform)

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
    "Referer": "https://blog.csdn.net/"
}


def sanitize_filename(name: str) -> str:
    """Clean a file name: strip the Chinese '原創(chuàng)' (Original) badge and characters Windows forbids."""
    name = name.replace("原創(chuàng)", "").strip()
    # Replace characters that are illegal in Windows file names
    name = re.sub(r'[\\/*?:"<>|\r\n]', "_", name)
    return name or "untitled"


def get_article_list(username):
    """Fetch the blogger's article list (titles and URLs), page by page."""
    url = f"https://blog.csdn.net/{username}/article/list"
    articles = []
    page = 1
    while True:
        response = requests.get(f"{url}/{page}", headers=HEADERS, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        items = soup.select(".article-list .article-item-box")
        if not items:
            break
        for item in items:
            title_elem = item.select_one("h4 a")
            if not title_elem:
                continue
            title = title_elem.text.strip()
            link = title_elem["href"]
            articles.append({"title": title, "url": link})
        page += 1
        time.sleep(1)
    return articles


def download_image(img_url, save_path: Path):
    """Download a single image to a local file."""
    try:
        img_headers = HEADERS.copy()
        img_headers["Accept"] = "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8"
        response = requests.get(img_url, headers=img_headers, stream=True, timeout=30)
        if response.status_code == 200:
            save_path.parent.mkdir(parents=True, exist_ok=True)
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            return True
        else:
            print(f"Image download failed (status {response.status_code}): {img_url}")
            return False
    except Exception as e:
        print(f"Image download error: {img_url}, error: {e}")
        return False


def get_image_extension(img_url):
    """Guess the image extension from the URL, defaulting to .jpg."""
    parsed_url = urlparse(img_url)
    path = parsed_url.path.lower()
    extensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg']
    for ext in extensions:
        if ext in path:
            return ext
    return '.jpg'


def process_images_in_content(content, article_title):
    """Download every image referenced in the article HTML and rewrite its src to the local copy."""
    soup = BeautifulSoup(content, 'html.parser')
    img_tags = soup.find_all('img')
    if not img_tags:
        return content
    # The sanitized title doubles as the per-article image folder name
    safe_title = sanitize_filename(article_title)
    global_image_dir = SAVE_DIR / "images"
    article_image_dir = global_image_dir / safe_title
    for img in img_tags:
        img_url = img.get('src', '')
        if not img_url:
            continue
        # Handle protocol-relative URLs
        if img_url.startswith('//'):
            img_url = 'https:' + img_url
        elif not img_url.startswith(('http://', 'https://')):
            continue  # skip relative paths that cannot be resolved
        try:
            img_hash = hashlib.md5(img_url.encode()).hexdigest()[:8]
            img_ext = get_image_extension(img_url)
            img_filename = f"{img_hash}{img_ext}"
            local_img_path = article_image_dir / img_filename
            # Use forward slashes in Markdown so the links work on every platform
            md_img_path = f"./images/{safe_title}/{img_filename}"
            if not local_img_path.exists():
                print(f"  Downloading image: {img_filename}")
                if download_image(img_url, local_img_path):
                    img['src'] = md_img_path
                else:
                    print(f"  Image download failed, keeping original link: {img_url}")
            else:
                img['src'] = md_img_path
        except Exception as e:
            print(f"  Error while processing image: {img_url}, error: {e}")
            continue
    return str(soup)


def download_article(url, article_title):
    """Download one article, localize its images, and convert the HTML to Markdown."""
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.select_one("article")
        if not content:
            print("  Article body not found")
            return None
        processed_content = process_images_in_content(str(content), article_title)
        markdown_content = md(processed_content)
        return markdown_content
    except Exception as e:
        print(f"  Error while downloading article: {e}")
        return None


def save_to_markdown(title, content, save_dir: Path):
    """Write the article out as a Markdown file."""
    save_dir.mkdir(parents=True, exist_ok=True)
    safe_title = sanitize_filename(title)
    filename = save_dir / f"{safe_title}.md"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(f"# {title}\n\n")
        f.write(content)
    print(f"  Saved: {filename}")
    return filename


if __name__ == "__main__":
    print("Fetching article list...")
    articles = get_article_list(CSDN_USERNAME)
    print(f"Found {len(articles)} articles")
    success_count = 0
    fail_count = 0
    for i, article in enumerate(articles, 1):
        title = article["title"]
        url = article["url"]
        print(f"\n[{i}/{len(articles)}] Processing article: {title}")
        content = download_article(url, title)
        if content:
            save_to_markdown(title, content, SAVE_DIR)
            success_count += 1
        else:
            print(f"  Article download failed: {title}")
            fail_count += 1
        time.sleep(2)
    print(f"\nDone. Succeeded: {success_count}, failed: {fail_count}")
    print(f"Articles saved in: {SAVE_DIR.resolve()}")
    print("Images are stored under the ./images/ subdirectory; the Markdown files can be read offline.")
In the script, change the value of CSDN_USERNAME to the CSDN username whose articles you want to download.
To find the username: click the author's avatar to open their profile; the last path segment of the profile URL is the username.
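If you prefer to extract it programmatically, here is a minimal sketch; it assumes the usual blog.csdn.net/<username> profile URL layout:
from urllib.parse import urlparse

profile_url = "https://blog.csdn.net/qq_33417321"  # paste the profile URL you copied
username = urlparse(profile_url).path.strip("/").split("/")[0]
print(username)  # -> qq_33417321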

Run the script:
python csdn_downloader.py
The script prints a progress log as it runs.

Because every image embedded in the CSDN articles is downloaded as well, the run is slow; just let it finish. While it runs, a csdn_articles folder is created next to the script, containing the Markdown files plus a folder that stores the images those files reference.
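The resulting layout looks roughly like this (the article titles and image hashes below are placeholders):
csdn_articles/
├── Some article title.md
├── Another article title.md
└── images/
    ├── Some article title/
    │   └── 1a2b3c4d.png
    └── Another article title/
        └── 9f8e7d6c.jpg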

2. Exporting articles published after a given date
The script above exports every published article in one go. When there are many articles, a routine backup often only needs the ones published after a certain date. The following script does exactly that; save it as, say, new.py:
# -*- coding: utf-8 -*-
import re
import requests
import time
import hashlib
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from urllib.parse import urlparse
from pathlib import Path
from datetime import datetime
from typing import Optional

# ================== Configuration ==================
CSDN_USERNAME = "qq_33417321"     # <-- change to the CSDN username you want to download
SAVE_DIR = Path("csdn_articles")  # root directory for saved articles (cross-platform)
# New: minimum publish date (inclusive)
MIN_PUBLISH_DATE = datetime(2026, 2, 7)  # only download articles published on or after 2026-02-07

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
    "Referer": "https://blog.csdn.net/"
}


def sanitize_filename(name: str) -> str:
    """Clean a file name: strip the Chinese '原創(chuàng)' (Original) badge and characters Windows forbids."""
    name = name.replace("原創(chuàng)", "").strip()
    name = re.sub(r'[\\/*?:"<>|\r\n]', "_", name)
    return name or "untitled"


def parse_publish_date(date_str: str) -> Optional[datetime]:
    """Parse a CSDN date string such as '2025-06-15 10:30:00' or '2025-06-15'."""
    date_str = date_str.strip()
    for fmt in [
        "%Y-%m-%d %H:%M:%S",  # with seconds, e.g. 2025-06-15 10:30:09
        "%Y-%m-%d %H:%M",     # without seconds, e.g. 2025-06-15 10:30
        "%Y-%m-%d"            # date only, e.g. 2025-06-15
    ]:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            continue
    print(f"  Could not parse date: {date_str}")
    return None


def get_article_list(username, min_date=None):
    """Fetch the blogger's article list (title, URL, publish date), optionally filtered by min_date."""
    url = f"https://blog.csdn.net/{username}/article/list"
    articles = []
    page = 1
    early_stop = False
    while not early_stop:
        print(f"  Fetching page {page}...")
        response = requests.get(f"{url}/{page}", headers=HEADERS, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        items = soup.select(".article-list .article-item-box")
        if not items:
            break
        current_page_has_valid = False  # does this page contain any article that passes the filter?
        for item in items:
            title_elem = item.select_one("h4 a")
            date_elem = item.select_one(".date")  # CSDN usually marks the publish time with the .date class
            if not title_elem:
                continue
            title = title_elem.text.strip()
            link = title_elem["href"]
            pub_date = None
            if date_elem:
                raw_date = date_elem.text.strip()
                pub_date = parse_publish_date(raw_date)
            # Skip articles published before the cut-off date
            if min_date and pub_date and pub_date < min_date:
                continue
            # If the publish date is unknown while a cut-off is set, skip it to be safe (or choose to keep it)
            if min_date and not pub_date:
                print(f"  Warning: could not determine publish date of [{title}], skipping because a date filter is set")
                continue
            articles.append({
                "title": title,
                "url": link,
                "publish_date": pub_date
            })
            current_page_has_valid = True
        # If nothing on this page passed the filter (everything is older than min_date), stop paging early
        if min_date and not current_page_has_valid and page > 1:
            print("  Remaining pages are older than the cut-off date, stopping.")
            early_stop = True
        page += 1
        time.sleep(1)
    return articles


# ========== The functions below are identical to the first script ==========
def download_image(img_url, save_path: Path):
    try:
        img_headers = HEADERS.copy()
        img_headers["Accept"] = "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8"
        response = requests.get(img_url, headers=img_headers, stream=True, timeout=30)
        if response.status_code == 200:
            save_path.parent.mkdir(parents=True, exist_ok=True)
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            return True
        else:
            print(f"Image download failed (status {response.status_code}): {img_url}")
            return False
    except Exception as e:
        print(f"Image download error: {img_url}, error: {e}")
        return False


def get_image_extension(img_url):
    parsed_url = urlparse(img_url)
    path = parsed_url.path.lower()
    extensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg']
    for ext in extensions:
        if ext in path:
            return ext
    return '.jpg'


def process_images_in_content(content, article_title):
    soup = BeautifulSoup(content, 'html.parser')
    img_tags = soup.find_all('img')
    if not img_tags:
        return content
    safe_title = sanitize_filename(article_title)
    global_image_dir = SAVE_DIR / "images"
    article_image_dir = global_image_dir / safe_title
    for img in img_tags:
        img_url = img.get('src', '')
        if not img_url:
            continue
        if img_url.startswith('//'):
            img_url = 'https:' + img_url
        elif not img_url.startswith(('http://', 'https://')):
            continue
        try:
            img_hash = hashlib.md5(img_url.encode()).hexdigest()[:8]
            img_ext = get_image_extension(img_url)
            img_filename = f"{img_hash}{img_ext}"
            local_img_path = article_image_dir / img_filename
            md_img_path = f"./images/{safe_title}/{img_filename}"
            if not local_img_path.exists():
                print(f"  Downloading image: {img_filename}")
                if download_image(img_url, local_img_path):
                    img['src'] = md_img_path
                else:
                    print(f"  Image download failed, keeping original link: {img_url}")
            else:
                img['src'] = md_img_path
        except Exception as e:
            print(f"  Error while processing image: {img_url}, error: {e}")
            continue
    return str(soup)


def download_article(url, article_title):
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.select_one("article")
        if not content:
            print("  Article body not found")
            return None
        processed_content = process_images_in_content(str(content), article_title)
        markdown_content = md(processed_content)
        return markdown_content
    except Exception as e:
        print(f"  Error while downloading article: {e}")
        return None


def save_to_markdown(title, content, save_dir: Path):
    save_dir.mkdir(parents=True, exist_ok=True)
    safe_title = sanitize_filename(title)
    filename = save_dir / f"{safe_title}.md"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(f"# {title}\n\n")
        f.write(content)
    print(f"  Saved: {filename}")
    return filename


# ========== Main entry point ==========
if __name__ == "__main__":
    print("Fetching article list...")
    articles = get_article_list(CSDN_USERNAME, min_date=MIN_PUBLISH_DATE)
    print(f"Found {len(articles)} matching articles (published on or after {MIN_PUBLISH_DATE.strftime('%Y-%m-%d')})")
    success_count = 0
    fail_count = 0
    for i, article in enumerate(articles, 1):
        title = article["title"]
        url = article["url"]
        pub_date = article.get("publish_date")
        date_str = pub_date.strftime("%Y-%m-%d") if pub_date else "unknown"
        print(f"\n[{i}/{len(articles)}] Processing article: {title} (published {date_str})")
        content = download_article(url, title)
        if content:
            save_to_markdown(title, content, SAVE_DIR)
            success_count += 1
        else:
            print(f"  Article download failed: {title}")
            fail_count += 1
        time.sleep(2)
    print(f"\nDone. Succeeded: {success_count}, failed: {fail_count}")
    print(f"Articles saved in: {SAVE_DIR.resolve()}")
    print("Images are stored under the ./images/ subdirectory; the Markdown files can be read offline.")
Remember to change the start date in MIN_PUBLISH_DATE, then run the script (python new.py); only articles published on or after that date are downloaded.
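If editing the script before every backup feels tedious, an optional variant (not part of the original script) is to read the cut-off date from a command-line argument and set MIN_PUBLISH_DATE from it instead of hard-coding the value:
# Hypothetical tweak: derive MIN_PUBLISH_DATE from a CLI argument instead of a constant.
import argparse
from datetime import datetime

parser = argparse.ArgumentParser(description="Download CSDN articles published on or after a date")
parser.add_argument("--since", default="2026-02-07", help="cut-off date in YYYY-MM-DD format")
args = parser.parse_args()
MIN_PUBLISH_DATE = datetime.strptime(args.since, "%Y-%m-%d")

Usage would then be: python new.py --since 2026-02-07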
This concludes the walkthrough of batch-exporting specified articles from a website with Python scripts. For more on exporting website content with Python, search 腳本之家 for related articles.