Batch-Exporting Specified Articles from a Website with a Python Script
1. Exporting all published articles
First, install Python 3.8 or later on your machine.
Check that the installation succeeded:
pip3 --version
Then install the dependencies:
pip3 install requests beautifulsoup4 markdownify
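If you want to confirm that the three packages import correctly before going further, an optional one-line sanity check (use python instead of python3 on Windows) is:
python3 -c "import requests, bs4, markdownify; print('dependencies ok')"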
Create a new script; any file name works, here it is csdn_downloader.py.
The script content is as follows:
# -*- coding: utf-8 -*-
import re
import requests
import time
import hashlib
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from urllib.parse import urlparse
from pathlib import Path

# ================== Configuration ==================
CSDN_USERNAME = "qq_33417321"     # <-- change to the CSDN username you want to download
SAVE_DIR = Path("csdn_articles")  # root directory for saved articles (cross-platform)

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
    "Referer": "https://blog.csdn.net/"
}


def sanitize_filename(name: str) -> str:
    """Clean a file name: strip the Chinese '原創(chuàng)' (Original) badge and characters Windows forbids."""
    name = name.replace("原創(chuàng)", "").strip()
    # Replace characters that are illegal in Windows file names
    name = re.sub(r'[\\/*?:"<>|\r\n]', "_", name)
    return name or "untitled"


def get_article_list(username):
    """Fetch the blogger's article list (titles and URLs), page by page."""
    url = f"https://blog.csdn.net/{username}/article/list"
    articles = []
    page = 1
    while True:
        response = requests.get(f"{url}/{page}", headers=HEADERS, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        items = soup.select(".article-list .article-item-box")
        if not items:
            break
        for item in items:
            title_elem = item.select_one("h4 a")
            if not title_elem:
                continue
            title = title_elem.text.strip()
            link = title_elem["href"]
            articles.append({"title": title, "url": link})
        page += 1
        time.sleep(1)
    return articles


def download_image(img_url, save_path: Path):
    """Download a single image to a local file."""
    try:
        img_headers = HEADERS.copy()
        img_headers["Accept"] = "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8"
        response = requests.get(img_url, headers=img_headers, stream=True, timeout=30)
        if response.status_code == 200:
            save_path.parent.mkdir(parents=True, exist_ok=True)
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            return True
        else:
            print(f"Image download failed (status {response.status_code}): {img_url}")
            return False
    except Exception as e:
        print(f"Image download error: {img_url}, error: {e}")
        return False


def get_image_extension(img_url):
    """Guess the image extension from the URL, defaulting to .jpg."""
    parsed_url = urlparse(img_url)
    path = parsed_url.path.lower()
    extensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg']
    for ext in extensions:
        if ext in path:
            return ext
    return '.jpg'


def process_images_in_content(content, article_title):
    """Download every image referenced in the article HTML and rewrite its src to the local copy."""
    soup = BeautifulSoup(content, 'html.parser')
    img_tags = soup.find_all('img')
    if not img_tags:
        return content
    # The sanitized title doubles as the per-article image folder name
    safe_title = sanitize_filename(article_title)
    global_image_dir = SAVE_DIR / "images"
    article_image_dir = global_image_dir / safe_title
    for img in img_tags:
        img_url = img.get('src', '')
        if not img_url:
            continue
        # Handle protocol-relative URLs
        if img_url.startswith('//'):
            img_url = 'https:' + img_url
        elif not img_url.startswith(('http://', 'https://')):
            continue  # skip relative paths that cannot be resolved
        try:
            img_hash = hashlib.md5(img_url.encode()).hexdigest()[:8]
            img_ext = get_image_extension(img_url)
            img_filename = f"{img_hash}{img_ext}"
            local_img_path = article_image_dir / img_filename
            # Use forward slashes in Markdown so the links work on every platform
            md_img_path = f"./images/{safe_title}/{img_filename}"
            if not local_img_path.exists():
                print(f"  Downloading image: {img_filename}")
                if download_image(img_url, local_img_path):
                    img['src'] = md_img_path
                else:
                    print(f"  Image download failed, keeping original link: {img_url}")
            else:
                img['src'] = md_img_path
        except Exception as e:
            print(f"  Error while processing image: {img_url}, error: {e}")
            continue
    return str(soup)


def download_article(url, article_title):
    """Download one article, localize its images, and convert the HTML to Markdown."""
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.select_one("article")
        if not content:
            print("  Article body not found")
            return None
        processed_content = process_images_in_content(str(content), article_title)
        markdown_content = md(processed_content)
        return markdown_content
    except Exception as e:
        print(f"  Error while downloading article: {e}")
        return None


def save_to_markdown(title, content, save_dir: Path):
    """Write the article out as a Markdown file."""
    save_dir.mkdir(parents=True, exist_ok=True)
    safe_title = sanitize_filename(title)
    filename = save_dir / f"{safe_title}.md"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(f"# {title}\n\n")
        f.write(content)
    print(f"  Saved: {filename}")
    return filename


if __name__ == "__main__":
    print("Fetching article list...")
    articles = get_article_list(CSDN_USERNAME)
    print(f"Found {len(articles)} articles")
    success_count = 0
    fail_count = 0
    for i, article in enumerate(articles, 1):
        title = article["title"]
        url = article["url"]
        print(f"\n[{i}/{len(articles)}] Processing article: {title}")
        content = download_article(url, title)
        if content:
            save_to_markdown(title, content, SAVE_DIR)
            success_count += 1
        else:
            print(f"  Article download failed: {title}")
            fail_count += 1
        time.sleep(2)
    print(f"\nDone. Succeeded: {success_count}, failed: {fail_count}")
    print(f"Articles saved in: {SAVE_DIR.resolve()}")
    print("Images are stored under the ./images/ subdirectory; the Markdown files can be read offline.")
In the script, change the value of CSDN_USERNAME to the CSDN username whose articles you want to download.
To find the username: click the author's avatar to open their profile; the last path segment of the profile URL is the username.
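If you prefer to extract it programmatically, here is a minimal sketch; it assumes the usual blog.csdn.net/<username> profile URL layout:
from urllib.parse import urlparse

profile_url = "https://blog.csdn.net/qq_33417321"  # paste the profile URL you copied
username = urlparse(profile_url).path.strip("/").split("/")[0]
print(username)  # -> qq_33417321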

Run the script:
python csdn_downloader.py
The script prints a progress log as it runs.

Because every image embedded in the CSDN articles is downloaded as well, the run is slow; just let it finish. While it runs, a csdn_articles folder is created next to the script, containing the Markdown files plus a folder that stores the images those files reference.
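The resulting layout looks roughly like this (the article titles and image hashes below are placeholders):
csdn_articles/
├── Some article title.md
├── Another article title.md
└── images/
    ├── Some article title/
    │   └── 1a2b3c4d.png
    └── Another article title/
        └── 9f8e7d6c.jpg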

2. Exporting articles published after a given date
The script above exports every published article in one go. When there are many articles, a routine backup often only needs the ones published after a certain date. The following script does exactly that; save it as, say, new.py:
# -*- coding: utf-8 -*-
import re
import requests
import time
import hashlib
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from urllib.parse import urlparse
from pathlib import Path
from datetime import datetime
from typing import Optional

# ================== Configuration ==================
CSDN_USERNAME = "qq_33417321"     # <-- change to the CSDN username you want to download
SAVE_DIR = Path("csdn_articles")  # root directory for saved articles (cross-platform)
# New: minimum publish date (inclusive)
MIN_PUBLISH_DATE = datetime(2026, 2, 7)  # only download articles published on or after 2026-02-07

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
    "Referer": "https://blog.csdn.net/"
}


def sanitize_filename(name: str) -> str:
    """Clean a file name: strip the Chinese '原創(chuàng)' (Original) badge and characters Windows forbids."""
    name = name.replace("原創(chuàng)", "").strip()
    name = re.sub(r'[\\/*?:"<>|\r\n]', "_", name)
    return name or "untitled"


def parse_publish_date(date_str: str) -> Optional[datetime]:
    """Parse a CSDN date string such as '2025-06-15 10:30:00' or '2025-06-15'."""
    date_str = date_str.strip()
    for fmt in [
        "%Y-%m-%d %H:%M:%S",  # with seconds, e.g. 2025-06-15 10:30:09
        "%Y-%m-%d %H:%M",     # without seconds, e.g. 2025-06-15 10:30
        "%Y-%m-%d"            # date only, e.g. 2025-06-15
    ]:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            continue
    print(f"  Could not parse date: {date_str}")
    return None


def get_article_list(username, min_date=None):
    """Fetch the blogger's article list (title, URL, publish date), optionally filtered by min_date."""
    url = f"https://blog.csdn.net/{username}/article/list"
    articles = []
    page = 1
    early_stop = False
    while not early_stop:
        print(f"  Fetching page {page}...")
        response = requests.get(f"{url}/{page}", headers=HEADERS, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        items = soup.select(".article-list .article-item-box")
        if not items:
            break
        current_page_has_valid = False  # does this page contain any article that passes the filter?
        for item in items:
            title_elem = item.select_one("h4 a")
            date_elem = item.select_one(".date")  # CSDN usually marks the publish time with the .date class
            if not title_elem:
                continue
            title = title_elem.text.strip()
            link = title_elem["href"]
            pub_date = None
            if date_elem:
                raw_date = date_elem.text.strip()
                pub_date = parse_publish_date(raw_date)
            # Skip articles published before the cut-off date
            if min_date and pub_date and pub_date < min_date:
                continue
            # If the publish date is unknown while a cut-off is set, skip it to be safe (or choose to keep it)
            if min_date and not pub_date:
                print(f"  Warning: could not determine publish date of [{title}], skipping because a date filter is set")
                continue
            articles.append({
                "title": title,
                "url": link,
                "publish_date": pub_date
            })
            current_page_has_valid = True
        # If nothing on this page passed the filter (everything is older than min_date), stop paging early
        if min_date and not current_page_has_valid and page > 1:
            print("  Remaining pages are older than the cut-off date, stopping.")
            early_stop = True
        page += 1
        time.sleep(1)
    return articles


# ========== The functions below are identical to the first script ==========
def download_image(img_url, save_path: Path):
    try:
        img_headers = HEADERS.copy()
        img_headers["Accept"] = "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8"
        response = requests.get(img_url, headers=img_headers, stream=True, timeout=30)
        if response.status_code == 200:
            save_path.parent.mkdir(parents=True, exist_ok=True)
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            return True
        else:
            print(f"Image download failed (status {response.status_code}): {img_url}")
            return False
    except Exception as e:
        print(f"Image download error: {img_url}, error: {e}")
        return False


def get_image_extension(img_url):
    parsed_url = urlparse(img_url)
    path = parsed_url.path.lower()
    extensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg']
    for ext in extensions:
        if ext in path:
            return ext
    return '.jpg'


def process_images_in_content(content, article_title):
    soup = BeautifulSoup(content, 'html.parser')
    img_tags = soup.find_all('img')
    if not img_tags:
        return content
    safe_title = sanitize_filename(article_title)
    global_image_dir = SAVE_DIR / "images"
    article_image_dir = global_image_dir / safe_title
    for img in img_tags:
        img_url = img.get('src', '')
        if not img_url:
            continue
        if img_url.startswith('//'):
            img_url = 'https:' + img_url
        elif not img_url.startswith(('http://', 'https://')):
            continue
        try:
            img_hash = hashlib.md5(img_url.encode()).hexdigest()[:8]
            img_ext = get_image_extension(img_url)
            img_filename = f"{img_hash}{img_ext}"
            local_img_path = article_image_dir / img_filename
            md_img_path = f"./images/{safe_title}/{img_filename}"
            if not local_img_path.exists():
                print(f"  Downloading image: {img_filename}")
                if download_image(img_url, local_img_path):
                    img['src'] = md_img_path
                else:
                    print(f"  Image download failed, keeping original link: {img_url}")
            else:
                img['src'] = md_img_path
        except Exception as e:
            print(f"  Error while processing image: {img_url}, error: {e}")
            continue
    return str(soup)


def download_article(url, article_title):
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        content = soup.select_one("article")
        if not content:
            print("  Article body not found")
            return None
        processed_content = process_images_in_content(str(content), article_title)
        markdown_content = md(processed_content)
        return markdown_content
    except Exception as e:
        print(f"  Error while downloading article: {e}")
        return None


def save_to_markdown(title, content, save_dir: Path):
    save_dir.mkdir(parents=True, exist_ok=True)
    safe_title = sanitize_filename(title)
    filename = save_dir / f"{safe_title}.md"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(f"# {title}\n\n")
        f.write(content)
    print(f"  Saved: {filename}")
    return filename


# ========== Main entry point ==========
if __name__ == "__main__":
    print("Fetching article list...")
    articles = get_article_list(CSDN_USERNAME, min_date=MIN_PUBLISH_DATE)
    print(f"Found {len(articles)} matching articles (published on or after {MIN_PUBLISH_DATE.strftime('%Y-%m-%d')})")
    success_count = 0
    fail_count = 0
    for i, article in enumerate(articles, 1):
        title = article["title"]
        url = article["url"]
        pub_date = article.get("publish_date")
        date_str = pub_date.strftime("%Y-%m-%d") if pub_date else "unknown"
        print(f"\n[{i}/{len(articles)}] Processing article: {title} (published {date_str})")
        content = download_article(url, title)
        if content:
            save_to_markdown(title, content, SAVE_DIR)
            success_count += 1
        else:
            print(f"  Article download failed: {title}")
            fail_count += 1
        time.sleep(2)
    print(f"\nDone. Succeeded: {success_count}, failed: {fail_count}")
    print(f"Articles saved in: {SAVE_DIR.resolve()}")
    print("Images are stored under the ./images/ subdirectory; the Markdown files can be read offline.")
Remember to change the start date in MIN_PUBLISH_DATE, then run the script (python new.py); only articles published on or after that date are downloaded.
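If editing the script before every backup feels tedious, an optional variant (not part of the original script) is to read the cut-off date from a command-line argument and set MIN_PUBLISH_DATE from it instead of hard-coding the value:
# Hypothetical tweak: derive MIN_PUBLISH_DATE from a CLI argument instead of a constant.
import argparse
from datetime import datetime

parser = argparse.ArgumentParser(description="Download CSDN articles published on or after a date")
parser.add_argument("--since", default="2026-02-07", help="cut-off date in YYYY-MM-DD format")
args = parser.parse_args()
MIN_PUBLISH_DATE = datetime.strptime(args.since, "%Y-%m-%d")

Usage would then be: python new.py --since 2026-02-07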
This concludes the walkthrough of batch-exporting specified articles from a website with Python scripts. For more on exporting website content with Python, search 腳本之家 for related articles.