Python使用pypandoc將markdown文件和LaTeX公式轉為word

更新時間：2025年04月03日 10:02:45 作者：風暴之零

pypandoc 是一個用于 pandoc 的輕量級 Python 包裝器，支持多種格式的文檔轉換，下面我們來看看如何使用 pypandoc 將 markdown 文件和 LaTeX 公式轉為 word 吧

一、pypandoc 介紹

1. 項目介紹

pypandoc 是一個用于 pandoc 的輕量級 Python 包裝器。pandoc 是一個通用的文檔轉換工具，支持多種格式的文檔轉換，如 Markdown、HTML、LaTeX、DocBook 等。pypandoc 通過提供一個簡單的 Python 接口，使得在 Python 腳本中調用 pandoc 變得更加方便。

2. 安裝

使用pip安裝

pip install pypandoc_binary

自動下載 Pandoc 并安裝

注意：pypandoc 提供了兩個包：

pypandoc：需要用戶自行安裝 pandoc軟件才能使用。

pypandoc_binary：包含了預編譯的 pandoc 二進制文件，方便用戶快速上手。

手動安裝

可以手動安裝 pandoc 再安裝 pypandoc 庫

pip install pypandoc

也可以先安裝 pypandoc，然后再在 python 中運行 pypandoc.download_pandoc() 函數自動下載并安裝 Pandoc，將其存放在 pypandoc 可以訪問的目錄中。

二、使用Python 將markdown轉(zhuǎn)Word

本腳本實現了三類功能

1、將markdown文件轉(zhuǎn)為word文件

2、將 markdown 中段落開頭的“-”轉為回車，避免渲染成黑點或者空心圓等 Word 中不常見的符號

3、自定義了模板，格式化輸出。

import pypandoc
import time
import re

# Paths: Markdown input, Word output, and the reference .docx whose styles
# pandoc applies to the generated document.
path1 = r"md.md"
# NOTE(review): the output name is only an extension (".docx"); it looks like a
# placeholder that the reader is expected to replace — confirm before reuse.
path2 = r".docx"
template_path = r"D:\aTools\ytemplates\templates_s.docx"

# Read the original Markdown text.
with open(path1, 'r', encoding='utf-8') as file:
    content = file.read()

# Replace list bullets with a newline so Word does not render bullet glyphs.
# The pattern is anchored to the start of a line (MULTILINE): an unanchored
# '- ' would also break inline dashes ("a - b") and table rules ("| --- |").
processed_content = re.sub(r'^[ \t]*- ', '\n', content, flags=re.MULTILINE)

# Start timing the conversion.
t1 = time.time()

# Convert the pre-processed Markdown to a Word document, styled by the template.
pypandoc.convert_text(
    processed_content,
    'docx',
    format='md',
    outputfile=path2,
    extra_args=['--reference-doc', template_path]
)

# Report elapsed seconds and completion.
print(time.time() - t1)
print("轉換完成！")

三、直接指定Word格式

直接讀取文件（可以為 txt 或者 md）轉為指定格式的 word。

這里格式是：

1、將 markdown 中段落開頭的“-”轉為回車，避免渲染成黑點或者空心圓等 Word 中不常見的符號

2、將原來加粗部分繼續加粗和左對齊

3、字體為黑色GB2312

注意：代碼用正則替換 #### 這些時需要先從 4 級標題開始替換，否則會有邏輯錯誤，導致奇數個 # 無法替換。

設置中文字體不能用 run.font.name = '仿宋_GB2312'，而是要用 style._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋_GB2312') 設置中文字體。

import re
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn

def set_font_color(run):
    """Give ``run`` the uniform body look used throughout the document.

    The run ends up 12 pt, black and non-italic, with 'Times New Roman' as
    its font name and '仿宋_GB2312' as its East Asian font.
    """
    font = run.font
    font.name = 'Times New Roman'
    # The East Asian font is written on rFonts directly; it is done after the
    # name assignment, in the same order as the original code.
    run._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋_GB2312')
    font.size = Pt(12)
    font.color.rgb = RGBColor(0, 0, 0)
    run.italic = False

def process_content(line, paragraph):
    """Write ``line`` into ``paragraph``, turning ``**text**`` spans into bold runs.

    Args:
        line: One line of source text, possibly containing ``**bold**`` markers.
        paragraph: Object exposing ``add_run(text)``; each run it returns is
            styled with ``set_font_color``.
    """
    # Splitting on a pattern with one capture group yields alternating
    # pieces: even indexes are plain text, odd indexes are the bold captures.
    pieces = re.split(r'\*\*(.*?)\*\*', line)

    if len(pieces) == 1:
        # No bold markers: the whole line (even an empty one) is a single run.
        set_font_color(paragraph.add_run(line))
        return

    for index, piece in enumerate(pieces):
        is_bold = index % 2 == 1
        if not is_bold and not piece:
            # Empty plain segments produce no run; bold captures always do.
            continue
        run = paragraph.add_run(piece)
        if is_bold:
            run.bold = True
        set_font_color(run)

def mdtxt2word(txt_path, docx_path):
    """Convert a Markdown-flavoured text file into a uniformly styled .docx.

    Line-leading ``- `` bullets become blank lines, lines starting with ``#``
    become headings (level capped at 4), and every other line becomes a
    left-aligned paragraph. ``**bold**`` spans are kept bold via
    ``process_content``.

    Args:
        txt_path: Path of the UTF-8 encoded .txt/.md file to read.
        docx_path: Path the resulting Word document is saved to.
    """
    with open(txt_path, 'r', encoding='utf-8') as file:
        # Only a "- " that starts a line is a bullet. Anchoring with MULTILINE
        # keeps inline dashes ("a - b") from being split into new lines.
        content = re.sub(r'^[ \t]*- ', '\n', file.read(), flags=re.MULTILINE)

    doc = Document()
    # Default look: Times New Roman / 仿宋_GB2312 (East Asian), 12 pt, black.
    style = doc.styles['Normal']
    style.font.name = 'Times New Roman'
    style._element.rPr.rFonts.set(qn('w:eastAsia'), '仿宋_GB2312')
    style.font.size = Pt(12)
    style.font.color.rgb = RGBColor(0, 0, 0)

    # Any run of leading '#' marks a heading. Matching '#+' rather than
    # '#{1,4}' lets the level cap below apply, so "##### x" becomes a
    # level-4 heading "x" instead of a heading titled "# x".
    heading_pattern = re.compile(r'^\s*(#+)\s*(.*?)\s*$')

    for line in content.split('\n'):
        heading_match = heading_pattern.match(line)
        if heading_match:
            title_text = heading_match.group(2).strip()
            if not title_text:
                continue  # skip headings that carry no text

            # Heading level follows the number of '#', capped at 4.
            level = min(len(heading_match.group(1)), 4)
            heading = doc.add_heading(level=level)
            heading.alignment = WD_ALIGN_PARAGRAPH.LEFT

            # Headings may contain **bold** markers too.
            process_content(title_text, heading)
            continue

        # Ordinary paragraph.
        paragraph = doc.add_paragraph()
        paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
        process_content(line, paragraph)

    doc.save(docx_path)
    print("轉換完成！")

if __name__ == "__main__":
    # Example run: source text file and the Word file to produce.
    txt_path = r"C:\Users\xueshifeng\Desktop\11.txt"
    docx_path = r"C:\Users\xueshifeng\Desktop\11.docx"
    mdtxt2word(txt_path=txt_path, docx_path=docx_path)

四、將LaTeX公式轉為Word

將 latex_content 字符串中 $ $ 中間的位置替換為公式，或者直接復制代碼到 GPT，讓 GPT 修改代碼

import pypandoc

# A minimal LaTeX document carrying the formula to convert.
# Replace the expression between the two `$` signs with your own formula.
latex_content = r"""
\documentclass{article}
\usepackage{amsmath} % 確保包含用于數學排版的包
\begin{document}

$ L(y_i, f(x_i)) = \max(0, 1 - y_if(x_i)) $


\end{document}
"""

# Destination Word document.
output_file = r"xx14.docx"

# pypandoc reports a failed conversion by raising, not through its return
# value, so success is decided by whether the call raises.
try:
    output = pypandoc.convert_text(
        latex_content,  # input string
        'docx',         # output format
        format='latex', # input format
        outputfile=output_file,  # output file path
        # NOTE(review): the pandoc manual lists --mathml as an option for
        # HTML/EPUB/DocBook-style writers; it is presumably a no-op for docx,
        # where math is written as native Word equations — confirm.
        extra_args=['--mathml']
    )
except (RuntimeError, OSError) as exc:
    print(f"轉換過程中出現錯誤: {exc}")
else:
    print(f"Word 文檔已生成: {output_file}")

五、將LaTeX公式轉為Word，追加寫入Word

難點在于如何管理文件句柄，沒有找到好方法，采用對已打開的文檔，先關閉再打開的方法。

import os
import pypandoc
from docx import Document
import tempfile
import time
import pythoncom
from win32com.client import Dispatch  # 需要安裝pywin32庫(kù)

def is_file_locked(filepath):
    """Report whether ``filepath`` exists but cannot be opened for writing.

    Args:
        filepath: Path of the file to probe.

    Returns:
        True when opening the file for appending raises ``PermissionError``;
        False when the file can be opened or does not exist.
    """
    try:
        # Open for appending WITHOUT O_CREAT: the builtin open(path, 'a')
        # would create an empty file when the path is missing, and a probe
        # must not leave files behind.
        fd = os.open(filepath, os.O_WRONLY | os.O_APPEND)
    except PermissionError:
        return True
    except FileNotFoundError:
        return False
    os.close(fd)
    return False

def close_word_document(filepath):
    """Save and close ``filepath`` if it is open in Word (via COM automation).

    Args:
        filepath: Path of the document to look for among Word's open documents.

    Returns:
        True when a matching open document was saved and closed; False when
        none matched or any error occurred (the error is printed, not raised).
    """
    try:
        # Compare absolute, lower-cased paths; computed once for all documents.
        target = os.path.abspath(filepath).lower()
        word = Dispatch("Word.Application")
        for doc in word.Documents:
            if doc.FullName.lower() == target:
                doc.Save()
                doc.Close()
                print("已保存并關閉Word文檔")
                return True
        # NOTE(review): Quit() runs only when the document was NOT found. If
        # Dispatch attached to the user's running Word, this would close their
        # other documents — confirm this is intended.
        word.Quit()
    except Exception as e:
        # Best-effort helper: report the problem and let the caller decide.
        print(f"關閉Word文檔失敗：{e}")
    return False

def generate_latex_content(formula):
    """Build a complete LaTeX document that wraps ``formula`` in inline math.

    Args:
        formula: LaTeX math source without the surrounding ``$`` delimiters.

    Returns:
        The LaTeX document as a string: an ``article`` using ``amsmath`` whose
        body is "開始：", the formula as ``$...$``, then "結束。".
    """
    # Raw f-string: backslashes stay literal and doubled braces emit the
    # single braces LaTeX needs.
    return fr"""
    \documentclass{{article}}
    \usepackage{{amsmath}}
    \begin{{document}}

    開始：

    ${formula}$

    結束。
    \end{{document}}
    
    """
    
def _append_body_elements(target_doc, source_doc):
    """Move the body content of ``source_doc`` to the end of ``target_doc``."""
    target_body = target_doc.element.body

    def _is_sect_pr(element):
        return str(element.tag).endswith('}sectPr')

    # Section properties must remain the last child of the body, so new
    # content is inserted in front of the target's own <w:sectPr> if present.
    target_sect_pr = next(
        (child for child in reversed(list(target_body)) if _is_sect_pr(child)),
        None,
    )

    # Iterate over a snapshot: moving an element into another tree while
    # iterating its parent makes the iteration skip siblings.
    for element in list(source_doc.element.body):
        if _is_sect_pr(element):
            continue  # keep the target's page setup, not the temp file's
        if target_sect_pr is not None:
            target_sect_pr.addprevious(element)
        else:
            target_body.append(element)


def doc_creat(user_formula, output_file):
    """Render ``user_formula`` with pandoc and append it to ``output_file``.

    The formula is wrapped in a LaTeX document, converted to a temporary
    .docx, and its body is appended to the target document, which is created
    first if missing. If the target is locked, the function tries to close it
    in Word and retries (up to 3 attempts). On success the document is opened
    with ``os.startfile``. Errors are printed, not raised.

    Args:
        user_formula: LaTeX math source without ``$`` delimiters.
        output_file: Path of the Word document to create or append to.
    """
    # Make sure the target exists so the lock probe and the append can use it.
    if not os.path.exists(output_file):
        Document().save(output_file)
        print(f"文件已創建：{output_file}")
    else:
        print("文件已打開")

    retry_count = 3
    for _ in range(retry_count):
        if is_file_locked(output_file):
            print("檢測到文件被占用，嘗試關閉Word文檔...")
            if close_word_document(output_file):
                time.sleep(0.5)  # give the OS time to release the file
                continue
            print("錯誤：文件被其他程序占用，請手動關閉后重試！")
            break

        # Initialised up front so cleanup never depends on how far we got.
        temp_tex_file_name = None
        temp_docx_file_name = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".tex") as temp_tex_file:
                latex_content = generate_latex_content(user_formula)
                temp_tex_file.write(latex_content.encode('utf-8'))
                temp_tex_file_name = temp_tex_file.name

            # Only a unique path is needed here; pandoc writes the content.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as temp_docx_file:
                temp_docx_file_name = temp_docx_file.name

            # Convert LaTeX to Word.
            pypandoc.convert_file(
                temp_tex_file_name, 'docx',
                outputfile=temp_docx_file_name, extra_args=['--mathjax']
            )

            # Open the target (or start a new one) and append the new content.
            target_doc = Document(output_file) if os.path.exists(output_file) else Document()
            temp_doc = Document(temp_docx_file_name)
            _append_body_elements(target_doc, temp_doc)

            target_doc.save(output_file)
            print(f"內容已成功追加至：{output_file}")

            # Open the result with the associated application.
            os.startfile(output_file)
            break

        except PermissionError:
            print("文件權限錯誤，請檢查文件是否被其他程序占用")
            break
        except Exception as e:
            print(f"操作失敗：{e}")
            break
        finally:
            # Always remove the temporary files, whatever happened above.
            for temp_path in (temp_tex_file_name, temp_docx_file_name):
                if temp_path and os.path.exists(temp_path):
                    os.unlink(temp_path)
    else:
        # Every attempt found the file locked again after closing it.
        print("重試次數已達上限，請檢查文件狀態")

if __name__ == '__main__':
    # Sample formula to append (LaTeX math, without the $ delimiters).
    user_formula = r"\frac{\sqrt{x^2 + y^2}}{z}"
    # Word document that receives the formula.
    output_file = r"C:\Users\xueshifeng\Desktop\18.docx"

    doc_creat(user_formula=user_formula, output_file=output_file)

以上就是Python使用pypandoc將markdown文件和LaTeX公式轉為word的詳細內容，更多關于Python pypandoc格式轉換的資料請關注腳本之家其它相關文章！

您可能感興趣的文章: