Java實現word,pdf轉html并保留格式
更新時間:2025年07月16日 10:06:45 作者:xyyf
這篇文章主要為大家詳細介紹了如何使用Java實現將word,pdf轉換為html并保留格式,文中的示例代碼講解詳細,感興趣的小伙伴可以了解下
一、word轉html
依賴:
<properties>
<poi.version>5.2.3</poi.version>
<xhtml.version>2.0.4</xhtml.version>
</properties>
<!--word轉html-->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>${poi.version}</version>
</dependency>
<!--word轉html-->
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
<version>${xhtml.version}</version>
</dependency>
<!--處理office文檔表格相關 2007+版-->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>${poi.version}</version>
</dependency>
<!--處理office文檔表格相關 2003版-->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>${poi.version}</version>
</dependency>
代碼:
import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.commons.codec.binary.Base64;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.net.URL;
public class WordUtil {
/**
 * Downloads a Word document from a URL and converts it to an HTML string.
 *
 * <p>The converter is selected from {@code fileSuffix}: {@code .docx} is handled by
 * {@link #word2007ToHtml(InputStream)} and {@code .doc} by
 * {@link #word2003ToHtml(InputStream)}. The suffix is compared case-insensitively.
 *
 * @param fileUrl URL of the Word document; it is opened with {@link URL#openStream()}
 * @param fileSuffix file extension including the leading dot, e.g. {@code ".docx"}
 * @return the HTML produced by the selected converter
 * @throws RuntimeException if the suffix is not supported or the conversion fails;
 *     the original exception is attached as the cause
 * @throws Exception if the URL is malformed or the stream cannot be opened
 */
public static String wordToHtml(String fileUrl, String fileSuffix) throws Exception {
    URL url = new URL(fileUrl);
    try (InputStream inputStream = url.openStream()) {
        // Constant-first, case-insensitive comparison: accepts mixed case such as ".Docx"
        // and treats a null suffix as unsupported instead of failing with an NPE.
        if (".docx".equalsIgnoreCase(fileSuffix)) {
            return word2007ToHtml(inputStream);
        } else if (".doc".equalsIgnoreCase(fileSuffix)) {
            return word2003ToHtml(inputStream);
        } else {
            throw new RuntimeException("錯誤的文件后綴");
        }
    } catch (RuntimeException e) {
        // Callers still receive a plain RuntimeException with the same message,
        // but the underlying exception (and its stack trace) is no longer lost.
        throw new RuntimeException(e.getMessage(), e);
    }
}
/**
 * Converts a Word 2007+ ({@code .docx}) document to an HTML string.
 *
 * <p>Unused styles are kept, the output is generated in fragment mode, and images are
 * embedded as base64 through {@link Base64EmbedImgManager}.
 *
 * @param inputStream stream positioned at the start of the {@code .docx} content;
 *     this method does not close it
 * @return the generated HTML
 * @throws RuntimeException if reading or converting the document fails; the original
 *     exception is attached as the cause
 */
public static String word2007ToHtml(InputStream inputStream) {
    try (ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
         XWPFDocument docxDocument = new XWPFDocument(inputStream)) {
        XHTMLOptions options = XHTMLOptions.create();
        // Keep style definitions even when nothing in the document uses them
        options.setIgnoreStylesIfUnused(false);
        // Fragment mode (original note: the output is wrapped in a <div> element)
        options.setFragment(true);
        // Embed images as base64 instead of writing them to separate files
        options.setImageManager(new Base64EmbedImgManager());
        // Convert to HTML
        XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
        // NOTE(review): toString() decodes with the platform default charset;
        // confirm this matches the encoding the converter writes.
        return htmlStream.toString();
    } catch (Exception e) {
        System.out.println("Word轉Html過程出現異常!");
        // Attach the cause so the failure can be diagnosed from the stack trace
        throw new RuntimeException(e.getMessage(), e);
    }
}
/**
 * Converts a Word 97-2003 ({@code .doc}) document to an HTML string.
 *
 * <p>Pictures are embedded as base64 {@code data:} URIs. The resulting DOM is serialised
 * with the {@code html} output method, UTF-8 encoding and indentation enabled.
 *
 * @param inputStream stream positioned at the start of the {@code .doc} content;
 *     this method does not close it
 * @return the generated HTML
 * @throws RuntimeException if reading or converting the document fails; the original
 *     exception is attached as the cause
 * @throws Exception declared for backward compatibility; failures inside the method are
 *     rethrown as {@link RuntimeException}
 */
public static String word2003ToHtml(InputStream inputStream) throws Exception {
    try (StringWriter writer = new StringWriter();
         HWPFDocument document = new HWPFDocument(inputStream)) {
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        // Embed pictures as base64 data URIs, labelled with the picture's own MIME type
        // rather than always claiming PNG. Falls back to image/png when the type is
        // missing or reported as unknown.
        wordToHtmlConverter.setPicturesManager((bytes, pictureType, s, v, v1) -> {
            String mime = pictureType == null ? null : pictureType.getMime();
            if (mime == null || mime.isEmpty() || "image/unknown".equals(mime)) {
                mime = "image/png";
            }
            return "data:" + mime + ";base64," + Base64.encodeBase64String(bytes);
        });
        wordToHtmlConverter.processDocument(document);
        org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();
        DOMSource domSource = new DOMSource(htmlDocument);
        TransformerFactory factory = TransformerFactory.newInstance();
        Transformer serializer = factory.newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(domSource, new StreamResult(writer));
        return writer.toString();
    } catch (Exception e) {
        System.out.println("Word轉Html過程出現異常!");
        // Attach the cause so the failure can be diagnosed from the stack trace
        throw new RuntimeException(e.getMessage(), e);
    }
}
}
二、pdf轉html
依賴:
<dependency>
<groupId>net.sf.cssbox</groupId>
<artifactId>pdf2dom</artifactId>
</dependency>
<dependency>
<groupId>net.mabboud.fontverter</groupId>
<artifactId>FontVerter</artifactId>
</dependency>
<dependency>
<groupId>org.reflections</groupId>
<artifactId>reflections</artifactId>
</dependency>
<!--pdf轉文本-->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
</dependency>
代碼:
import org.apache.pdfbox.pdmodel.PDDocument;
import org.fit.pdfdom.PDFDomTree;
import java.io.*;
import java.net.URL;
public class PDFUtil {
/**
 * Downloads a PDF from a URL and converts it to an HTML string.
 *
 * @param fileUrl URL of the PDF; it is opened with {@link URL#openStream()}
 * @return the HTML returned by {@link #pdfToHtml(InputStream)}
 * @throws IOException if the URL is malformed, the stream cannot be read, or the
 *     conversion fails; non-I/O failures are wrapped with the original as the cause
 */
public static String pdfToHtml(String fileUrl) throws IOException {
    URL url = new URL(fileUrl);
    try (InputStream inputStream = url.openStream()) {
        return pdfToHtml(inputStream);
    } catch (IOException e) {
        // Already the declared type: rethrow untouched so subclass and stack trace survive
        throw e;
    } catch (Exception e) {
        throw new IOException(e.getMessage(), e);
    }
}
/**
 * Converts a PDF read from a stream to an HTML string.
 *
 * <p>The HTML is rendered straight into memory. Earlier versions wrote it to a fixed
 * {@code mypdf.html} file in the working directory and read it back, which collided
 * between concurrent callers, left the file behind, decoded the UTF-8 file with the
 * platform charset, and returned {@code null} when the read-back failed. Line endings
 * are now returned exactly as the serializer writes them.
 *
 * @param inputStream stream positioned at the start of the PDF content; this method
 *     does not close it
 * @return the generated HTML, never {@code null}
 * @throws IOException if the PDF cannot be loaded or rendered
 */
public static String pdfToHtml(InputStream inputStream) throws IOException {
    // try-with-resources closes the document even when rendering throws
    try (PDDocument document = PDDocument.load(inputStream);
         StringWriter writer = new StringWriter()) {
        new PDFDomTree().writeText(document, writer);
        return writer.toString();
    }
}
}
三、方法補充
Java實現word轉html
1.引入maven依賴
<properties>
<poi.version>5.2.3</poi.version>
<xhtml.version>2.0.4</xhtml.version>
</properties>
<!--word轉html-->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>${poi.version}</version>
</dependency>
<!--word轉html-->
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
<version>${xhtml.version}</version>
</dependency>
<!--處理office文檔表格相關 2007+版-->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>${poi.version}</version>
</dependency>
<!--處理office文檔表格相關 2003版-->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>${poi.version}</version>
</dependency>
2.Java代碼
/**
 * Converts a Word 2007+ ({@code .docx}) file to an HTML string.
 *
 * <p>Unused styles are kept, the output is generated in fragment mode, and images are
 * embedded as base64 through {@code Base64EmbedImgManager}.
 *
 * @param filePath path of the {@code .docx} file
 * @return the generated HTML, or {@code null} if the file cannot be read or converted
 *     (the failure is logged, not thrown)
 */
public static String docxToHtml(String filePath) {
    // The file stream is its own resource so it is closed even when the
    // XWPFDocument constructor throws.
    try (java.io.InputStream docxStream = Files.newInputStream(Paths.get(filePath));
         ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
         XWPFDocument docxDocument = new XWPFDocument(docxStream)) {
        XHTMLOptions options = XHTMLOptions.create();
        // Keep style definitions even when nothing in the document uses them
        options.setIgnoreStylesIfUnused(false);
        // Fragment mode (original note: the output is wrapped in a <div> element)
        options.setFragment(true);
        // Embed images as base64 instead of writing them to separate files
        options.setImageManager(new Base64EmbedImgManager());
        // Convert to HTML
        XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
        // NOTE(review): toString() decodes with the platform default charset;
        // confirm this matches the encoding the converter writes.
        return htmlStream.toString();
    } catch (Exception e) {
        log.error("Word轉Html過程出現異常!", e);
    }
    return null;
}
/**
 * Converts a Word 97-2003 ({@code .doc}) file to an HTML string.
 *
 * <p>Pictures are embedded as base64 {@code data:} URIs. The resulting DOM is serialised
 * with the {@code html} output method, UTF-8 encoding and indentation enabled.
 *
 * @param filePath path of the {@code .doc} file
 * @return the generated HTML, or {@code null} if the file cannot be read or converted
 *     (the failure is logged, not thrown)
 */
public static String docToHtml(String filePath) {
    // The file stream is its own resource so it is closed even when the
    // HWPFDocument constructor throws.
    try (java.io.InputStream docStream = Files.newInputStream(new File(filePath).toPath());
         StringWriter writer = new StringWriter();
         HWPFDocument document = new HWPFDocument(docStream)) {
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        // Embed pictures as base64 data URIs, labelled with the picture's own MIME type
        // rather than always claiming PNG. Falls back to image/png when the type is
        // missing or reported as unknown.
        wordToHtmlConverter.setPicturesManager((bytes, pictureType, s, v, v1) -> {
            String mime = pictureType == null ? null : pictureType.getMime();
            if (mime == null || mime.isEmpty() || "image/unknown".equals(mime)) {
                mime = "image/png";
            }
            return "data:" + mime + ";base64," + Base64.encodeBase64String(bytes);
        });
        wordToHtmlConverter.processDocument(document);
        org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();
        DOMSource domSource = new DOMSource(htmlDocument);
        TransformerFactory factory = TransformerFactory.newInstance();
        Transformer serializer = factory.newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(domSource, new StreamResult(writer));
        return writer.toString();
    } catch (Exception e) {
        log.error("Word轉Html過程出現異常!", e);
    }
    return null;
}
/**
 * Converts a Word file to HTML, choosing the converter from the file extension.
 *
 * <p>{@code doc} is passed to {@code docToHtml} and {@code docx} to {@code docxToHtml};
 * the extension is compared case-insensitively. A {@code null} path, a path without an
 * extension, or any other extension is logged as unsupported.
 *
 * @param filePath local path of the Word file
 * @return the converted HTML, or {@code null} if the format is unsupported or the
 *     selected converter returns {@code null}
 */
public static String autoWord2Html(String filePath) {
    // Without a dot there is no extension; previously the whole path was used as the
    // suffix, so a file literally named "doc" was treated as a .doc document.
    int dotIndex = filePath == null ? -1 : filePath.lastIndexOf('.');
    String suffix = dotIndex < 0 ? "" : filePath.substring(dotIndex + 1);
    if ("doc".equalsIgnoreCase(suffix)) {
        return docToHtml(filePath);
    } else if ("docx".equalsIgnoreCase(suffix)) {
        return docxToHtml(filePath);
    } else {
        log.info("文件格式錯誤,只支持Docx和Doc格式的文檔!");
        return null;
    }
}
使用Java實現PDF到HTML的轉換
引入以下依賴
<dependency>
<groupId>net.sf.cssbox</groupId>
<artifactId>pdf2dom</artifactId>
<version>2.0.3</version>
</dependency>
<dependency>
<groupId>net.mabboud.fontverter</groupId>
<artifactId>FontVerter</artifactId>
<version>1.2.22</version> <!-- 請根據(jù)需要使用最新版本 -->
</dependency>
<dependency>
<groupId>org.reflections</groupId>
<artifactId>reflections</artifactId>
<version>0.10.2</version> <!-- 請根據(jù)需要使用最新版本 -->
</dependency>
<!--pdf轉文本-->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.24</version>
</dependency>
實現關鍵代碼
File file = new File(pdfUrl);
String localPdfFilePath = 要解析的PDF文件路徑(本地)+ file.getName();
String newPdfFilePath = 截取PDF后生成的PDF文件路徑+ file.getName();
String outFilePath = 生成的HTML文件.html";
String pdfContent = "";
PDDocument pdfDocument = PDDocument.load(new File(localPdfFilePath));
// 檢查文檔中是否有頁面
if (pdfDocument.getNumberOfPages() > 0) {
// 移除第一頁
pdfDocument.removePage(0);
}
// 保存更改后的PDF到新文件
pdfDocument.save(new File(newPdfFilePath));
System.out.println("第一頁已被移除,新PDF保存在: " + newPdfFilePath);
pdfDocument.close();
// 轉換成html格式文件
PDDocument document = PDDocument.load(new File(newPdfFilePath));
Writer writer = new PrintWriter(outFilePath, "UTF-8");
new PDFDomTree().writeText(document, writer);
writer.close();
document.close();
// Read the generated HTML file back into pdfContent.
// The file was written as UTF-8, so it is decoded as UTF-8 here; FileReader would use
// the platform default charset and corrupt non-ASCII text on non-UTF-8 systems.
try (BufferedReader reader = new BufferedReader(
        new java.io.InputStreamReader(new java.io.FileInputStream(outFilePath), "UTF-8"))) {
    StringBuilder htmlContent = new StringBuilder();
    String line;
    while ((line = reader.readLine()) != null) {
        // Append each line followed by a newline
        htmlContent.append(line).append("\n");
    }
    pdfContent = String.valueOf(htmlContent);
} catch (IOException e) {
    e.printStackTrace();
    System.err.println("讀取 HTML 文件時出錯。");
}
到此這篇關于Java實現word,pdf轉html并保留格式的文章就介紹到這了,更多相關Java word,pdf轉html內容請搜索腳本之家以前的文章或繼續瀏覽下面的相關文章,希望大家以后多多支持腳本之家!
相關文章
Java docx4j高效處理Word文檔的實戰指南
對于需要在Java應用程序中生成、修改或處理Word文檔的開發者來說,docx4j是一個強大而專業的選擇,下面我們就來看看docx4j的具體使用吧 2025-07-07
java中lombok的@Data引發問題詳解
這篇文章主要給大家介紹了關于java中lombok的@Data引發問題的相關資料,文中通過圖文介紹的非常詳細,對大家的學習或者工作具有一定的參考學習價值,需要的朋友們下面隨著小編來一起學習學習吧 2020-09-09
JVM性能調優實戰:讓你的IntelliJ Idea縱享絲滑
這篇文章主要介紹了JVM性能調優實戰:讓你的IntelliJ Idea縱享絲滑的相關資料,本文給大家介紹的非常詳細,對大家的學習或工作具有一定的參考借鑒價值,需要的朋友可以參考下 2021-01-01
Java基于LoadingCache實現本地緩存的示例代碼
本文主要介紹了Java基于LoadingCache實現本地緩存的示例代碼,文中通過示例代碼介紹的非常詳細,具有一定的參考價值,感興趣的小伙伴們可以參考一下 2022-01-01

