使用jsoup和Jacob将html代码转化成word文档，保留全部格式且不乱码（附完整代码）

2024-03-24 16:29:04
废话不说，直接代码
Jacob可以从官网下载，下载后把其中的jacob dll放到windows/system32目录（或项目所用JRE的bin目录）下；jsoup只需要引入一个jar包即可。
package com.sinosoft.util;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.jacob.activeX.ActiveXComponent;
import com.jacob.com.Dispatch;
import com.jacob.com.Variant;

/**
 * Converts between HTML and Word documents.
 *
 * <p>jsoup is used to normalise the HTML (table borders, embedded images, Office-specific tags),
 * and Jacob drives a locally installed Microsoft Word over COM to perform the actual conversion,
 * so the output is whatever Word itself produces.
 *
 * <p>Setup note: the Jacob native dll must be on the library path, e.g. copied into
 * {@code windows/system32} or into the {@code bin} directory of the JRE in use.
 *
 * <p>All files handled by one instance share the same directory ({@code wordPath}) and base name
 * ({@code docName}): {@code <docName>.html}, {@code <docName>.docx} and the image directory
 * {@code <docName>.files}.
 */
public class WordJacob {

	/** {@code WdSaveFormat} value passed to {@code SaveAs} to write HTML (wdFormatHTML). */
	private static final int WORD_HTML = 8;

	/** {@code WdSaveFormat} value passed to {@code SaveAs} to write .docx (wdFormatXMLDocument). */
	private static final int HTML_WORD = 12;

	/** {@code WdSaveOptions} value passed to {@code Quit} so Word never prompts to save. */
	private static final int WORD_DO_NOT_SAVE_CHANGES = 0;

	/** Id of the layout-only table that must not receive borders. */
	private static final String PRINT_TABLE_ID = "printTable";

	/** Directory in which the html, docx and image files are stored. */
	private final String wordPath;

	/** Base name of the document, without any file extension. */
	private final String docName;

	/**
	 * @param wordPath
	 *            directory holding the documents; a trailing separator is optional
	 * @param docName
	 *            base file name without extension
	 */
	public WordJacob(String wordPath, String docName) {
		this.wordPath = wordPath;
		this.docName = docName;
	}

	/**
	 * Demo entry point: reads an HTML file, prepares it for Word and converts it to .docx.
	 */
	public static void main(String[] args) throws Exception {
		String wordPath = "D:/111/";
		// The content is simply the text of an existing html file.
		String content = ComFile.readTxtFile("d://report_sj_bn.html", "utf-8");
		String docName = "333";
		// Prepend the Word-specific head so the result opens in print layout view.
		String html = WordJacob.setHead(content);
		WordJacob word = new WordJacob(wordPath, docName);
		org.jsoup.nodes.Document document = Jsoup.parse(html);
		// Normalise the markup and extract embedded base64 images into <docName>.files.
		word.convertReportHtml(document);
		// Write the html to exactly the location htmlToWord() will read from.
		word.writehtml(word.pathFor(".html"), document.toString());
		word.htmlToWord();
	}

	/**
	 * Converts {@code <wordPath>/<docName>.docx} to {@code <wordPath>/<docName>.html} by opening
	 * the document read-only in Word and saving it in HTML format.
	 *
	 * @throws Exception
	 *             if Word cannot be started or the conversion fails
	 */
	public void wordToHtml() throws Exception {
		String docfile = pathFor(".docx");
		String htmlfile = pathFor(".html");
		ActiveXComponent app = new ActiveXComponent("Word.Application");
		try {
			// Keep the Word window hidden while it works.
			app.setProperty("Visible", new Variant(false));
			Dispatch docs = app.getProperty("Documents").toDispatch();
			// Open(FileName, ConfirmConversions=false, ReadOnly=true)
			Dispatch doc = Dispatch.invoke(docs, "Open", Dispatch.Method,
					new Object[] { docfile, new Variant(false), new Variant(true) }, new int[1]).toDispatch();
			Dispatch.invoke(doc, "SaveAs", Dispatch.Method, new Object[] { htmlfile, new Variant(WORD_HTML) },
					new int[1]);
			Dispatch.call(doc, "Close", new Variant(false));
		} finally {
			quitWord(app);
		}
	}

	/**
	 * Converts {@code <wordPath>/<docName>.html} to {@code <wordPath>/<docName>.docx} by inserting
	 * the html file into a new Word document and saving it in .docx format.
	 *
	 * @throws Exception
	 *             if Word cannot be started or the conversion fails
	 */
	public void htmlToWord() throws Exception {
		String wordFile = pathFor(".docx");
		String htmlFile = pathFor(".html");
		ActiveXComponent app = new ActiveXComponent("Word.Application");
		try {
			app.setProperty("Visible", new Variant(false));
			Dispatch documents = app.getProperty("Documents").toDispatch();
			Dispatch wordDoc = Dispatch.invoke(documents, "Add", Dispatch.Method, new Object[0], new int[1])
					.toDispatch();
			// InsertFile(FileName, Range, ConfirmConversions, Link, Attachment)
			Dispatch.invoke(app.getProperty("Selection").toDispatch(), "InsertFile", Dispatch.Method,
					new Object[] { htmlFile, "", new Variant(false), new Variant(false), new Variant(false) },
					new int[3]);
			Dispatch.invoke(wordDoc, "SaveAs", Dispatch.Method, new Object[] { wordFile, new Variant(HTML_WORD) },
					new int[1]);
			Dispatch.call(wordDoc, "Close", new Variant(false));
		} finally {
			quitWord(app);
		}
	}

	/**
	 * Recursively normalises an HTML tree so that Word renders it as intended.
	 *
	 * <ul>
	 * <li>the element with id {@code customerImg} is removed;</li>
	 * <li>tables and their {@code th}/{@code td} cells get explicit borders, except for the
	 * layout table with id {@code printTable};</li>
	 * <li>{@code img} elements with a {@code data:} URI are written to the
	 * {@code <docName>.files} directory and re-pointed at that file; Highcharts SVG images are
	 * removed;</li>
	 * <li>{@code img} and {@code meta} get an empty text child so they are serialised with an
	 * explicit closing tag;</li>
	 * <li>{@code o:p} elements are removed.</li>
	 * </ul>
	 *
	 * @param element
	 *            root of the subtree to process (typically the parsed document)
	 * @throws Exception
	 *             if an embedded image is malformed or cannot be saved
	 */
	public void convertReportHtml(Element element) throws Exception {
		if ("customerImg".equals(element.id())) {
			element.remove();
			return;
		}

		String tagName = element.tagName();
		if ("table".equals(tagName)) {
			// printTable is only used for layout when exporting to Word; it must stay borderless.
			if (!PRINT_TABLE_ID.equals(element.id())) {
				appendStyle(element, "width: 100%; border:1px solid #CCC; border-collapse:collapse;");
			}
		} else if ("th".equals(tagName)) {
			if (!isInsidePrintTable(element)) {
				appendStyle(element, "border:1px solid #CCC;");
			}
		} else if ("td".equals(tagName)) {
			if (!isInsidePrintTable(element)) {
				appendStyle(element, "border:1px solid #EEE;");
			}
		} else if ("img".equals(tagName)) {
			if (!convertImage(element)) {
				return;
			}
		} else if ("meta".equals(tagName)) {
			// An empty text child makes jsoup emit <meta ...></meta> instead of a void tag.
			element.appendText("");
		} else if ("o:p".equals(tagName)) {
			// Detached nodes need no further processing.
			element.remove();
			return;
		}

		// children() returns a snapshot, so removing nodes while iterating is safe.
		for (Element child : element.children()) {
			convertReportHtml(child);
		}
	}

	/**
	 * Handles one {@code img} element.
	 *
	 * @return {@code false} if the image was removed from the tree, {@code true} otherwise
	 */
	private boolean convertImage(Element img) throws Exception {
		String src = img.attr("src");
		// Highcharts charts are dropped; check this first so no file is written for them.
		if (src.contains("svg") && src.contains("version")) {
			img.remove();
			return false;
		}
		if (src.startsWith("data:")) {
			int comma = src.indexOf(',');
			if (comma < 0 || comma == src.length() - 1) {
				throw new IllegalArgumentException("Malformed data URI in <img> src: missing base64 payload");
			}
			String imgName = ComStr.generateShortUuid() + ".png";
			// Only the part after the comma is the base64 payload.
			saveWordImage(src.substring(comma + 1), imgName);
			// Relative reference, resolved by Word against the html file's directory.
			img.attr("src", docName + ".files" + "/" + imgName);
		}
		// An empty text child makes jsoup emit <img ...></img> instead of a void tag.
		img.appendText("");
		return true;
	}

	/**
	 * Appends CSS declarations to an element's inline style, terminating any existing
	 * declaration first so the two never run together.
	 */
	private static void appendStyle(Element element, String declarations) {
		String existing = element.attr("style").trim();
		if (existing.isEmpty()) {
			element.attr("style", declarations);
			return;
		}
		if (!existing.endsWith(";")) {
			existing += ";";
		}
		element.attr("style", existing + " " + declarations);
	}

	/**
	 * Tells whether the nearest enclosing {@code table} of a cell is the layout table.
	 * Returns {@code false} when the cell has no enclosing table at all.
	 */
	private static boolean isInsidePrintTable(Element cell) {
		for (Element ancestor = cell.parent(); ancestor != null; ancestor = ancestor.parent()) {
			if ("table".equals(ancestor.tagName())) {
				return PRINT_TABLE_ID.equals(ancestor.id());
			}
		}
		return false;
	}

	/**
	 * Decodes a base64 image into the {@code <docName>.files} directory.
	 *
	 * @throws Exception
	 *             if the image file could not be created
	 */
	private void saveWordImage(String base64Str, String filename) throws Exception {
		File f = ImageBase64Converter.convertBase64ToFile(base64Str, pathFor(".files"), filename);
		if (f == null) {
			throw new Exception("保存word文档图片失败!");
		}
	}

	/**
	 * Builds the absolute path of {@code <wordPath>/<docName><suffix>}. Every file location in
	 * this class goes through here so that writers and readers always agree, whether or not
	 * {@code wordPath} ends with a separator. The path is made absolute because Word resolves
	 * relative paths against its own working directory, not the JVM's.
	 */
	private String pathFor(String suffix) {
		String fileName = docName + suffix;
		File target = (wordPath == null || wordPath.isEmpty()) ? new File(fileName)
				: new File(wordPath, fileName);
		return target.getAbsolutePath();
	}

	/**
	 * Shuts Word down without saving, so a failed conversion can never leave the hidden Word
	 * instance blocked on a "save changes?" prompt.
	 */
	private static void quitWord(ActiveXComponent app) {
		app.invoke("Quit", new Variant[] { new Variant(WORD_DO_NOT_SAVE_CHANGES) });
	}

	/**
	 * Prepends a Word-specific document head so that the resulting file opens in print layout
	 * view.
	 *
	 * @param html
	 *            html to wrap; {@code null} is treated as empty
	 * @return the head followed by {@code html}
	 */
	public static String setHead(String html) {
		StringBuilder head = new StringBuilder();
		head.append("<!DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN' 'http://www.w3.org/TR/html4/loose.dtd'>");
		// Each fragment ends with a space so consecutive attributes stay separated.
		head.append("<html xmlns:v='urn:schemas-microsoft-com:vml' xmlns:o='urn:schemas-microsoft-com:office:office' ");
		head.append("xmlns:w='urn:schemas-microsoft-com:office:word' xmlns:m='http://schemas.microsoft.com/office/2004/12/omml' ");
		head.append("xmlns='http://www.w3.org/TR/REC-html40'><head>");
		head.append("<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />");
		head.append("<meta name=ProgId  content=Word.Document ><meta name=Generator  content='Microsoft Word 14' ><meta name=Originator  content='Microsoft Word 14' >");
		// <w:View>Print</w:View> is what selects print layout view.
		head.append("<!--[if gte mso 9]><xml><w:WordDocument><w:View>Print</w:View><w:TrackMoves>false</w:TrackMoves><w:TrackFormatting/><w:ValidateAgainstSchemas/><w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid><w:IgnoreMixedContent>false</w:IgnoreMixedContent><w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText><w:DoNotPromoteQF/><w:LidThemeOther>EN-US</w:LidThemeOther><w:LidThemeAsian>ZH-CN</w:LidThemeAsian><w:LidThemeComplexScript>X-NONE</w:LidThemeComplexScript><w:Compatibility><w:BreakWrappedTables/><w:SnapToGridInCell/><w:WrapTextWithPunct/><w:UseAsianBreakRules/><w:DontGrowAutofit/><w:SplitPgBreakAndParaMark/><w:DontVertAlignCellWithSp/><w:DontBreakConstrainedForcedTables/><w:DontVertAlignInTxbx/><w:Word11KerningPairs/><w:CachedColBalance/><w:UseFELayout/></w:Compatibility><w:BrowserLevel>MicrosoftInternetExplorer4</w:BrowserLevel><m:mathPr><m:mathFont m:val='Cambria Math'/><m:brkBin m:val='before'/><m:brkBinSub m:val='--'/><m:smallFrac m:val='off'/><m:dispDef/><m:lMargin m:val='0'/> <m:rMargin m:val='0'/><m:defJc m:val='centerGroup'/><m:wrapIndent m:val='1440'/><m:intLim m:val='subSup'/><m:naryLim m:val='undOvr'/></m:mathPr></w:WordDocument></xml><![endif]-->");
		if (html != null) {
			head.append(html);
		}
		return head.toString();
	}

	/**
	 * Writes text to a file as UTF-8, replacing any existing content.
	 *
	 * @param file
	 *            path of the file to write
	 * @param content
	 *            text to write
	 * @throws Exception
	 *             if the file cannot be opened or written
	 */
	public void writehtml(String file, String content) throws Exception {
		// try-with-resources closes both streams even if one close() fails.
		try (FileOutputStream fos = new FileOutputStream(file);
				OutputStreamWriter osw = new OutputStreamWriter(fos, "utf-8")) {
			osw.write(content);
			osw.flush();
		}
	}
}
码农公寓

相关文章