Introduction:
Recently, our company has been building a project for education, training, learning, and online examinations. I work mainly on the online course module: course categories, courses, courseware creation, online learning, and learning statistics. Courseware comes in many types, such as video, audio, illustrated text, external links, and documents. Document courseware raised a problem: how to display and study it on a web page. Because the system has to record which courses are studied, who studies them, and for how long, we cannot let users download documents and study them locally as in the traditional approach, since that would happen outside the system's control. The final solution is to convert each document into an HTML file when the document courseware is uploaded, so that it can be browsed and studied directly on a web page.
The following sections cover converting Word, PDF, and TXT files to HTML.
1: Converting Word to HTML in Java
1: Add the dependencies
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.xdocreport.document</artifactId>
    <version>1.0.5</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    <version>1.0.5</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.12</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.12</version>
</dependency>
2: code demo
package com.svse.controller;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.core.IURIResolver;
import org.apache.poi.xwpf.converter.core.IXWPFConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

/**
 * Converts Word documents stored under {@link #STORAGEPATH} to HTML files.
 *
 * <p>Both converters read the source document from {@link #STORAGEPATH}, write the
 * generated HTML back into the same directory with a timestamp prefix
 * (see {@link #getRandomNum()}), and rewrite picture references to URLs of the form
 * {@code http://IP:PORT//uploadFile/...}.
 */
public class TestWordToHtml {

    public static final String STORAGEPATH = "C://works//files//";
    public static final String IP = "192.168.30.222";
    public static final String PORT = "8010";

    /** URL prefix under which the generated files are referenced. */
    private static final String UPLOAD_URL = "http://" + IP + ":" + PORT + "//uploadFile/";

    public static void main(String[] args) throws IOException, TransformerException, ParserConfigurationException {
        TestWordToHtml wt = new TestWordToHtml();
        //wt.Word2003ToHtml("Textual research on Oracle Bone Inscriptions.doc");
        wt.Word2007ToHtml("Textual research on Oracle Bone Inscriptions.docx");
    }

    /**
     * Converts a Word 2003 ({@code .doc}) file to HTML.
     *
     * <p>Pictures embedded in the document are saved to {@code STORAGEPATH + "fileImage/"};
     * the HTML is written to {@code STORAGEPATH + <timestamp> + <base name> + "2003.html"}.
     *
     * @param fileName name of the {@code .doc} file, relative to {@link #STORAGEPATH}
     * @throws IOException if the source cannot be read or the HTML cannot be written
     * @throws TransformerException if serializing the HTML DOM fails
     * @throws ParserConfigurationException if no DOM document builder can be created
     */
    public void Word2003ToHtml(String fileName) throws IOException, TransformerException, ParserConfigurationException {
        final String imagePath = STORAGEPATH + "fileImage/";
        final String prefix = getRandomNum();
        final String htmlName = baseName(fileName) + "2003.html";

        final Document htmlDocument;
        // The input stream stays open until the document has been fully processed and is
        // then closed even when parsing fails.
        try (InputStream input = new FileInputStream(new File(STORAGEPATH + fileName))) {
            HWPFDocument wordDocument = new HWPFDocument(input);
            WordToHtmlConverter converter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            converter.setPicturesManager(newPictureSaver(imagePath, prefix));
            converter.processDocument(wordDocument);
            htmlDocument = converter.getDocument();
        }

        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");

        try (OutputStream outStream = new FileOutputStream(new File(STORAGEPATH + prefix + htmlName))) {
            serializer.transform(new DOMSource(htmlDocument), new StreamResult(outStream));
        }

        System.out.println("generate html File path:" + UPLOAD_URL + prefix + htmlName);
    }

    /**
     * Converts a Word 2007+ ({@code .docx}) file to HTML.
     *
     * <p>Pictures are extracted into the directory {@code STORAGEPATH + <timestamp>};
     * the HTML is written to {@code STORAGEPATH + <timestamp> + <base name> + "2007.html"}.
     * A missing file or a non-{@code .docx} name only prints a message. Conversion
     * failures are printed rather than propagated, as before.
     *
     * @param fileName name of the {@code .docx} file, relative to {@link #STORAGEPATH}
     * @throws IOException declared for compatibility with existing callers
     */
    public void Word2007ToHtml(String fileName) throws IOException {
        final String prefix = getRandomNum();
        final String filepath = STORAGEPATH + prefix;
        final String htmlName = baseName(fileName) + "2007.html";

        File source = new File(STORAGEPATH + fileName);
        if (!source.exists()) {
            System.out.println("Sorry File does not Exists!");
            return;
        }
        // Case-insensitive so that names such as "Report.Docx" are accepted too.
        if (!source.getName().toLowerCase(Locale.ROOT).endsWith(".docx")) {
            System.out.println("Enter only MS Office 2007+ files");
            return;
        }

        try (InputStream in = new FileInputStream(source)) {
            // 1) Load the Word document.
            XWPFDocument document = new XWPFDocument(in);

            // 2) Configure the XHTML conversion: where pictures are extracted to and
            //    which URL the generated HTML uses to reference them.
            XHTMLOptions options = XHTMLOptions.create();
            options.setExtractor(new FileImageExtractor(new File(filepath)));
            options.URIResolver(new IURIResolver() {
                @Override
                public String resolve(String uri) {
                    return UPLOAD_URL + prefix + "/" + uri;
                }
            });
            options.setIgnoreStylesIfUnused(false);
            options.setFragment(true);

            // 3) Convert the document; the output file is only created once the
            //    document has been loaded successfully.
            try (OutputStream out = new FileOutputStream(new File(filepath + htmlName))) {
                IXWPFConverter<XHTMLOptions> converter = XHTMLConverter.getInstance();
                converter.convert(document, out, options);
            }
            System.out.println("html route:" + UPLOAD_URL + prefix + htmlName);
        } catch (Exception e) {
            // Best-effort: report the failure without propagating it.
            e.printStackTrace();
        }
    }

    /**
     * Generates a timestamp string in {@code yyyyMMddHHmmss} format that is used as a
     * prefix for generated files. The resolution is one second, so two calls within the
     * same second return the same value.
     *
     * @return the current time formatted as {@code yyyyMMddHHmmss}
     */
    public static String getRandomNum() {
        // A new formatter per call: SimpleDateFormat must not be shared between threads.
        return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
    }

    /** Returns {@code fileName} without its last extension, or unchanged if it has none. */
    private static String baseName(String fileName) {
        int dot = fileName.lastIndexOf('.');
        return dot < 0 ? fileName : fileName.substring(0, dot);
    }

    /** Creates a pictures manager that stores each picture under {@code imagePath} and returns its URL. */
    private static PicturesManager newPictureSaver(final String imagePath, final String prefix) {
        return new PicturesManager() {
            @Override
            public String savePicture(byte[] content, PictureType pictureType, String suggestedName,
                    float widthInches, float heightInches) {
                File imageDir = new File(imagePath);
                if (!imageDir.exists()) { // create the picture directory on first use
                    imageDir.mkdirs();
                }
                try (OutputStream os = new FileOutputStream(new File(imagePath + prefix + suggestedName))) {
                    os.write(content);
                } catch (IOException e) {
                    // Best-effort: a picture that cannot be saved must not abort the conversion.
                    e.printStackTrace();
                }
                return UPLOAD_URL + "fileImage/" + prefix + suggestedName;
            }
        };
    }
}
2: Converting PDF to HTML in Java
1: Add the dependencies
<dependency>
    <groupId>net.sf.cssbox</groupId>
    <artifactId>pdf2dom</artifactId>
    <version>1.7</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.12</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox-tools</artifactId>
    <version>2.0.12</version>
</dependency>
2: Code Demo
1 public class PdfToHtml { 2 3 /* 4 pdf Transform html 5 */ 6 public void pdfToHtmlTest(String inPdfPath,String outputHtmlPath) { 7 // String outputPath = "C:\\works\\files\\ZSQ Secret knowledge test question bank.html"; 8 9 //try() Written in()It will automatically shut down the flow 10 try{ 11 BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(outputHtmlPath)),"utf-8")); 12 //load PDF file 13 //PDDocument document = PDDocument.load(bytes); 14 PDDocument document = PDDocument.load(new File(inPdfPath)); 15 PDFDomTree pdfDomTree = new PDFDomTree(); 16 pdfDomTree.writeText(document,out); 17 } catch (Exception e) { 18 e.printStackTrace(); 19 } 20 } 21 22 public static void main(String[] args) throws IOException { 23 PdfToHtml ph=new PdfToHtml(); 24 String pdfPath="C:\\works\\files\\Administrative attendance system of Wuhan Research Center.pdf"; 25 String outputPath="C:\\works\\files\\Administrative attendance system of Wuhan Research Center.html"; 26 ph.pdfToHtmlTest(pdfPath,outputPath); 27 } 28 29 }
3: Converting TXT to HTML in Java
1 /* 2 * txt Document to html 3 filePath:txt Original file path 4 htmlPosition:html path generated after conversion 5 */ 6 public static void txtToHtml(String filePath, String htmlPosition) { 7 try { 8 //String encoding = "GBK"; 9 File file = new File(filePath); 10 if (file.isFile() && file.exists()) { // Judge whether the file exists 11 InputStreamReader read = new InputStreamReader(new FileInputStream(file), "GBK"); 12 // Considering the encoding format 13 BufferedReader bufferedReader = new BufferedReader(read); 14 // Write file 15 FileOutputStream fos = new FileOutputStream(new File(htmlPosition)); 16 OutputStreamWriter osw = new OutputStreamWriter(fos, "GBK"); 17 BufferedWriter bw = new BufferedWriter(osw); 18 String lineTxt = null; 19 while ((lineTxt = bufferedReader.readLine()) != null) { 20 bw.write("   "+lineTxt + "</br>"); 21 } 22 bw.close(); 23 osw.close(); 24 fos.close(); 25 read.close(); 26 } else { 27 System.out.println("The specified file could not be found"); 28 } 29 } catch (Exception e) { 30 System.out.println("Error reading file content"); 31 e.printStackTrace(); 32 } 33 }