提交 96f0d035 authored 作者: 任建彩's avatar 任建彩

配置

上级 c222799b
......@@ -15,9 +15,19 @@
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
</properties>
<!-- e-iceblue artifact repository. Use HTTPS: Maven 3.8.1+ blocks plain-HTTP
     repositories by default, and HTTP downloads can be tampered with in transit. -->
<repositories>
    <repository>
        <id>com.e-iceblue</id>
        <url>https://repo.e-iceblue.cn/repository/maven-public/</url>
    </repository>
</repositories>
<dependencies>
<!-- Spire.Doc for Java: WordController imports com.spire.doc.Document and
     com.spire.doc.documents.Paragraph to read .docx paragraphs.
     NOTE(review): presumably resolved from the com.e-iceblue repository declared in this POM. -->
<dependency>
<groupId>e-iceblue</groupId>
<artifactId>spire.doc</artifactId>
<version>10.12.4</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
......
package com.zrqx.file.controller.backend;
import com.spire.doc.Document;
import com.spire.doc.documents.Paragraph;
import com.zrqx.core.response.CallBack;
import com.zrqx.file.model.po.FileInfo;
import com.zrqx.file.service.FileService;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
 * REST controller that stores an uploaded file and, for {@code .docx} and {@code .html}
 * uploads, returns the document's paragraphs as a flat list of labelled {@code <p>} strings.
 *
 * <p>Each returned string carries a level label (first/second/third level), an {@code id}
 * built from a per-level running counter and, for nested levels, a {@code pid} built from
 * the counter of the level above.
 */
@RestController
@RequestMapping("/word")
@Api(tags = "word解析")
public class WordController {

    /**
     * Matches a one-character string that is an ASCII digit. The paragraph marker that is
     * tested is always exactly one character long, so this is equivalent to the former
     * {@code ^.*[0-9]+.*$} check while being compiled only once.
     */
    private static final Pattern LEADING_DIGIT = Pattern.compile("[0-9]");

    /** Matches a one-character string in the range U+4E00..U+9FA5. */
    private static final Pattern LEADING_HAN = Pattern.compile("[\\u4e00-\\u9fa5]");

    @Autowired
    private FileService service;

    @Value("${file-root-path}")
    private String rootPath;

    /**
     * Stores the uploaded file through {@link FileService#uploadFile} and parses it.
     *
     * <p>Files whose original name ends in {@code docx} are handled by
     * {@link #fileWord(String)}, files ending in {@code html} by {@link #fileHtml(String)}
     * (the extension is compared case-insensitively). Any other upload yields an empty list.
     *
     * @param file the uploaded file, bound from the {@code fileFolder} request part
     * @return a successful {@link CallBack} wrapping the extracted paragraph strings
     * @throws IOException if the upload or the HTML parsing fails
     */
    @ApiOperation(value = "解析文件目录")
    @RequestMapping(value = "/upload", method = RequestMethod.POST)
    public CallBack<List<String>> searchWordDocX(@RequestParam("fileFolder") MultipartFile file) throws IOException {
        FileInfo entity = service.uploadFile(file);
        String fileName = entity.getOriginalFileName();
        List<String> list = new ArrayList<>();
        if (fileName == null) {
            // Without a name there is no extension to dispatch on; treat as unsupported.
            return CallBack.success(list);
        }
        String suff = fileName.substring(fileName.lastIndexOf('.') + 1);

        // NOTE(review): the "D:" drive prefix is hard-coded, so this path only resolves on a
        // Windows host with that drive. Kept as-is to avoid breaking the current deployment;
        // consider folding the drive into the file-root-path property.
        String filePath = "D:" + rootPath + entity.getPath() + "/" + entity.getFileName() + entity.getSuffixName();

        if ("docx".equalsIgnoreCase(suff)) {
            list = fileWord(filePath);
        } else if ("html".equalsIgnoreCase(suff)) {
            list = fileHtml(filePath);
        }
        return CallBack.success(list);
    }

    /**
     * Extracts the non-blank paragraphs of the first section of a Word document.
     *
     * <p>A paragraph whose first character is {@code (} is emitted as a second-level entry
     * with {@code id='pN'} (N = running second-level counter) and {@code pid} set to the
     * current first-level counter. Every other non-blank paragraph increments the
     * first-level counter and is emitted as a first-level entry.
     *
     * @param filePath path of the {@code .docx} file to load
     * @return the labelled paragraph strings in document order; empty if the document has
     *         no sections
     */
    public List<String> fileWord(String filePath) {
        List<String> list = new ArrayList<>();
        Document doc = new Document();
        try {
            doc.loadFromFile(filePath);
            if (doc.getSections().getCount() == 0) {
                return list;
            }
            // Only the first section is inspected.
            int count = doc.getSections().get(0).getParagraphs().getCount();
            System.out.println("总共含有段落数:" + count);

            int i = 0; // first-level counter
            int j = 0; // second-level counter
            for (int z = 0; z < count; z++) {
                Paragraph paragraph = doc.getSections().get(0).getParagraphs().get(z);
                String text = paragraph.getText();
                if (StringUtils.isBlank(text)) {
                    continue;
                }
                String content;
                if (text.startsWith("(")) {
                    // Advance the counter so each second-level paragraph gets its own id
                    // (previously every one was emitted as 'p0').
                    j++;
                    content = "获取二级段落:<p id='p" + j + "' pid='" + i + "'>" + text + "</p>";
                } else {
                    i++;
                    content = "获取一级段落:<p id='" + i + "'>" + text + "</p>";
                }
                System.out.println(content);
                list.add(content);
            }
            return list;
        } finally {
            // Release the resources held by the Spire document.
            doc.dispose();
        }
    }

    /**
     * Extracts labelled paragraphs from an HTML file.
     *
     * <p>Only elements carrying both CSS classes {@code _3ygOc} and {@code lg-fl} are
     * inspected. For each such element:
     * <ul>
     *   <li>if it contains {@code <span>} elements, every non-blank span text becomes a
     *       first-level entry;</li>
     *   <li>otherwise, if it contains {@code <strong>} elements, every non-blank strong text
     *       becomes a first-level entry;</li>
     *   <li>otherwise its {@code <p>} elements are classified by their first character:
     *       {@code (} gives second level, an ASCII digit gives third level, a character in
     *       U+4E00..U+9FA5 gives first level; anything else is dropped.</li>
     * </ul>
     *
     * @param filePath path of the HTML file, read as UTF-8
     * @return the labelled paragraph strings in document order
     * @throws IOException if the file cannot be read
     */
    public List<String> fileHtml(String filePath) throws IOException {
        List<String> list = new ArrayList<>();
        org.jsoup.nodes.Document doc = Jsoup.parse(new File(filePath), "UTF-8");

        // getElementsByClass expects a single class name; a CSS selector is used instead so
        // that elements carrying both classes match regardless of attribute whitespace.
        Elements containers = doc.select("._3ygOc.lg-fl");

        int i = 0; // first-level counter
        int j = 0; // second-level counter
        int k = 0; // third-level counter
        for (Element container : containers) {
            Elements span = container.getElementsByTag("span");
            // Spans take precedence; strong elements are only consulted when there are no spans.
            Elements headings = span.isEmpty() ? container.getElementsByTag("strong") : span;

            if (!headings.isEmpty()) {
                for (Element heading : headings) {
                    String text = heading.text();
                    if (StringUtils.isNotBlank(text)) {
                        i++;
                        String content = "获取一级段落<p id='" + i + "'>" + text + "</p>";
                        System.out.println(content);
                        list.add(content);
                    }
                }
                continue;
            }

            for (Element p : container.getElementsByTag("p")) {
                String text = p.text();
                if (StringUtils.isBlank(text)) {
                    continue;
                }
                String marker = text.substring(0, 1);
                String content;
                if (marker.contains("(")) {
                    j++;
                    // 'p' prefix on the id matches fileWord and the former log output.
                    content = "获取二级段落:<p id='p" + j + "' pid='" + i + "'>" + text + "</p>";
                } else if (LEADING_DIGIT.matcher(marker).matches()) {
                    k++;
                    content = "获取三级段落:<p id='" + k + "' pid='" + j + "'>" + text + "</p>";
                } else if (LEADING_HAN.matcher(marker).matches()) {
                    i++;
                    content = "获取一级段落:<p id='" + i + "'>" + text + "</p>";
                } else {
                    // Paragraphs with any other leading character are not reported.
                    continue;
                }
                System.out.println(content);
                list.add(content);
            }
        }
        return list;
    }
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论