引用依赖
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.31</version>
</dependency>
关键字坐标查询
注:y 坐标应通过 startPosition.getEndY() 获取;本实现同时支持关键字跨行、跨页时的查询。
package cn.byzk.knowledgecore;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.util.ObjectUtils;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Text stripper that records the page and coordinates of every occurrence of a keyword.
 *
 * <p>ASCII spaces are ignored on both sides of the comparison: they are removed from the
 * keyword and from every chunk of text handed to {@link #writeString(String, List)}.
 * A keyword that is split over several consecutive chunks (for example by a line break or
 * a page break) is still reported, at the position where it starts.
 *
 * @author 会飞的大象2024-07-03
 */
public class PDFKeywordLocator extends PDFTextStripper {

    /** Start positions of all keyword occurrences found so far, in discovery order. */
    private final List<KeywordPosition> keywordPositions = new ArrayList<>();

    /** The keyword to look for, with ASCII spaces removed. */
    private final String keyword;

    /** Partial matches that began in an earlier chunk and may be completed by a later one. */
    private List<PendingMatch> pendingMatches = new ArrayList<>();

    /**
     * Creates a locator for the given keyword.
     *
     * @param keyword text to search for; spaces in it are ignored
     * @throws IllegalArgumentException if the keyword is null or contains nothing but spaces
     * @throws IOException if the underlying text stripper cannot be initialised
     */
    public PDFKeywordLocator(String keyword) throws IOException {
        if (keyword == null) {
            throw new IllegalArgumentException("keyword must not be null");
        }
        // The extracted text is compared without spaces, so the keyword must be as well.
        String compactKeyword = keyword.replace(" ", "");
        if (compactKeyword.isEmpty()) {
            throw new IllegalArgumentException("keyword must contain at least one non-space character");
        }
        this.keyword = compactKeyword;
    }

    /**
     * Searches one chunk of extracted text for the keyword and then forwards the chunk,
     * with its spaces removed, to the default writer.
     *
     * @param string the chunk of text
     * @param textPositions glyph positions belonging to the chunk
     * @throws IOException if the default writer fails
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        String text = string.replace(" ", "");
        if (!textPositions.isEmpty()) {
            // Keep, for every non-space character, the glyph found at the character's index in
            // the ORIGINAL string, so that removing spaces does not shift the coordinates.
            // Assumes characters and glyph positions line up one-to-one; the index is clamped
            // in case the position list is shorter than the string.
            int lastIndex = textPositions.size() - 1;
            List<TextPosition> glyphs = new ArrayList<>(text.length());
            for (int i = 0; i < string.length(); i++) {
                if (string.charAt(i) != ' ') {
                    glyphs.add(textPositions.get(Math.min(i, lastIndex)));
                }
            }
            search(text, glyphs);
        }
        super.writeString(text, textPositions);
    }

    /**
     * Updates the result list and the pending partial matches for one space-free chunk.
     *
     * @param text the chunk without spaces
     * @param glyphs one glyph position per character of {@code text}
     */
    private void search(String text, List<TextPosition> glyphs) {
        List<PendingMatch> carried = new ArrayList<>();

        // 1. Try to complete (or extend) matches that started in earlier chunks.
        for (PendingMatch pending : pendingMatches) {
            String remainder = keyword.substring(pending.matchedLength);
            if (text.startsWith(remainder)) {
                keywordPositions.add(pending.start);
            } else if (remainder.startsWith(text)) {
                // The whole chunk (possibly empty) fits inside the keyword: keep waiting.
                carried.add(new PendingMatch(pending.start, pending.matchedLength + text.length()));
            }
            // Otherwise the partial match is broken and is dropped.
        }

        // 2. Occurrences that lie completely inside this chunk (overlapping ones included).
        for (int at = text.indexOf(keyword); at >= 0; at = text.indexOf(keyword, at + 1)) {
            keywordPositions.add(locate(glyphs.get(at)));
        }

        // 3. Every proper keyword prefix that ends this chunk may be completed by the next one.
        int longest = Math.min(keyword.length() - 1, text.length());
        for (int len = 1; len <= longest; len++) {
            int from = text.length() - len;
            if (text.regionMatches(from, keyword, 0, len)) {
                carried.add(new PendingMatch(locate(glyphs.get(from)), len));
            }
        }
        pendingMatches = carried;
    }

    /** Builds the result entry for a keyword starting at the given glyph on the current page. */
    private KeywordPosition locate(TextPosition start) {
        return new KeywordPosition(this.getCurrentPageNo(), start.getXDirAdj(), start.getEndY());
    }

    /**
     * Returns the positions collected so far.
     *
     * @return the live list of keyword start positions
     */
    public List<KeywordPosition> getKeywordPositions() {
        return keywordPositions;
    }

    /**
     * Demo entry point: prints every position of a sample keyword in a sample file.
     *
     * @param args unused
     * @throws IOException if the PDF cannot be read
     */
    public static void main(String[] args) throws IOException {
        String filePath = "C:\\Users\\isme\\Desktop\\test\\阿亮疯狂测试.pdf";
        String keyword = "亮测试";
        try (PDDocument document = PDDocument.load(new File(filePath))) {
            PDFKeywordLocator locator = new PDFKeywordLocator(keyword);
            locator.setSortByPosition(true);
            // Page numbers of the text stripper are 1-based.
            locator.setStartPage(1);
            locator.setEndPage(document.getNumberOfPages());
            locator.getText(document);
            List<KeywordPosition> positions = locator.getKeywordPositions();
            for (KeywordPosition position : positions) {
                System.out.println("Keyword found at page:" + position.getPageNo() + ",x: " + position.getX() + ", y: " + position.getY());
            }
        }
    }

    /** A keyword prefix already matched at the end of previous chunks. */
    private static final class PendingMatch {
        /** Position of the first keyword character. */
        private final KeywordPosition start;
        /** Number of keyword characters matched so far (always less than the keyword length). */
        private final int matchedLength;

        PendingMatch(KeywordPosition start, int matchedLength) {
            this.start = start;
            this.matchedLength = matchedLength;
        }
    }
}
/**
 * Immutable location of a keyword occurrence: a page number plus an (x, y) coordinate.
 */
class KeywordPosition {

    /** Page number on which the keyword was found. */
    private final int pageNo;

    /** Horizontal coordinate of the keyword start. */
    private final float x;

    /** Vertical coordinate of the keyword start. */
    private final float y;

    /**
     * @param pageNo page number on which the keyword was found
     * @param x horizontal coordinate of the keyword start
     * @param y vertical coordinate of the keyword start
     */
    public KeywordPosition(int pageNo, float x, float y) {
        this.pageNo = pageNo;
        this.x = x;
        this.y = y;
    }

    /** @return the page number given at construction */
    public int getPageNo() {
        return this.pageNo;
    }

    /** @return the horizontal coordinate given at construction */
    public float getX() {
        return this.x;
    }

    /** @return the vertical coordinate given at construction */
    public float getY() {
        return this.y;
    }
}
关键字高亮显示
注:高亮区域的坐标计算容易出错,需要自行计算并反复测试(当前版本已解决);本实现同时支持关键字跨行、跨页时的高亮。
package cn.byzk.knowledgecore;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDBorderStyleDictionary;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.util.ObjectUtils;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Text stripper that collects the glyph runs of every occurrence of a keyword so that they
 * can be covered with highlight annotations.
 *
 * <p>ASCII spaces are ignored on both sides of the comparison: they are removed from the
 * keyword and from every chunk of text handed to {@link #writeString(String, List)}.
 * A keyword that is split over several consecutive chunks (for example by a line break or
 * a page break) yields one {@link KeywordPositionColour} per chunk it touches.
 *
 * @author 会飞的大象2024-07-03
 */
public class PDFKeywordHighlighter extends PDFTextStripper {

    /** Glyph runs to highlight, in discovery order. */
    private final List<KeywordPositionColour> keywordPositions = new ArrayList<>();

    /** The keyword to look for, with ASCII spaces removed. */
    private final String keyword;

    /** Partial matches that began in an earlier chunk and may be completed by a later one. */
    private List<PendingMatch> pendingMatches = new ArrayList<>();

    /**
     * Creates a highlighter for the given keyword.
     *
     * @param keyword text to search for; spaces in it are ignored
     * @throws IllegalArgumentException if the keyword is null or contains nothing but spaces
     * @throws IOException if the underlying text stripper cannot be initialised
     */
    public PDFKeywordHighlighter(String keyword) throws IOException {
        if (keyword == null) {
            throw new IllegalArgumentException("keyword must not be null");
        }
        // The extracted text is compared without spaces, so the keyword must be as well.
        String compactKeyword = keyword.replace(" ", "");
        if (compactKeyword.isEmpty()) {
            throw new IllegalArgumentException("keyword must contain at least one non-space character");
        }
        this.keyword = compactKeyword;
    }

    /**
     * Searches one chunk of extracted text for the keyword and then forwards the chunk,
     * with its spaces removed, to the default writer.
     *
     * @param string the chunk of text
     * @param textPositions glyph positions belonging to the chunk
     * @throws IOException if the default writer fails
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        String text = string.replace(" ", "");
        if (!textPositions.isEmpty()) {
            // Keep, for every non-space character, the glyph found at the character's index in
            // the ORIGINAL string, so that leading, embedded or trailing spaces cannot shift
            // the highlighted range. Assumes characters and glyph positions line up
            // one-to-one; the index is clamped in case the position list is shorter.
            int lastIndex = textPositions.size() - 1;
            List<TextPosition> glyphs = new ArrayList<>(text.length());
            for (int i = 0; i < string.length(); i++) {
                if (string.charAt(i) != ' ') {
                    glyphs.add(textPositions.get(Math.min(i, lastIndex)));
                }
            }
            search(text, glyphs);
        }
        super.writeString(text, textPositions);
    }

    /**
     * Updates the result list and the pending partial matches for one space-free chunk.
     *
     * @param text the chunk without spaces
     * @param glyphs one glyph position per character of {@code text}
     */
    private void search(String text, List<TextPosition> glyphs) {
        int page = this.getCurrentPageNo();
        List<PendingMatch> carried = new ArrayList<>();

        // 1. Try to complete (or extend) matches that started in earlier chunks.
        for (PendingMatch pending : pendingMatches) {
            String remainder = keyword.substring(pending.matchedLength);
            if (text.startsWith(remainder)) {
                keywordPositions.addAll(pending.segments);
                keywordPositions.add(segment(glyphs, 0, remainder.length(), page));
            } else if (remainder.startsWith(text)) {
                // The whole chunk (possibly empty) fits inside the keyword: keep waiting.
                List<KeywordPositionColour> segments = new ArrayList<>(pending.segments);
                if (!text.isEmpty()) {
                    segments.add(segment(glyphs, 0, text.length(), page));
                }
                carried.add(new PendingMatch(segments, pending.matchedLength + text.length()));
            }
            // Otherwise the partial match is broken and is dropped.
        }

        // 2. Occurrences that lie completely inside this chunk, highlighted where they
        //    actually are (overlapping occurrences included).
        for (int at = text.indexOf(keyword); at >= 0; at = text.indexOf(keyword, at + 1)) {
            keywordPositions.add(segment(glyphs, at, at + keyword.length(), page));
        }

        // 3. Every proper keyword prefix that ends this chunk may be completed by the next one.
        int longest = Math.min(keyword.length() - 1, text.length());
        for (int len = 1; len <= longest; len++) {
            int from = text.length() - len;
            if (text.regionMatches(from, keyword, 0, len)) {
                List<KeywordPositionColour> segments = new ArrayList<>();
                segments.add(segment(glyphs, from, text.length(), page));
                carried.add(new PendingMatch(segments, len));
            }
        }
        pendingMatches = carried;
    }

    /** Copies the glyphs in {@code [from, to)} into a highlight run on the given page. */
    private static KeywordPositionColour segment(List<TextPosition> glyphs, int from, int to, int page) {
        return new KeywordPositionColour(new ArrayList<TextPosition>(glyphs.subList(from, to)), page);
    }

    /**
     * Returns the glyph runs collected so far.
     *
     * @return the live list of runs to highlight
     */
    public List<KeywordPositionColour> getKeywordPositions() {
        return keywordPositions;
    }

    /**
     * Demo entry point: highlights a sample keyword in a sample file and saves a copy.
     *
     * @param args unused
     * @throws IOException if the PDF cannot be read or written
     */
    public static void main(String[] args) throws IOException {
        String filePath = "C:\\Users\\isme\\Desktop\\test\\阿亮疯狂测试.pdf";
        String keyword = "亮测试";
        String outputFilePath = "C:\\Users\\isme\\Desktop\\test\\阿亮疯狂测试1.pdf";
        try (PDDocument document = PDDocument.load(new File(filePath))) {
            PDFKeywordHighlighter highlighter = new PDFKeywordHighlighter(keyword);
            highlighter.setSortByPosition(true);
            // Page numbers of the text stripper are 1-based.
            highlighter.setStartPage(1);
            highlighter.setEndPage(document.getNumberOfPages());
            highlighter.getText(document);
            for (KeywordPositionColour position : highlighter.getKeywordPositions()) {
                // The recorded page number is 1-based, getPage expects a 0-based index.
                PDPage page = document.getPage(position.getPageNumber() - 1);
                PDAnnotationTextMarkup txtMark = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
                txtMark.setRectangle(position.getRectangle());
                txtMark.setQuadPoints(position.getQuadPoints());
                // Yellow. DeviceRGB components range from 0 to 1, not from 0 to 255.
                txtMark.setColor(new PDColor(new float[]{1f, 1f, 0f}, PDDeviceRGB.INSTANCE));
                PDBorderStyleDictionary border = new PDBorderStyleDictionary();
                border.setWidth(0);
                txtMark.setBorderStyle(border);
                page.getAnnotations().add(txtMark);
            }
            document.save(outputFilePath);
        }
    }

    /** A keyword prefix already matched at the end of previous chunks. */
    private static final class PendingMatch {
        /** Highlight runs of the part matched so far, one per chunk touched. */
        private final List<KeywordPositionColour> segments;
        /** Number of keyword characters matched so far (always less than the keyword length). */
        private final int matchedLength;

        PendingMatch(List<KeywordPositionColour> segments, int matchedLength) {
            this.segments = segments;
            this.matchedLength = matchedLength;
        }
    }
}
/**
 * A run of glyphs on one page that should be highlighted, together with the geometry
 * (quad points and bounding rectangle) derived from its first and last glyph.
 */
class KeywordPositionColour {

    /** Page number of the run. */
    private final int pageNo;

    /** Glyphs of the run; only the first and the last one are used for the geometry. */
    private final List<TextPosition> positions;

    /**
     * @param positions glyphs of the run, in reading order
     * @param pageNo page number of the run
     */
    public KeywordPositionColour(List<TextPosition> positions, int pageNo) {
        this.pageNo = pageNo;
        this.positions = positions;
    }

    /** @return the page number given at construction */
    public int getPageNumber() {
        return pageNo;
    }

    /**
     * Computes the four corners of the highlight area.
     *
     * @return eight floats: bottom-left, bottom-right, top-left and top-right corner (x, y each)
     */
    public float[] getQuadPoints() {
        TextPosition head = positions.get(0);
        TextPosition tail = positions.get(positions.size() - 1);

        float left = head.getXDirAdj();
        float right = tail.getXDirAdj() + tail.getWidthDirAdj();
        // Vertical extent is derived from the first glyph only: 0.2 heights below its end Y
        // and 1.6 heights above it.
        float bottom = head.getEndY() - head.getHeightDir() * 0.2f;
        float top = head.getEndY() + 1.6f * head.getHeightDir();

        return new float[] {
            left, bottom,
            right, bottom,
            left, top,
            right, top
        };
    }

    /**
     * Computes the annotation rectangle of the run.
     *
     * @return rectangle starting one glyph height below the first glyph's end Y, spanning from
     *         the first glyph's x to the right edge of the last glyph, one glyph height tall
     */
    public PDRectangle getRectangle() {
        TextPosition head = positions.get(0);
        TextPosition tail = positions.get(positions.size() - 1);

        float left = head.getXDirAdj();
        float glyphHeight = head.getHeightDir();
        float lowerEdge = head.getEndY() - glyphHeight;
        float span = tail.getXDirAdj() + tail.getWidthDirAdj() - left;
        return new PDRectangle(left, lowerEdge, span, glyphHeight);
    }
}
评论区