侧边栏壁纸
博主头像
会飞的大象博主等级

爱运动的程序猿

  • 累计撰写 126 篇文章
  • 累计创建 158 个标签
  • 累计收到 0 条评论
标签搜索

目 录CONTENT

文章目录

pdf关键字坐标查询与高亮展示

会飞的大象
2024-07-03 / 0 评论 / 0 点赞 / 336 阅读 / 1,283 字

引用依赖

        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.31</version>
        </dependency>

关键字坐标查询

注:y 坐标需要通过 startPosition.getEndY() 获取;同时支持跨行、跨页的关键字查询。

package cn.byzk.knowledgecore;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.util.ObjectUtils;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Text stripper that records where a keyword starts in a PDF document.
 *
 * <p>Space characters are ignored while matching, and a keyword may continue
 * across consecutive text chunks (and therefore across lines and pages). After
 * {@code getText(document)} has run, {@link #getKeywordPositions()} holds one
 * entry per occurrence, built from the first glyph of the occurrence.
 *
 * @author 会飞的大象 2024-07-03
 */
public class PDFKeywordLocator extends PDFTextStripper {

    private final List<KeywordPosition> keywordPositions = new ArrayList<>();
    private final String keyword;

    /**
     * Trailing characters (at most {@code keyword.length() - 1}) of the
     * space-stripped text seen so far. A keyword that is cut by a chunk
     * boundary starts somewhere in here.
     */
    private final StringBuilder carryText = new StringBuilder();

    /** Position of every character in {@link #carryText}, index for index. */
    private final List<KeywordPosition> carryPositions = new ArrayList<>();

    /**
     * @param keyword text to search for; spaces inside it are ignored because
     *                spaces are also removed from the page text before matching
     * @throws IOException              if the underlying stripper cannot be created
     * @throws IllegalArgumentException if {@code keyword} is null or has no
     *                                  non-space character
     */
    public PDFKeywordLocator(String keyword) throws IOException {
        if (keyword == null) {
            throw new IllegalArgumentException("keyword must not be null");
        }
        String normalized = keyword.replace(" ", "");
        if (normalized.isEmpty()) {
            throw new IllegalArgumentException("keyword must contain a non-space character");
        }
        this.keyword = normalized;
    }

    /**
     * Scans one chunk of text for the keyword, then forwards the chunk to the
     * parent stripper.
     *
     * @param string        text of the chunk
     * @param textPositions glyphs the chunk was built from
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        String stripped = string.replace(" ", "");
        // Chunks without visible characters neither match nor interrupt a
        // keyword that is being continued from an earlier chunk.
        if (!stripped.isEmpty() && !textPositions.isEmpty()) {
            scan(string, textPositions);
        }
        // The space-stripped text is forwarded, exactly as before, so the text
        // returned by getText() does not change.
        super.writeString(stripped, textPositions);
    }

    /** Appends the chunk to the carried-over tail and records every keyword start. */
    private void scan(String string, List<TextPosition> textPositions) {
        int pageNo = this.getCurrentPageNo();
        int lastGlyph = textPositions.size() - 1;

        StringBuilder window = new StringBuilder(carryText);
        List<KeywordPosition> windowPositions = new ArrayList<>(carryPositions);
        for (int i = 0; i < string.length(); i++) {
            char c = string.charAt(i);
            if (c == ' ') {
                continue;
            }
            // Look the glyph up with the index in the ORIGINAL string: space
            // glyphs are part of textPositions, so indexes taken from the
            // stripped string would point at the wrong glyph. The clamp guards
            // against chunks whose text is longer than their glyph list.
            TextPosition glyph = textPositions.get(Math.min(i, lastGlyph));
            window.append(c);
            windowPositions.add(new KeywordPosition(pageNo, glyph.getXDirAdj(), glyph.getEndY()));
        }

        // The carried tail is shorter than the keyword, so every hit found here
        // ends inside the current chunk and is reported exactly once.
        int hit = window.indexOf(keyword);
        while (hit >= 0) {
            keywordPositions.add(windowPositions.get(hit));
            hit = window.indexOf(keyword, hit + 1);
        }

        // Keep just enough text for a keyword that continues in the next chunk.
        int keep = Math.min(keyword.length() - 1, window.length());
        carryText.setLength(0);
        carryText.append(window, window.length() - keep, window.length());
        carryPositions.clear();
        carryPositions.addAll(windowPositions.subList(windowPositions.size() - keep, windowPositions.size()));
    }

    /**
     * @return the occurrences recorded so far, in reading order (live list)
     */
    public List<KeywordPosition> getKeywordPositions() {
        return keywordPositions;
    }

    /** Demo: prints every occurrence of a keyword in a sample PDF. */
    public static void main(String[] args) throws IOException {
        String filePath = "C:\\Users\\isme\\Desktop\\test\\阿亮疯狂测试.pdf";
        String keyword = "亮测试";

        try (PDDocument document = PDDocument.load(new File(filePath))) {
            PDFKeywordLocator locator = new PDFKeywordLocator(keyword);
            locator.setSortByPosition(true);
            // Page numbers are 1-based in PDFTextStripper.
            locator.setStartPage(1);
            locator.setEndPage(document.getNumberOfPages());
            locator.getText(document);

            List<KeywordPosition> positions = locator.getKeywordPositions();
            for (KeywordPosition position : positions) {
                System.out.println("Keyword found at page:" + position.getPageNo() + ",x: " + position.getX() + ", y: " + position.getY());
            }
        }
    }
}

/**
 * Immutable value describing where a keyword was found: the page number and
 * the x / y coordinates supplied by the caller.
 */
class KeywordPosition {
    private final int pageNo;
    private final float x;
    private final float y;

    /**
     * @param pageNo page number of the occurrence
     * @param x      x coordinate of the occurrence
     * @param y      y coordinate of the occurrence
     */
    public KeywordPosition(int pageNo, float x, float y) {
        this.pageNo = pageNo;
        this.x = x;
        this.y = y;
    }

    /** @return the page number given at construction */
    public int getPageNo() {
        return this.pageNo;
    }

    /** @return the x coordinate given at construction */
    public float getX() {
        return this.x;
    }

    /** @return the y coordinate given at construction */
    public float getY() {
        return this.y;
    }
}

关键字高亮显示

注:高亮区域的坐标计算存在坑,需要自行计算并测试(当前版本已解决);同时支持跨行、跨页的关键字高亮。

package cn.byzk.knowledgecore;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDBorderStyleDictionary;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.util.ObjectUtils;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Text stripper that collects the glyphs of every keyword occurrence so they
 * can be covered with highlight annotations.
 *
 * <p>Space characters are ignored while matching, and a keyword may continue
 * across consecutive text chunks (and therefore across lines and pages). An
 * occurrence that spans several chunks is reported as one
 * {@link KeywordPositionColour} per chunk, so each highlight stays within a
 * single chunk.
 *
 * @author 会飞的大象 2024-07-03
 */
public class PDFKeywordHighlighter extends PDFTextStripper {

    private final List<KeywordPositionColour> keywordPositions = new ArrayList<>();
    private final String keyword;

    /**
     * Trailing characters (at most {@code keyword.length() - 1}) of the
     * space-stripped text seen so far. A keyword that is cut by a chunk
     * boundary starts somewhere in here.
     */
    private final StringBuilder carryText = new StringBuilder();

    /** Glyph behind every character in {@link #carryText}, index for index. */
    private final List<Glyph> carryGlyphs = new ArrayList<>();

    /** Running number of the chunk being processed; tells glyphs of different chunks apart. */
    private int chunkCounter;

    /**
     * @param keyword text to highlight; spaces inside it are ignored because
     *                spaces are also removed from the page text before matching
     * @throws IOException              if the underlying stripper cannot be created
     * @throws IllegalArgumentException if {@code keyword} is null or has no
     *                                  non-space character
     */
    public PDFKeywordHighlighter(String keyword) throws IOException {
        if (keyword == null) {
            throw new IllegalArgumentException("keyword must not be null");
        }
        String normalized = keyword.replace(" ", "");
        if (normalized.isEmpty()) {
            throw new IllegalArgumentException("keyword must contain a non-space character");
        }
        this.keyword = normalized;
    }

    /**
     * Scans one chunk of text for the keyword, then forwards the chunk to the
     * parent stripper.
     *
     * @param string        text of the chunk
     * @param textPositions glyphs the chunk was built from
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        String stripped = string.replace(" ", "");
        // Chunks without visible characters neither match nor interrupt a
        // keyword that is being continued from an earlier chunk.
        if (!stripped.isEmpty() && !textPositions.isEmpty()) {
            scan(string, textPositions);
        }
        // The space-stripped text is forwarded, exactly as before, so the text
        // returned by getText() does not change.
        super.writeString(stripped, textPositions);
    }

    /** Appends the chunk to the carried-over tail and records every keyword occurrence. */
    private void scan(String string, List<TextPosition> textPositions) {
        int pageNo = this.getCurrentPageNo();
        int chunk = ++chunkCounter;
        int lastGlyph = textPositions.size() - 1;

        StringBuilder window = new StringBuilder(carryText);
        List<Glyph> windowGlyphs = new ArrayList<>(carryGlyphs);
        for (int i = 0; i < string.length(); i++) {
            char c = string.charAt(i);
            if (c == ' ') {
                continue;
            }
            // Look the glyph up with the index in the ORIGINAL string: space
            // glyphs are part of textPositions, so indexes taken from the
            // stripped string would point at the wrong glyph. The clamp guards
            // against chunks whose text is longer than their glyph list.
            TextPosition position = textPositions.get(Math.min(i, lastGlyph));
            window.append(c);
            windowGlyphs.add(new Glyph(position, pageNo, chunk));
        }

        // The carried tail is shorter than the keyword, so every hit found here
        // ends inside the current chunk and is reported exactly once.
        int hit = window.indexOf(keyword);
        while (hit >= 0) {
            addHighlights(windowGlyphs.subList(hit, hit + keyword.length()));
            hit = window.indexOf(keyword, hit + 1);
        }

        // Keep just enough text for a keyword that continues in the next chunk.
        int keep = Math.min(keyword.length() - 1, window.length());
        carryText.setLength(0);
        carryText.append(window, window.length() - keep, window.length());
        carryGlyphs.clear();
        carryGlyphs.addAll(windowGlyphs.subList(windowGlyphs.size() - keep, windowGlyphs.size()));
    }

    /** Turns the glyphs of one occurrence into one highlight per chunk it touches. */
    private void addHighlights(List<Glyph> occurrence) {
        Glyph segmentStart = occurrence.get(0);
        List<TextPosition> segment = new ArrayList<>();
        for (Glyph glyph : occurrence) {
            if (glyph.chunk != segmentStart.chunk) {
                keywordPositions.add(new KeywordPositionColour(segment, segmentStart.pageNo));
                segment = new ArrayList<>();
                segmentStart = glyph;
            }
            segment.add(glyph.position);
        }
        keywordPositions.add(new KeywordPositionColour(segment, segmentStart.pageNo));
    }

    /**
     * @return the highlight segments recorded so far, in reading order (live list)
     */
    public List<KeywordPositionColour> getKeywordPositions() {
        return keywordPositions;
    }

    /** Demo: highlights every occurrence of a keyword in a sample PDF and saves a copy. */
    public static void main(String[] args) throws IOException {
        String filePath = "C:\\Users\\isme\\Desktop\\test\\阿亮疯狂测试.pdf";
        String keyword = "亮测试";
        String outputFilePath = "C:\\Users\\isme\\Desktop\\test\\阿亮疯狂测试1.pdf";

        try (PDDocument document = PDDocument.load(new File(filePath))) {
            PDFKeywordHighlighter highlighter = new PDFKeywordHighlighter(keyword);
            highlighter.setSortByPosition(true);
            // Page numbers are 1-based in PDFTextStripper.
            highlighter.setStartPage(1);
            highlighter.setEndPage(document.getNumberOfPages());
            highlighter.getText(document);

            for (KeywordPositionColour position : highlighter.getKeywordPositions()) {
                PDPage page = document.getPage(position.getPageNumber() - 1);
                PDAnnotationTextMarkup txtMark = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
                txtMark.setRectangle(position.getRectangle());
                txtMark.setQuadPoints(position.getQuadPoints());

                // DeviceRGB components range from 0 to 1; this is yellow.
                txtMark.setColor(new PDColor(new float[]{1f, 1f, 0f}, PDDeviceRGB.INSTANCE));
                PDBorderStyleDictionary border = new PDBorderStyleDictionary();
                border.setWidth(0);
                txtMark.setBorderStyle(border);

                page.getAnnotations().add(txtMark);
            }

            document.save(outputFilePath);
        }
    }

    /** One visible character of the page text together with where it came from. */
    private static final class Glyph {
        private final TextPosition position;
        private final int pageNo;
        private final int chunk;

        Glyph(TextPosition position, int pageNo, int chunk) {
            this.position = position;
            this.pageNo = pageNo;
            this.chunk = chunk;
        }
    }
}

/**
 * The glyphs of one keyword occurrence (or one part of it) on a page, plus
 * the geometry needed to put a highlight annotation over them.
 */
class KeywordPositionColour {
    private final int pageNo;
    private final List<TextPosition> positions;

    /**
     * @param positions glyphs to cover; the first and last entries define the
     *                  horizontal extent, the first entry the vertical one
     * @param pageNo    page number the glyphs belong to
     */
    public KeywordPositionColour(List<TextPosition> positions, int pageNo) {
        this.pageNo = pageNo;
        this.positions = positions;
    }

    /** @return the page number given at construction */
    public int getPageNumber() {
        return pageNo;
    }

    /**
     * Builds the eight quad-point coordinates of the highlight.
     *
     * @return corners as x/y pairs in the order lower-left, lower-right,
     *         upper-left, upper-right
     */
    public float[] getQuadPoints() {
        TextPosition head = firstGlyph();
        float left = head.getXDirAdj();
        float right = rightEdge();
        // NOTE(review): the 0.2 and 1.6 height factors look hand-tuned; confirm
        // against rendered output before changing them.
        float lower = head.getEndY() - head.getHeightDir() * 0.2f;
        float upper = head.getEndY() + 1.6f * head.getHeightDir();

        return new float[] {
            left, lower,
            right, lower,
            left, upper,
            right, upper
        };
    }

    /**
     * Builds the annotation rectangle: it starts one glyph height below the
     * first glyph's end-y, is one glyph height tall, and runs from the first
     * glyph's x to the right edge of the last glyph.
     *
     * @return the rectangle for the annotation
     */
    public PDRectangle getRectangle() {
        TextPosition head = firstGlyph();
        float left = head.getXDirAdj();
        float glyphHeight = head.getHeightDir();
        float span = rightEdge() - left;

        return new PDRectangle(left, head.getEndY() - glyphHeight, span, glyphHeight);
    }

    /** First glyph of the run. */
    private TextPosition firstGlyph() {
        return positions.get(0);
    }

    /** X coordinate just past the last glyph of the run. */
    private float rightEdge() {
        TextPosition tail = positions.get(positions.size() - 1);
        return tail.getXDirAdj() + tail.getWidthDirAdj();
    }
}
0

评论区