企业级-生成PDF移除异常空白页

发布于:2024-07-01 ⋅ 阅读:(19) ⋅ 点赞:(0)

作者:fyupeng
技术专栏:☞ https://github.com/fyupeng
项目地址:☞ https://github.com/fyupeng/distributed-blog-system-api


留给读者

咱们又见面了,本期带给大家什么,请往下看,绝对是干货!

一、介绍

传入 PDF 文件的二进制内容(byte[]),返回移除空白页后的 PDF 文件二进制内容。判定空白页的条件:该页提取不到任何文本,并且以 300 DPI 渲染后白色像素占比超过 95%。

二、代码

引入依赖:

<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.21</version>
</dependency>

代码:

/**
 * Demo entry point: reads {@code d:/hztzs.pdf}, strips its blank pages via
 * {@code removeEmptyPages}, and writes the result to {@code d:/out.pdf}.
 *
 * @param args unused
 * @throws IOException if the input cannot be read, processed, or the output cannot be written
 */
public static void main(String[] args) throws IOException {
        File file = new File("d:/hztzs.pdf");
        // A single InputStream.read(byte[]) call may return before the buffer is full;
        // readAllBytes reads the whole file and closes the underlying stream itself.
        byte[] bytes = java.nio.file.Files.readAllBytes(file.toPath());

        bytes = new ArchivElecFileService().removeEmptyPages(bytes);

        File newfile = new File("d:/out.pdf");
        // try-with-resources guarantees the output stream is flushed and closed,
        // even when write() throws.
        try (FileOutputStream fos = new FileOutputStream(newfile)) {
            fos.write(bytes);
        }
    }

/**
 * Removes blank pages from a PDF and returns the resulting document as bytes.
 *
 * <p>A page is removed only when both checks agree: text extraction for that page
 * yields nothing but whitespace, and {@code isPageImageOnly} reports the rendered
 * page as blank.
 *
 * @param fileBytes the complete content of the PDF file
 * @return the content of the PDF after the blank pages have been removed
 * @throws IOException if the PDF cannot be loaded, read, rendered, or saved
 */
public byte[] removeEmptyPages(byte[] fileBytes) throws IOException {
        // PDDocument is Closeable; try-with-resources releases it on every exit path
        // (the previous version never closed the document).
        try (PDDocument document = PDDocument.load(fileBytes)) {
            PDPageTree pages = document.getPages();
            // One stripper is enough; only its page window changes per iteration.
            PDFTextStripper stripper = new PDFTextStripper();

            // Walk from the last page to the first so that removing page i never
            // shifts the index of a page that is still to be visited.
            for (int i = document.getNumberOfPages() - 1; i >= 0; i--) {
                // Page numbers are 1-based in PDFTextStripper
                stripper.setStartPage(i + 1);
                stripper.setEndPage(i + 1);
                String text = stripper.getText(document);

                // The render check is only run for pages without text.
                if (text.trim().isEmpty() && isPageImageOnly(pages.get(i), document)) {
                    document.removePage(i);
                }
            }

            // Serialize the resulting document into memory
            ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
            document.save(outputStream);
            return outputStream.toByteArray();
        }
    }

    /**
     * Renders the given page at 300 DPI and passes the rendering to {@code isImageOnly}.
     *
     * @param page     the page to inspect; its index is looked up in {@code document}'s page tree
     * @param document the document that contains {@code page}
     * @return the result of {@code isImageOnly} for the rendered page
     * @throws IOException if rendering fails
     */
    private static boolean isPageImageOnly(PDPage page, PDDocument document) throws IOException {
        int pageIndex = document.getPages().indexOf(page);
        PDFRenderer pageRenderer = new PDFRenderer(document);
        // 300 DPI is the resolution used for the blank check; adjust as needed
        BufferedImage rendered = pageRenderer.renderImageWithDPI(pageIndex, 300);
        return isImageOnly(rendered);
    }

    /**
     * Reports whether an image is almost entirely white.
     *
     * <p>The share of white pixels (as counted by {@code ImageUtils.countWhitePixels})
     * is compared against a fixed threshold of 95%.
     *
     * @param image the image to inspect
     * @return {@code true} if more than 95% of the pixels are white
     */
    private static boolean isImageOnly(BufferedImage image) {
        int totalPixels = image.getWidth() * image.getHeight();
        double whiteRatio = (double) ImageUtils.countWhitePixels(image) / totalPixels;
        // Threshold: strictly more than 95% white; adjust as needed
        return whiteRatio > 0.95;
    }

    /** Helper for counting the white pixels of an image. */
    static class ImageUtils {
        /**
         * Counts the pixels of {@code image} that are considered white.
         *
         * @param image the image to scan
         * @return the number of pixels whose red, green and blue components are all above 250
         */
        public static long countWhitePixels(BufferedImage image) {
            final int columns = image.getWidth();
            final int rows = image.getHeight();
            long whiteTotal = 0;
            for (int row = 0; row < rows; row++) {
                for (int col = 0; col < columns; col++) {
                    if (isWhite(image.getRGB(col, row))) {
                        whiteTotal++;
                    }
                }
            }
            return whiteTotal;
        }

        /**
         * Tests a packed RGB pixel value against the white threshold.
         * Bits 24-31 of the value are ignored.
         */
        private static boolean isWhite(int pixel) {
            int r = (pixel >> 16) & 0xff;
            int g = (pixel >> 8) & 0xff;
            int b = pixel & 0xff;
            // White means every channel is above 250, i.e. the darkest channel is.
            return Math.min(r, Math.min(g, b)) > 250;
        }
    }

三、总结

易用、高效、轻便!