P15GEN2\59518
2025-10-10 9f6890646993d16260d4201d613c092132856127
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
package ai;
 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
 
import org.apache.pdfbox.multipdf.Splitter;
import org.apache.pdfbox.pdmodel.PDDocument;
 
import com.aliyun.ocr20191230.Client;
import com.aliyun.ocr20191230.models.RecognizePdfResponseBody;
import com.aliyun.tea.TeaException;
 
import foundation.workflow.ActionProvider;
 
public class AiHandler extends ActionProvider {
 
    @Override
    protected void publishMethod() {
        addMethod("pdf");
    }
    
    
    public void pdf() throws Exception {
        Client client = createClient();
        String filePath = dataReader.getString("filePath");
        File file = new File(filePath);
        PDDocument document = PDDocument.load(file);
        int numberOfPages = document.getNumberOfPages();
 
 
        List<File> fileList = new ArrayList<>();
        if (numberOfPages > 5) {
            fileList.addAll(splitFiles(document));
        } else {
            fileList.add(file);
        }
        document.close();
 
        for (int i = 0; i < fileList.size(); i++) {
            File oneSubFile = fileList.get(i);
            logger.info("总共:{} 开始读取第{}个 文件名:{} ", fileList.size(), i+1, oneSubFile.getName());
            getPDFText(client, oneSubFile);
        }
 
    }
 
    private List<File> splitFiles(PDDocument document) {
        List<File> fileList = new ArrayList<>();
        //1 创建拆分器并设置每5页拆分一次
        Splitter splitter = new Splitter();
        splitter.setSplitAtPage(5); // 关键参数设置
        // 3. 执行拆分操作
        try {
            List<PDDocument> splitDocuments = splitter.split(document);
 
            // 4. 保存拆分后的文件
            String outputDir = "output/"; // 输出目录
            new File(outputDir).mkdirs(); // 创建目录
 
            for (int i = 0; i < splitDocuments.size(); i++) {
                String outputPath = outputDir + "split_" + (i + 1) + ".pdf";
                splitDocuments.get(i).save(outputPath);
                splitDocuments.get(i).close();
                File file = new File(outputPath);
                fileList.add(file);
                System.out.println("生成文件: " + outputPath);
            }
 
            System.out.println("拆分完成,共生成" + splitDocuments.size() + "个文件");
 
        } catch (IOException e) {
            e.printStackTrace();
        }
        return fileList;
    }
 
    private void getPDFText(Client client, File oneSubFile) throws FileNotFoundException {
        com.aliyun.ocr20191230.models.RecognizePdfAdvanceRequest recognizePdfRequest = new com.aliyun.ocr20191230.models.RecognizePdfAdvanceRequest();
        InputStream inputStream = new FileInputStream(oneSubFile);
        recognizePdfRequest.setFileURLObject(inputStream);
        com.aliyun.teautil.models.RuntimeOptions runtime = new com.aliyun.teautil.models.RuntimeOptions();
        try {
            com.aliyun.ocr20191230.models.RecognizePdfResponse resp = client.recognizePdfAdvance(recognizePdfRequest, runtime);
            RecognizePdfResponseBody body = resp.getBody();
            RecognizePdfResponseBody.RecognizePdfResponseBodyData data = body.getData();
            List<RecognizePdfResponseBody.RecognizePdfResponseBodyDataWordsInfo> wordsInfo = data.getWordsInfo();
            for (RecognizePdfResponseBody.RecognizePdfResponseBodyDataWordsInfo recognizePdfResponseBodyDataWordsInfo : wordsInfo) {
                String word = recognizePdfResponseBodyDataWordsInfo.word;
                logger.info("文字:{}", word);
            }
 
 
            com.aliyun.teaconsole.Client.log(com.aliyun.teautil.Common.toJSONString(resp));
        } catch (TeaException error) {
            error.printStackTrace();
            logger.info(error.getMessage());
            // 此处仅做打印展示,请谨慎对待异常处理,在工程项目中切勿直接忽略异常。
            // 错误 message
            System.out.println(error.getMessage());
            // 诊断地址
            System.out.println(error.getData().get("Recommend"));
            com.aliyun.teautil.Common.assertAsString(error.message);
        } catch (Exception _error) {
            TeaException error = new TeaException(_error.getMessage(), _error);
            // 此处仅做打印展示,请谨慎对待异常处理,在工程项目中切勿直接忽略异常。
            // 错误 message
            System.out.println(error.getMessage());
            // 诊断地址
            System.out.println(error.getData().get("Recommend"));
            com.aliyun.teautil.Common.assertAsString(error.message);
        }
    }
    
    
    
      public static com.aliyun.ocr20191230.Client createClient() throws Exception {
            // 工程代码建议使用更安全的无AK方式,凭据配置方式请参见:https://help.aliyun.com/document_detail/378657.html。
            com.aliyun.teaopenapi.models.Config config = new com.aliyun.teaopenapi.models.Config()
                    .setAccessKeyId("LTAI5tCSkZYYhkUCsk4v4CCu")
                    .setAccessKeySecret("vhJBGvKQKmKFIpUq6WQndYYMwwRaP7");
            // Endpoint 请参考 https://api.aliyun.com/product/ocr
            config.endpoint = "ocr.cn-shanghai.aliyuncs.com";
            return new com.aliyun.ocr20191230.Client(config);
        }
    
}