P15GEN2\59518
2025-10-18 56638c01bb2cc61a92f5e03c9a1001be5b5d3699
ai/src/ai/AiHandler.java
@@ -1,20 +1,19 @@
package ai;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.nio.charset.StandardCharsets;
import org.apache.pdfbox.multipdf.Splitter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import com.aliyun.ocr20191230.Client;
import com.aliyun.ocr20191230.models.RecognizePdfResponseBody;
import com.aliyun.tea.TeaException;
import foundation.dao.DataPackage;
import foundation.data.entity.Entity;
import foundation.icall.ICall;
import foundation.icall.ICallBucket;
import foundation.icall.ICallCenter;
import foundation.util.Util;
import foundation.workflow.ActionProvider;
public class AiHandler extends ActionProvider {
@@ -22,110 +21,74 @@
   @Override
   protected void publishMethod() {
      addMethod("pdf");
      addMethod("writerResult");
   }
   
   public void pdf() throws Exception {
      Client client = createClient();
        String filePath = dataReader.getString("filePath");
        File file = new File(filePath);
        PDDocument document = PDDocument.load(file);
        int numberOfPages = document.getNumberOfPages();
        List<File> fileList = new ArrayList<>();
        if (numberOfPages > 5) {
            fileList.addAll(splitFiles(document));
        } else {
            fileList.add(file);
        }
        document.close();
        for (int i = 0; i < fileList.size(); i++) {
            File oneSubFile = fileList.get(i);
            logger.info("总共:{} 开始读取第{}个 文件名:{} ", fileList.size(), i+1, oneSubFile.getName());
            getPDFText(client, oneSubFile);
        }
       AliyunOcrApiDirect ocr = new AliyunOcrApiDirect();
       DataPackage dataPackage = dataReader.getDataPackage();
       dataPackage.loadOneDataFromDB();
       Entity master = dataPackage.getMasterEntity();
       String fileUrl = master.getString("file_url");
       // PDF识别
       String result = ocr.recognizePdf(fileUrl, 1);
       System.out.println("PDF识别结果: " + result);
   }
   private List<File> splitFiles(PDDocument document) {
        List<File> fileList = new ArrayList<>();
        //1 创建拆分器并设置每5页拆分一次
        Splitter splitter = new Splitter();
        splitter.setSplitAtPage(5); // 关键参数设置
        // 3. 执行拆分操作
        try {
            List<PDDocument> splitDocuments = splitter.split(document);
            // 4. 保存拆分后的文件
            String outputDir = "output/"; // 输出目录
            new File(outputDir).mkdirs(); // 创建目录
            for (int i = 0; i < splitDocuments.size(); i++) {
                String outputPath = outputDir + "split_" + (i + 1) + ".pdf";
                splitDocuments.get(i).save(outputPath);
                splitDocuments.get(i).close();
                File file = new File(outputPath);
                fileList.add(file);
                System.out.println("生成文件: " + outputPath);
   public void writerResult() throws Exception {
      DataPackage dataPackage = dataReader.getDataPackage();
      dataPackage.loadOneDataFromDB();
      Entity master = dataPackage.getMasterEntity();
      int index = 0;
      String baiduFileUrl = master.getString("baidu_file_url");
      ICallCenter icallCenter = ICallCenter.getInstance();
      ICallBucket callBucket = ICallBucket.getInstance();
      ICall iCall = callBucket.getOne("document-parser-quary");
      while (Util.isEmpty(baiduFileUrl) && index < 3) {
         step.setDataPackage(dataPackage);
         icallCenter.callRemote(step, iCall);
         dataPackage.loadOneDataFromDB(true);
         master = dataPackage.getMasterEntity();
         index ++;
         Thread.sleep(2000);
         baiduFileUrl = master.getString("baidu_file_url");
      }
      String jsonContent = fetchJsonWithHttpClient(baiduFileUrl);
      AIResult result = new AIResult(jsonContent);
      dataWriter.addValue("content", result);
      dataWriter.addValue("data", master);
   }
   public static String fetchJsonWithHttpClient(String url) {
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            HttpGet request = new HttpGet(url);
            // 设置请求头
            request.setHeader("Accept", "application/json; charset=UTF-8");
            request.setHeader("Accept-Charset", "UTF-8");
            request.setHeader("User-Agent", "Mozilla/5.0");
            try (CloseableHttpResponse response = httpClient.execute(request)) {
                int statusCode = response.getCode();
                if (statusCode == 200) {
                    return EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
                } else {
                    System.err.println("请求失败,状态码: " + statusCode);
                    return null;
                }
            }
            System.out.println("拆分完成,共生成" + splitDocuments.size() + "个文件");
        } catch (IOException e) {
        } catch (Exception e) {
            e.printStackTrace();
        }
        return fileList;
    }
    private void getPDFText(Client client, File oneSubFile) throws FileNotFoundException {
        com.aliyun.ocr20191230.models.RecognizePdfAdvanceRequest recognizePdfRequest = new com.aliyun.ocr20191230.models.RecognizePdfAdvanceRequest();
        InputStream inputStream = new FileInputStream(oneSubFile);
        recognizePdfRequest.setFileURLObject(inputStream);
        com.aliyun.teautil.models.RuntimeOptions runtime = new com.aliyun.teautil.models.RuntimeOptions();
        try {
            com.aliyun.ocr20191230.models.RecognizePdfResponse resp = client.recognizePdfAdvance(recognizePdfRequest, runtime);
            RecognizePdfResponseBody body = resp.getBody();
            RecognizePdfResponseBody.RecognizePdfResponseBodyData data = body.getData();
            List<RecognizePdfResponseBody.RecognizePdfResponseBodyDataWordsInfo> wordsInfo = data.getWordsInfo();
            for (RecognizePdfResponseBody.RecognizePdfResponseBodyDataWordsInfo recognizePdfResponseBodyDataWordsInfo : wordsInfo) {
                String word = recognizePdfResponseBodyDataWordsInfo.word;
                logger.info("文字:{}", word);
            }
            com.aliyun.teaconsole.Client.log(com.aliyun.teautil.Common.toJSONString(resp));
        } catch (TeaException error) {
            error.printStackTrace();
            logger.info(error.getMessage());
            // 此处仅做打印展示,请谨慎对待异常处理,在工程项目中切勿直接忽略异常。
            // 错误 message
            System.out.println(error.getMessage());
            // 诊断地址
            System.out.println(error.getData().get("Recommend"));
            com.aliyun.teautil.Common.assertAsString(error.message);
        } catch (Exception _error) {
            TeaException error = new TeaException(_error.getMessage(), _error);
            // 此处仅做打印展示,请谨慎对待异常处理,在工程项目中切勿直接忽略异常。
            // 错误 message
            System.out.println(error.getMessage());
            // 诊断地址
            System.out.println(error.getData().get("Recommend"));
            com.aliyun.teautil.Common.assertAsString(error.message);
            return null;
        }
    }
     public static com.aliyun.ocr20191230.Client createClient() throws Exception {
           // 工程代码建议使用更安全的无AK方式,凭据配置方式请参见:https://help.aliyun.com/document_detail/378657.html。
           com.aliyun.teaopenapi.models.Config config = new com.aliyun.teaopenapi.models.Config()
                   .setAccessKeyId("LTAI5tCSkZYYhkUCsk4v4CCu")
                   .setAccessKeySecret("vhJBGvKQKmKFIpUq6WQndYYMwwRaP7");
           // Endpoint 请参考 https://api.aliyun.com/product/ocr
           config.endpoint = "ocr.cn-shanghai.aliyuncs.com";
           return new com.aliyun.ocr20191230.Client(config);
       }
}