| | |
| | | package ai; |
| | | |
| | | import java.io.File; |
| | | import java.io.FileInputStream; |
| | | import java.io.FileNotFoundException; |
| | | import java.io.IOException; |
| | | import java.io.InputStream; |
| | | import java.util.ArrayList; |
| | | import java.util.List; |
| | | import java.nio.charset.StandardCharsets; |
| | | |
| | | import org.apache.pdfbox.multipdf.Splitter; |
| | | import org.apache.pdfbox.pdmodel.PDDocument; |
| | | import org.apache.hc.client5.http.classic.methods.HttpGet; |
| | | import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; |
| | | import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; |
| | | import org.apache.hc.client5.http.impl.classic.HttpClients; |
| | | import org.apache.hc.core5.http.io.entity.EntityUtils; |
| | | |
| | | import com.aliyun.ocr20191230.Client; |
| | | import com.aliyun.ocr20191230.models.RecognizePdfResponseBody; |
| | | import com.aliyun.tea.TeaException; |
| | | |
| | | import foundation.dao.DataPackage; |
| | | import foundation.data.entity.Entity; |
| | | import foundation.icall.ICall; |
| | | import foundation.icall.ICallBucket; |
| | | import foundation.icall.ICallCenter; |
| | | import foundation.util.Util; |
| | | import foundation.workflow.ActionProvider; |
| | | |
| | | public class AiHandler extends ActionProvider { |
| | |
| | | @Override |
| | | protected void publishMethod() { |
| | | addMethod("pdf"); |
| | | |
| | | addMethod("writerResult"); |
| | | } |
| | | |
| | | |
| | | public void pdf() throws Exception { |
| | | Client client = createClient(); |
| | | String filePath = dataReader.getString("filePath"); |
| | | File file = new File(filePath); |
| | | PDDocument document = PDDocument.load(file); |
| | | int numberOfPages = document.getNumberOfPages(); |
| | | |
| | | |
| | | List<File> fileList = new ArrayList<>(); |
| | | if (numberOfPages > 5) { |
| | | fileList.addAll(splitFiles(document)); |
| | | } else { |
| | | fileList.add(file); |
| | | } |
| | | document.close(); |
| | | |
| | | for (int i = 0; i < fileList.size(); i++) { |
| | | File oneSubFile = fileList.get(i); |
| | | logger.info("总共:{} 开始读取第{}个 文件名:{} ", fileList.size(), i+1, oneSubFile.getName()); |
| | | getPDFText(client, oneSubFile); |
| | | } |
| | | |
| | | AliyunOcrApiDirect ocr = new AliyunOcrApiDirect(); |
| | | |
| | | DataPackage dataPackage = dataReader.getDataPackage(); |
| | | dataPackage.loadOneDataFromDB(); |
| | | Entity master = dataPackage.getMasterEntity(); |
| | | String fileUrl = master.getString("file_url"); |
| | | // PDF识别 |
| | | String result = ocr.recognizePdf(fileUrl, 1); |
| | | System.out.println("PDF识别结果: " + result); |
| | | } |
| | | |
| | | private List<File> splitFiles(PDDocument document) { |
| | | List<File> fileList = new ArrayList<>(); |
| | | //1 创建拆分器并设置每5页拆分一次 |
| | | Splitter splitter = new Splitter(); |
| | | splitter.setSplitAtPage(5); // 关键参数设置 |
| | | // 3. 执行拆分操作 |
| | | try { |
| | | List<PDDocument> splitDocuments = splitter.split(document); |
| | | |
| | | // 4. 保存拆分后的文件 |
| | | String outputDir = "output/"; // 输出目录 |
| | | new File(outputDir).mkdirs(); // 创建目录 |
| | | |
| | | for (int i = 0; i < splitDocuments.size(); i++) { |
| | | String outputPath = outputDir + "split_" + (i + 1) + ".pdf"; |
| | | splitDocuments.get(i).save(outputPath); |
| | | splitDocuments.get(i).close(); |
| | | File file = new File(outputPath); |
| | | fileList.add(file); |
| | | System.out.println("生成文件: " + outputPath); |
| | | |
| | | public void writerResult() throws Exception { |
| | | DataPackage dataPackage = dataReader.getDataPackage(); |
| | | dataPackage.loadOneDataFromDB(); |
| | | Entity master = dataPackage.getMasterEntity(); |
| | | |
| | | int index = 0; |
| | | |
| | | String baiduFileUrl = master.getString("baidu_file_url"); |
| | | ICallCenter icallCenter = ICallCenter.getInstance(); |
| | | ICallBucket callBucket = ICallBucket.getInstance(); |
| | | ICall iCall = callBucket.getOne("document-parser-quary"); |
| | | |
| | | while (Util.isEmpty(baiduFileUrl) && index < 3) { |
| | | step.setDataPackage(dataPackage); |
| | | icallCenter.callRemote(step, iCall); |
| | | |
| | | dataPackage.loadOneDataFromDB(true); |
| | | master = dataPackage.getMasterEntity(); |
| | | index ++; |
| | | |
| | | Thread.sleep(2000); |
| | | baiduFileUrl = master.getString("baidu_file_url"); |
| | | } |
| | | |
| | | String jsonContent = fetchJsonWithHttpClient(baiduFileUrl); |
| | | AIResult result = new AIResult(jsonContent); |
| | | dataWriter.addValue("content", result); |
| | | dataWriter.addValue("data", master); |
| | | } |
| | | |
| | | public static String fetchJsonWithHttpClient(String url) { |
| | | try (CloseableHttpClient httpClient = HttpClients.createDefault()) { |
| | | HttpGet request = new HttpGet(url); |
| | | |
| | | // 设置请求头 |
| | | request.setHeader("Accept", "application/json; charset=UTF-8"); |
| | | request.setHeader("Accept-Charset", "UTF-8"); |
| | | request.setHeader("User-Agent", "Mozilla/5.0"); |
| | | |
| | | try (CloseableHttpResponse response = httpClient.execute(request)) { |
| | | int statusCode = response.getCode(); |
| | | |
| | | if (statusCode == 200) { |
| | | return EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); |
| | | } else { |
| | | System.err.println("请求失败,状态码: " + statusCode); |
| | | return null; |
| | | } |
| | | } |
| | | |
| | | System.out.println("拆分完成,共生成" + splitDocuments.size() + "个文件"); |
| | | |
| | | } catch (IOException e) { |
| | | } catch (Exception e) { |
| | | e.printStackTrace(); |
| | | } |
| | | return fileList; |
| | | } |
| | | |
| | | private void getPDFText(Client client, File oneSubFile) throws FileNotFoundException { |
| | | com.aliyun.ocr20191230.models.RecognizePdfAdvanceRequest recognizePdfRequest = new com.aliyun.ocr20191230.models.RecognizePdfAdvanceRequest(); |
| | | InputStream inputStream = new FileInputStream(oneSubFile); |
| | | recognizePdfRequest.setFileURLObject(inputStream); |
| | | com.aliyun.teautil.models.RuntimeOptions runtime = new com.aliyun.teautil.models.RuntimeOptions(); |
| | | try { |
| | | com.aliyun.ocr20191230.models.RecognizePdfResponse resp = client.recognizePdfAdvance(recognizePdfRequest, runtime); |
| | | RecognizePdfResponseBody body = resp.getBody(); |
| | | RecognizePdfResponseBody.RecognizePdfResponseBodyData data = body.getData(); |
| | | List<RecognizePdfResponseBody.RecognizePdfResponseBodyDataWordsInfo> wordsInfo = data.getWordsInfo(); |
| | | for (RecognizePdfResponseBody.RecognizePdfResponseBodyDataWordsInfo recognizePdfResponseBodyDataWordsInfo : wordsInfo) { |
| | | String word = recognizePdfResponseBodyDataWordsInfo.word; |
| | | logger.info("文字:{}", word); |
| | | } |
| | | |
| | | |
| | | com.aliyun.teaconsole.Client.log(com.aliyun.teautil.Common.toJSONString(resp)); |
| | | } catch (TeaException error) { |
| | | error.printStackTrace(); |
| | | logger.info(error.getMessage()); |
| | | // 此处仅做打印展示,请谨慎对待异常处理,在工程项目中切勿直接忽略异常。 |
| | | // 错误 message |
| | | System.out.println(error.getMessage()); |
| | | // 诊断地址 |
| | | System.out.println(error.getData().get("Recommend")); |
| | | com.aliyun.teautil.Common.assertAsString(error.message); |
| | | } catch (Exception _error) { |
| | | TeaException error = new TeaException(_error.getMessage(), _error); |
| | | // 此处仅做打印展示,请谨慎对待异常处理,在工程项目中切勿直接忽略异常。 |
| | | // 错误 message |
| | | System.out.println(error.getMessage()); |
| | | // 诊断地址 |
| | | System.out.println(error.getData().get("Recommend")); |
| | | com.aliyun.teautil.Common.assertAsString(error.message); |
| | | return null; |
| | | } |
| | | } |
| | | |
| | | |
| | | |
| | | public static com.aliyun.ocr20191230.Client createClient() throws Exception { |
| | | // 工程代码建议使用更安全的无AK方式,凭据配置方式请参见:https://help.aliyun.com/document_detail/378657.html。 |
| | | com.aliyun.teaopenapi.models.Config config = new com.aliyun.teaopenapi.models.Config() |
| | | .setAccessKeyId("LTAI5tCSkZYYhkUCsk4v4CCu") |
| | | .setAccessKeySecret("vhJBGvKQKmKFIpUq6WQndYYMwwRaP7"); |
| | | // Endpoint 请参考 https://api.aliyun.com/product/ocr |
| | | config.endpoint = "ocr.cn-shanghai.aliyuncs.com"; |
| | | return new com.aliyun.ocr20191230.Client(config); |
| | | } |
| | | |
| | | } |