Demo/demo_pdf_ai.git

			@@ -1,20 +1,19 @@
			package ai;

			import java.io.File;
			import java.io.FileInputStream;
			import java.io.FileNotFoundException;
			import java.io.IOException;
			import java.io.InputStream;
			import java.util.ArrayList;
			import java.util.List;
			import java.nio.charset.StandardCharsets;

			import org.apache.pdfbox.multipdf.Splitter;
			import org.apache.pdfbox.pdmodel.PDDocument;
			import org.apache.hc.client5.http.classic.methods.HttpGet;
			import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
			import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
			import org.apache.hc.client5.http.impl.classic.HttpClients;
			import org.apache.hc.core5.http.io.entity.EntityUtils;

			import com.aliyun.ocr20191230.Client;
			import com.aliyun.ocr20191230.models.RecognizePdfResponseBody;
			import com.aliyun.tea.TeaException;

			import foundation.dao.DataPackage;
			import foundation.data.entity.Entity;
			import foundation.icall.ICall;
			import foundation.icall.ICallBucket;
			import foundation.icall.ICallCenter;
			import foundation.util.Util;
			import foundation.workflow.ActionProvider;

			public class AiHandler extends ActionProvider {
			@@ -22,110 +21,74 @@
			/**
			 * Passes the name of each handler method in this class ("pdf" and
			 * "writerResult") to {@code addMethod}.
			 */
			@Override
			protected void publishMethod() {
				for (String methodName : new String[] {"pdf", "writerResult"}) {
					addMethod(methodName);
				}
			}


			/**
			 * Runs OCR over the PDF named by the request's "filePath" value and then
			 * over the "file_url" of the master entity.
			 *
			 * <p>A local PDF with more than 5 pages is first split into smaller files
			 * via {@code splitFiles}; each resulting file is sent to
			 * {@code getPDFText}. Afterwards the master entity is loaded and its
			 * "file_url" is passed to {@code AliyunOcrApiDirect.recognizePdf}.
			 *
			 * @throws Exception if the client cannot be created, the PDF cannot be
			 *                   loaded, or any downstream call fails
			 */
			public void pdf() throws Exception {
				Client client = createClient();
				String filePath = dataReader.getString("filePath");
				File file = new File(filePath);

				List<File> fileList = new ArrayList<>();
				// try-with-resources guarantees the document is closed even when
				// page counting or splitting throws.
				try (PDDocument document = PDDocument.load(file)) {
					int numberOfPages = document.getNumberOfPages();
					if (numberOfPages > 5) {
						fileList.addAll(splitFiles(document));
					} else {
						fileList.add(file);
					}
				}

				for (int i = 0; i < fileList.size(); i++) {
					File oneSubFile = fileList.get(i);
					logger.info("总共：{} 开始读取第{}个文件名：{} ", fileList.size(), i+1, oneSubFile.getName());
					getPDFText(client, oneSubFile);
				}

				AliyunOcrApiDirect ocr = new AliyunOcrApiDirect();

				DataPackage dataPackage = dataReader.getDataPackage();
				dataPackage.loadOneDataFromDB();
				Entity master = dataPackage.getMasterEntity();
				String fileUrl = master.getString("file_url");
				// Recognize the PDF referenced by the master entity.
				String result = ocr.recognizePdf(fileUrl, 1);
				System.out.println("PDF识别结果: " + result);
			}

			private List<File> splitFiles(PDDocument document) {
			List<File> fileList = new ArrayList<>();
			//1 创建拆分器并设置每5页拆分一次
			Splitter splitter = new Splitter();
			splitter.setSplitAtPage(5); // 关键参数设置
			// 3. 执行拆分操作
			try {
			List<PDDocument> splitDocuments = splitter.split(document);

			// 4. 保存拆分后的文件
			String outputDir = "output/"; // 输出目录
			new File(outputDir).mkdirs(); // 创建目录

			for (int i = 0; i < splitDocuments.size(); i++) {
			String outputPath = outputDir + "split_" + (i + 1) + ".pdf";
			splitDocuments.get(i).save(outputPath);
			splitDocuments.get(i).close();
			File file = new File(outputPath);
			fileList.add(file);
			System.out.println("生成文件: " + outputPath);

			/**
			 * Loads the master entity, waits for its "baidu_file_url" to be filled in,
			 * downloads the JSON at that URL and writes the parsed result.
			 *
			 * <p>While the URL is empty, the "document-parser-quary" remote call is
			 * invoked and the data reloaded, at most 3 times with a 2-second pause
			 * after each reload. The response writes "content" (an {@code AIResult}
			 * built from the downloaded JSON) and "data" (the master entity).
			 *
			 * @throws Exception if loading data, the remote call, or the wait fails
			 */
			public void writerResult() throws Exception {
				DataPackage pkg = dataReader.getDataPackage();
				pkg.loadOneDataFromDB();
				Entity master = pkg.getMasterEntity();

				String url = master.getString("baidu_file_url");
				ICallCenter center = ICallCenter.getInstance();
				ICallBucket bucket = ICallBucket.getInstance();
				ICall queryCall = bucket.getOne("document-parser-quary");

				for (int attempt = 0; Util.isEmpty(url) && attempt < 3; attempt++) {
					step.setDataPackage(pkg);
					center.callRemote(step, queryCall);

					pkg.loadOneDataFromDB(true);
					master = pkg.getMasterEntity();

					Thread.sleep(2000);
					url = master.getString("baidu_file_url");
				}

				AIResult result = new AIResult(fetchJsonWithHttpClient(url));
				dataWriter.addValue("content", result);
				dataWriter.addValue("data", master);
			}

			/**
			 * Performs an HTTP GET on {@code url} and returns the response body decoded
			 * as UTF-8.
			 *
			 * @param url the address to fetch; a null or empty value yields {@code null}
			 * @return the response body when the status code is 200; {@code null} when
			 *         the status is anything else or the request fails
			 */
			public static String fetchJsonWithHttpClient(String url) {
				if (url == null || url.isEmpty()) {
					// Nothing to request; report it the same way as a failed request.
					System.err.println("请求失败，状态码: " + "url is empty");
					return null;
				}

				try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
					HttpGet request = new HttpGet(url);

					// Request headers.
					request.setHeader("Accept", "application/json; charset=UTF-8");
					request.setHeader("Accept-Charset", "UTF-8");
					request.setHeader("User-Agent", "Mozilla/5.0");

					try (CloseableHttpResponse response = httpClient.execute(request)) {
						int statusCode = response.getCode();

						if (statusCode == 200) {
							return EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
						}
						System.err.println("请求失败，状态码: " + statusCode);
						return null;
					}
				} catch (Exception e) {
					// Covers I/O and parse failures; callers receive null.
					e.printStackTrace();
					return null;
				}
			}

			/**
			 * Sends one PDF file to the OCR client's {@code recognizePdfAdvance} call
			 * and logs every recognized word.
			 *
			 * <p>Failures of the OCR call are reported (printed and logged) and not
			 * propagated, except for whatever {@code Common.assertAsString} throws. The
			 * input stream is always closed before returning.
			 *
			 * @param client     the OCR client to call
			 * @param oneSubFile the PDF file to upload
			 * @throws FileNotFoundException if {@code oneSubFile} cannot be opened
			 */
			private void getPDFText(Client client, File oneSubFile) throws FileNotFoundException {
				com.aliyun.ocr20191230.models.RecognizePdfAdvanceRequest recognizePdfRequest = new com.aliyun.ocr20191230.models.RecognizePdfAdvanceRequest();
				com.aliyun.teautil.models.RuntimeOptions runtime = new com.aliyun.teautil.models.RuntimeOptions();
				InputStream inputStream = new FileInputStream(oneSubFile);
				try {
					recognizePdfRequest.setFileURLObject(inputStream);
					com.aliyun.ocr20191230.models.RecognizePdfResponse resp = client.recognizePdfAdvance(recognizePdfRequest, runtime);
					RecognizePdfResponseBody body = resp.getBody();
					RecognizePdfResponseBody.RecognizePdfResponseBodyData data = body.getData();
					List<RecognizePdfResponseBody.RecognizePdfResponseBodyDataWordsInfo> wordsInfo = data.getWordsInfo();
					for (RecognizePdfResponseBody.RecognizePdfResponseBodyDataWordsInfo recognizePdfResponseBodyDataWordsInfo : wordsInfo) {
						String word = recognizePdfResponseBodyDataWordsInfo.word;
						logger.info("文字：{}", word);
					}

					com.aliyun.teaconsole.Client.log(com.aliyun.teautil.Common.toJSONString(resp));
				} catch (TeaException error) {
					error.printStackTrace();
					logger.info(error.getMessage());
					// Error message.
					System.out.println(error.getMessage());
					// Diagnostic address, when the service supplied one.
					if (error.getData() != null) {
						System.out.println(error.getData().get("Recommend"));
					}
					com.aliyun.teautil.Common.assertAsString(error.message);
				} catch (Exception _error) {
					TeaException error = new TeaException(_error.getMessage(), _error);
					// Error message.
					System.out.println(error.getMessage());
					// A TeaException wrapped locally may carry no data map; guard
					// against dereferencing null.
					if (error.getData() != null) {
						System.out.println(error.getData().get("Recommend"));
					}
					com.aliyun.teautil.Common.assertAsString(error.message);
				} finally {
					// The stream was opened here, so it is released here regardless of
					// how the OCR call ended.
					try {
						inputStream.close();
					} catch (IOException closeError) {
						logger.info(closeError.getMessage());
					}
				}
			}



			/**
			 * Builds an OCR client for the {@code ocr.cn-shanghai.aliyuncs.com}
			 * endpoint.
			 *
			 * <p>Credentials are read from the environment variables
			 * {@code ALIBABA_CLOUD_ACCESS_KEY_ID} and
			 * {@code ALIBABA_CLOUD_ACCESS_KEY_SECRET} rather than being embedded in
			 * source. NOTE(review): the key pair that used to be hard-coded here has
			 * been exposed in version control and should be rotated.
			 *
			 * @return a client configured with the credentials and endpoint
			 * @throws IllegalStateException if either environment variable is missing
			 * @throws Exception             if the client cannot be constructed
			 */
			public static com.aliyun.ocr20191230.Client createClient() throws Exception {
				String accessKeyId = System.getenv("ALIBABA_CLOUD_ACCESS_KEY_ID");
				String accessKeySecret = System.getenv("ALIBABA_CLOUD_ACCESS_KEY_SECRET");
				if (accessKeyId == null || accessKeyId.isEmpty()
						|| accessKeySecret == null || accessKeySecret.isEmpty()) {
					throw new IllegalStateException(
							"ALIBABA_CLOUD_ACCESS_KEY_ID and ALIBABA_CLOUD_ACCESS_KEY_SECRET must be set");
				}

				com.aliyun.teaopenapi.models.Config config = new com.aliyun.teaopenapi.models.Config()
						.setAccessKeyId(accessKeyId)
						.setAccessKeySecret(accessKeySecret);
				// For endpoints see https://api.aliyun.com/product/ocr
				config.endpoint = "ocr.cn-shanghai.aliyuncs.com";
				return new com.aliyun.ocr20191230.Client(config);
			}

			}