From 56638c01bb2cc61a92f5e03c9a1001be5b5d3699 Mon Sep 17 00:00:00 2001
From: P15GEN2\59518 <lilith@highdatas.com>
Date: 星期六, 18 十月 2025 12:39:31 +0800
Subject: [PATCH] dev 数据清洗
---
ai/src/ai/AiHandler.java | 185 ++++++++++++++++++---------------------------
1 files changed, 74 insertions(+), 111 deletions(-)
diff --git a/ai/src/ai/AiHandler.java b/ai/src/ai/AiHandler.java
index 03abde8..ca1805d 100644
--- a/ai/src/ai/AiHandler.java
+++ b/ai/src/ai/AiHandler.java
@@ -1,20 +1,19 @@
package ai;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
+import java.nio.charset.StandardCharsets;
-import org.apache.pdfbox.multipdf.Splitter;
-import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
+import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
+import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
+import org.apache.hc.client5.http.impl.classic.HttpClients;
+import org.apache.hc.core5.http.io.entity.EntityUtils;
-import com.aliyun.ocr20191230.Client;
-import com.aliyun.ocr20191230.models.RecognizePdfResponseBody;
-import com.aliyun.tea.TeaException;
-
+import foundation.dao.DataPackage;
+import foundation.data.entity.Entity;
+import foundation.icall.ICall;
+import foundation.icall.ICallBucket;
+import foundation.icall.ICallCenter;
+import foundation.util.Util;
import foundation.workflow.ActionProvider;
public class AiHandler extends ActionProvider {
@@ -22,110 +21,74 @@
@Override
protected void publishMethod() {
addMethod("pdf");
+
+ addMethod("writerResult");
}
-
public void pdf() throws Exception {
- Client client = createClient();
- String filePath = dataReader.getString("filePath");
- File file = new File(filePath);
- PDDocument document = PDDocument.load(file);
- int numberOfPages = document.getNumberOfPages();
-
-
- List<File> fileList = new ArrayList<>();
- if (numberOfPages > 5) {
- fileList.addAll(splitFiles(document));
- } else {
- fileList.add(file);
- }
- document.close();
-
- for (int i = 0; i < fileList.size(); i++) {
- File oneSubFile = fileList.get(i);
- logger.info("鎬诲叡锛歿} 寮�濮嬭鍙栫{}涓� 鏂囦欢鍚嶏細{} ", fileList.size(), i+1, oneSubFile.getName());
- getPDFText(client, oneSubFile);
- }
-
+ AliyunOcrApiDirect ocr = new AliyunOcrApiDirect();
+
+ DataPackage dataPackage = dataReader.getDataPackage();
+ dataPackage.loadOneDataFromDB();
+ Entity master = dataPackage.getMasterEntity();
+ String fileUrl = master.getString("file_url");
+ // PDF识别
+ String result = ocr.recognizePdf(fileUrl, 1);
+ System.out.println("PDF识别结果: " + result);
}
-
- private List<File> splitFiles(PDDocument document) {
- List<File> fileList = new ArrayList<>();
- //1 鍒涘缓鎷嗗垎鍣ㄥ苟璁剧疆姣�5椤垫媶鍒嗕竴娆�
- Splitter splitter = new Splitter();
- splitter.setSplitAtPage(5); // 鍏抽敭鍙傛暟璁剧疆
- // 3. 鎵ц鎷嗗垎鎿嶄綔
- try {
- List<PDDocument> splitDocuments = splitter.split(document);
-
- // 4. 淇濆瓨鎷嗗垎鍚庣殑鏂囦欢
- String outputDir = "output/"; // 杈撳嚭鐩綍
- new File(outputDir).mkdirs(); // 鍒涘缓鐩綍
-
- for (int i = 0; i < splitDocuments.size(); i++) {
- String outputPath = outputDir + "split_" + (i + 1) + ".pdf";
- splitDocuments.get(i).save(outputPath);
- splitDocuments.get(i).close();
- File file = new File(outputPath);
- fileList.add(file);
- System.out.println("鐢熸垚鏂囦欢: " + outputPath);
+
+ public void writerResult() throws Exception {
+ DataPackage dataPackage = dataReader.getDataPackage();
+ dataPackage.loadOneDataFromDB();
+ Entity master = dataPackage.getMasterEntity();
+
+ int index = 0;
+
+ String baiduFileUrl = master.getString("baidu_file_url");
+ ICallCenter icallCenter = ICallCenter.getInstance();
+ ICallBucket callBucket = ICallBucket.getInstance();
+ ICall iCall = callBucket.getOne("document-parser-quary");
+
+ while (Util.isEmpty(baiduFileUrl) && index < 3) {
+ step.setDataPackage(dataPackage);
+ icallCenter.callRemote(step, iCall);
+
+ dataPackage.loadOneDataFromDB(true);
+ master = dataPackage.getMasterEntity();
+ index ++;
+
+ Thread.sleep(2000);
+ baiduFileUrl = master.getString("baidu_file_url");
+ }
+
+ String jsonContent = fetchJsonWithHttpClient(baiduFileUrl);
+ AIResult result = new AIResult(jsonContent);
+ dataWriter.addValue("content", result);
+ dataWriter.addValue("data", master);
+ }
+
+ public static String fetchJsonWithHttpClient(String url) {
+ try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
+ HttpGet request = new HttpGet(url);
+
+ // 设置请求头
+ request.setHeader("Accept", "application/json; charset=UTF-8");
+ request.setHeader("Accept-Charset", "UTF-8");
+ request.setHeader("User-Agent", "Mozilla/5.0");
+
+ try (CloseableHttpResponse response = httpClient.execute(request)) {
+ int statusCode = response.getCode();
+
+ if (statusCode == 200) {
+ return EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
+ } else {
+ System.err.println("请求失败，状态码: " + statusCode);
+ return null;
+ }
}
-
- System.out.println("鎷嗗垎瀹屾垚锛屽叡鐢熸垚" + splitDocuments.size() + "涓枃浠�");
-
- } catch (IOException e) {
+ } catch (Exception e) {
e.printStackTrace();
- }
- return fileList;
- }
-
- private void getPDFText(Client client, File oneSubFile) throws FileNotFoundException {
- com.aliyun.ocr20191230.models.RecognizePdfAdvanceRequest recognizePdfRequest = new com.aliyun.ocr20191230.models.RecognizePdfAdvanceRequest();
- InputStream inputStream = new FileInputStream(oneSubFile);
- recognizePdfRequest.setFileURLObject(inputStream);
- com.aliyun.teautil.models.RuntimeOptions runtime = new com.aliyun.teautil.models.RuntimeOptions();
- try {
- com.aliyun.ocr20191230.models.RecognizePdfResponse resp = client.recognizePdfAdvance(recognizePdfRequest, runtime);
- RecognizePdfResponseBody body = resp.getBody();
- RecognizePdfResponseBody.RecognizePdfResponseBodyData data = body.getData();
- List<RecognizePdfResponseBody.RecognizePdfResponseBodyDataWordsInfo> wordsInfo = data.getWordsInfo();
- for (RecognizePdfResponseBody.RecognizePdfResponseBodyDataWordsInfo recognizePdfResponseBodyDataWordsInfo : wordsInfo) {
- String word = recognizePdfResponseBodyDataWordsInfo.word;
- logger.info("鏂囧瓧锛歿}", word);
- }
-
-
- com.aliyun.teaconsole.Client.log(com.aliyun.teautil.Common.toJSONString(resp));
- } catch (TeaException error) {
- error.printStackTrace();
- logger.info(error.getMessage());
- // 姝ゅ浠呭仛鎵撳嵃灞曠ず锛岃璋ㄦ厧瀵瑰緟寮傚父澶勭悊锛屽湪宸ョ▼椤圭洰涓垏鍕跨洿鎺ュ拷鐣ュ紓甯搞��
- // 閿欒 message
- System.out.println(error.getMessage());
- // 璇婃柇鍦板潃
- System.out.println(error.getData().get("Recommend"));
- com.aliyun.teautil.Common.assertAsString(error.message);
- } catch (Exception _error) {
- TeaException error = new TeaException(_error.getMessage(), _error);
- // 姝ゅ浠呭仛鎵撳嵃灞曠ず锛岃璋ㄦ厧瀵瑰緟寮傚父澶勭悊锛屽湪宸ョ▼椤圭洰涓垏鍕跨洿鎺ュ拷鐣ュ紓甯搞��
- // 閿欒 message
- System.out.println(error.getMessage());
- // 璇婃柇鍦板潃
- System.out.println(error.getData().get("Recommend"));
- com.aliyun.teautil.Common.assertAsString(error.message);
+ return null;
}
}
-
-
-
- public static com.aliyun.ocr20191230.Client createClient() throws Exception {
- // 宸ョ▼浠g爜寤鸿浣跨敤鏇村畨鍏ㄧ殑鏃燗K鏂瑰紡锛屽嚟鎹厤缃柟寮忚鍙傝锛歨ttps://help.aliyun.com/document_detail/378657.html銆�
- com.aliyun.teaopenapi.models.Config config = new com.aliyun.teaopenapi.models.Config()
- .setAccessKeyId("LTAI5tCSkZYYhkUCsk4v4CCu")
- .setAccessKeySecret("vhJBGvKQKmKFIpUq6WQndYYMwwRaP7");
- // Endpoint 璇峰弬鑰� https://api.aliyun.com/product/ocr
- config.endpoint = "ocr.cn-shanghai.aliyuncs.com";
- return new com.aliyun.ocr20191230.Client(config);
- }
-
}
--
Gitblit v1.8.0