From 56638c01bb2cc61a92f5e03c9a1001be5b5d3699 Mon Sep 17 00:00:00 2001
From: P15GEN2\59518 <lilith@highdatas.com>
Date: 星期六, 18 十月 2025 12:39:31 +0800
Subject: [PATCH] dev 数据清洗

---
 ai/src/ai/AiHandler.java |  185 ++++++++++++++++++---------------------------
 1 files changed, 74 insertions(+), 111 deletions(-)

diff --git a/ai/src/ai/AiHandler.java b/ai/src/ai/AiHandler.java
index 03abde8..ca1805d 100644
--- a/ai/src/ai/AiHandler.java
+++ b/ai/src/ai/AiHandler.java
@@ -1,20 +1,19 @@
 package ai;
 
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
+import java.nio.charset.StandardCharsets;
 
-import org.apache.pdfbox.multipdf.Splitter;
-import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
+import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
+import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
+import org.apache.hc.client5.http.impl.classic.HttpClients;
+import org.apache.hc.core5.http.io.entity.EntityUtils;
 
-import com.aliyun.ocr20191230.Client;
-import com.aliyun.ocr20191230.models.RecognizePdfResponseBody;
-import com.aliyun.tea.TeaException;
-
+import foundation.dao.DataPackage;
+import foundation.data.entity.Entity;
+import foundation.icall.ICall;
+import foundation.icall.ICallBucket;
+import foundation.icall.ICallCenter;
+import foundation.util.Util;
 import foundation.workflow.ActionProvider;
 
 public class AiHandler extends ActionProvider {
@@ -22,110 +21,74 @@
 	@Override
 	protected void publishMethod() {
 		addMethod("pdf");
+		// Publish the new writerResult() handler alongside pdf() (lookup by name is presumably done in ActionProvider - confirm).
+		addMethod("writerResult");
 	}
-	
 	
 	public void pdf() throws Exception {
-		Client client = createClient();
-        String filePath = dataReader.getString("filePath");
-        File file = new File(filePath);
-        PDDocument document = PDDocument.load(file);
-        int numberOfPages = document.getNumberOfPages();
-
-
-        List<File> fileList = new ArrayList<>();
-        if (numberOfPages > 5) {
-            fileList.addAll(splitFiles(document));
-        } else {
-            fileList.add(file);
-        }
-        document.close();
-
-        for (int i = 0; i < fileList.size(); i++) {
-            File oneSubFile = fileList.get(i);
-            logger.info("鎬诲叡锛歿} 寮�濮嬭鍙栫{}涓� 鏂囦欢鍚嶏細{} ", fileList.size(), i+1, oneSubFile.getName());
-            getPDFText(client, oneSubFile);
-        }
-
+		// Runs AliyunOcrApiDirect.recognizePdf on the master record's "file_url" and logs the returned string.
+		// NOTE(review): the result is only logged, never handed to dataWriter - confirm that is intended.
+		AliyunOcrApiDirect ocr = new AliyunOcrApiDirect();
+		DataPackage dataPackage = dataReader.getDataPackage();
+		dataPackage.loadOneDataFromDB();
+		String fileUrl = dataPackage.getMasterEntity().getString("file_url");
+		// The literal 1 is forwarded as recognizePdf's second argument; its meaning is not visible here.
+		String result = ocr.recognizePdf(fileUrl, 1);
+		logger.info("PDF璇嗗埆缁撴灉: " + result);
 	}
-
-	private List<File> splitFiles(PDDocument document) {
-        List<File> fileList = new ArrayList<>();
-        //1 鍒涘缓鎷嗗垎鍣ㄥ苟璁剧疆姣�5椤垫媶鍒嗕竴娆�
-        Splitter splitter = new Splitter();
-        splitter.setSplitAtPage(5); // 鍏抽敭鍙傛暟璁剧疆
-        // 3. 鎵ц鎷嗗垎鎿嶄綔
-        try {
-            List<PDDocument> splitDocuments = splitter.split(document);
-
-            // 4. 淇濆瓨鎷嗗垎鍚庣殑鏂囦欢
-            String outputDir = "output/"; // 杈撳嚭鐩綍
-            new File(outputDir).mkdirs(); // 鍒涘缓鐩綍
-
-            for (int i = 0; i < splitDocuments.size(); i++) {
-                String outputPath = outputDir + "split_" + (i + 1) + ".pdf";
-                splitDocuments.get(i).save(outputPath);
-                splitDocuments.get(i).close();
-                File file = new File(outputPath);
-                fileList.add(file);
-                System.out.println("鐢熸垚鏂囦欢: " + outputPath);
+	 
+	/**
+	 * Polls the "document-parser-quary" remote call (at most 3 times, 2 s apart) until the master
+	 * record carries a "baidu_file_url", then fetches that URL and writes the response, wrapped in
+	 * an AIResult, as "content" and the master record as "data".
+	 */
+	public void writerResult() throws Exception {
+		DataPackage dataPackage = dataReader.getDataPackage();
+		dataPackage.loadOneDataFromDB();
+		Entity master = dataPackage.getMasterEntity();
+		String baiduFileUrl = master.getString("baidu_file_url");
+
+		ICallCenter icallCenter = ICallCenter.getInstance();
+		ICall iCall = ICallBucket.getInstance().getOne("document-parser-quary");
+		for (int attempt = 1; Util.isEmpty(baiduFileUrl) && attempt <= 3; attempt++) {
+			if (attempt > 1) {
+				Thread.sleep(2000); // pause only between polls, not after a hit or the last try
+			}
+			step.setDataPackage(dataPackage);
+			icallCenter.callRemote(step, iCall);
+			dataPackage.loadOneDataFromDB(true);
+			master = dataPackage.getMasterEntity();
+			baiduFileUrl = master.getString("baidu_file_url");
+		}
+
+		// No URL after all polls: skip the GET (it would only throw and yield null) and pass null on.
+		String jsonContent = Util.isEmpty(baiduFileUrl) ? null : fetchJsonWithHttpClient(baiduFileUrl);
+		dataWriter.addValue("content", new AIResult(jsonContent));
+		dataWriter.addValue("data", master);
+	}
+	
+	/** GETs url and returns the body as a UTF-8 string, or null on a non-200 status or any exception. */
+	public static String fetchJsonWithHttpClient(String url) {
+        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
+            HttpGet request = new HttpGet(url);
+            // Request headers: ask for UTF-8 JSON and send a browser-like User-Agent.
+            request.setHeader("Accept", "application/json; charset=UTF-8");
+            request.setHeader("Accept-Charset", "UTF-8");
+            request.setHeader("User-Agent", "Mozilla/5.0");
+            // Both the client and the response are closed by try-with-resources.
+            try (CloseableHttpResponse response = httpClient.execute(request)) {
+                int statusCode = response.getCode();
+                // Only an exact 200 counts as success; any other status is printed to stderr.
+                if (statusCode == 200) {
+                    return EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
+                } else {
+                    System.err.println("璇锋眰澶辫触锛岀姸鎬佺爜: " + statusCode);
+                    return null;
+                }
             }
-
-            System.out.println("鎷嗗垎瀹屾垚锛屽叡鐢熸垚" + splitDocuments.size() + "涓枃浠�");
-
-        } catch (IOException e) {
+        } catch (Exception e) {
             e.printStackTrace();
-        }
-        return fileList;
-    }
-
-    private void getPDFText(Client client, File oneSubFile) throws FileNotFoundException {
-        com.aliyun.ocr20191230.models.RecognizePdfAdvanceRequest recognizePdfRequest = new com.aliyun.ocr20191230.models.RecognizePdfAdvanceRequest();
-        InputStream inputStream = new FileInputStream(oneSubFile);
-        recognizePdfRequest.setFileURLObject(inputStream);
-        com.aliyun.teautil.models.RuntimeOptions runtime = new com.aliyun.teautil.models.RuntimeOptions();
-        try {
-            com.aliyun.ocr20191230.models.RecognizePdfResponse resp = client.recognizePdfAdvance(recognizePdfRequest, runtime);
-            RecognizePdfResponseBody body = resp.getBody();
-            RecognizePdfResponseBody.RecognizePdfResponseBodyData data = body.getData();
-            List<RecognizePdfResponseBody.RecognizePdfResponseBodyDataWordsInfo> wordsInfo = data.getWordsInfo();
-            for (RecognizePdfResponseBody.RecognizePdfResponseBodyDataWordsInfo recognizePdfResponseBodyDataWordsInfo : wordsInfo) {
-                String word = recognizePdfResponseBodyDataWordsInfo.word;
-                logger.info("鏂囧瓧锛歿}", word);
-            }
-
-
-            com.aliyun.teaconsole.Client.log(com.aliyun.teautil.Common.toJSONString(resp));
-        } catch (TeaException error) {
-            error.printStackTrace();
-            logger.info(error.getMessage());
-            // 姝ゅ浠呭仛鎵撳嵃灞曠ず锛岃璋ㄦ厧瀵瑰緟寮傚父澶勭悊锛屽湪宸ョ▼椤圭洰涓垏鍕跨洿鎺ュ拷鐣ュ紓甯搞��
-            // 閿欒 message
-            System.out.println(error.getMessage());
-            // 璇婃柇鍦板潃
-            System.out.println(error.getData().get("Recommend"));
-            com.aliyun.teautil.Common.assertAsString(error.message);
-        } catch (Exception _error) {
-            TeaException error = new TeaException(_error.getMessage(), _error);
-            // 姝ゅ浠呭仛鎵撳嵃灞曠ず锛岃璋ㄦ厧瀵瑰緟寮傚父澶勭悊锛屽湪宸ョ▼椤圭洰涓垏鍕跨洿鎺ュ拷鐣ュ紓甯搞��
-            // 閿欒 message
-            System.out.println(error.getMessage());
-            // 璇婃柇鍦板潃
-            System.out.println(error.getData().get("Recommend"));
-            com.aliyun.teautil.Common.assertAsString(error.message);
+            return null;
         }
     }
-	
-	
-	
-	  public static com.aliyun.ocr20191230.Client createClient() throws Exception {
-	        // 宸ョ▼浠g爜寤鸿浣跨敤鏇村畨鍏ㄧ殑鏃燗K鏂瑰紡锛屽嚟鎹厤缃柟寮忚鍙傝锛歨ttps://help.aliyun.com/document_detail/378657.html銆�
-	        com.aliyun.teaopenapi.models.Config config = new com.aliyun.teaopenapi.models.Config()
-	                .setAccessKeyId("LTAI5tCSkZYYhkUCsk4v4CCu")
-	                .setAccessKeySecret("vhJBGvKQKmKFIpUq6WQndYYMwwRaP7");
-	        // Endpoint 璇峰弬鑰� https://api.aliyun.com/product/ocr
-	        config.endpoint = "ocr.cn-shanghai.aliyuncs.com";
-	        return new com.aliyun.ocr20191230.Client(config);
-	    }
-	
 }

--
Gitblit v1.8.0