package ai;
|
|
import java.io.File;
|
import java.io.FileInputStream;
|
import java.io.FileNotFoundException;
|
import java.io.IOException;
|
import java.io.InputStream;
|
import java.util.ArrayList;
|
import java.util.List;
|
|
import org.apache.pdfbox.multipdf.Splitter;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
import com.aliyun.ocr20191230.Client;
|
import com.aliyun.ocr20191230.models.RecognizePdfResponseBody;
|
import com.aliyun.tea.TeaException;
|
|
import foundation.workflow.ActionProvider;
|
|
public class AiHandler extends ActionProvider {
|
|
@Override
|
protected void publishMethod() {
|
addMethod("pdf");
|
}
|
|
|
public void pdf() throws Exception {
|
Client client = createClient();
|
String filePath = dataReader.getString("filePath");
|
File file = new File(filePath);
|
PDDocument document = PDDocument.load(file);
|
int numberOfPages = document.getNumberOfPages();
|
|
|
List<File> fileList = new ArrayList<>();
|
if (numberOfPages > 5) {
|
fileList.addAll(splitFiles(document));
|
} else {
|
fileList.add(file);
|
}
|
document.close();
|
|
for (int i = 0; i < fileList.size(); i++) {
|
File oneSubFile = fileList.get(i);
|
logger.info("总共:{} 开始读取第{}个 文件名:{} ", fileList.size(), i+1, oneSubFile.getName());
|
getPDFText(client, oneSubFile);
|
}
|
|
}
|
|
private List<File> splitFiles(PDDocument document) {
|
List<File> fileList = new ArrayList<>();
|
//1 创建拆分器并设置每5页拆分一次
|
Splitter splitter = new Splitter();
|
splitter.setSplitAtPage(5); // 关键参数设置
|
// 3. 执行拆分操作
|
try {
|
List<PDDocument> splitDocuments = splitter.split(document);
|
|
// 4. 保存拆分后的文件
|
String outputDir = "output/"; // 输出目录
|
new File(outputDir).mkdirs(); // 创建目录
|
|
for (int i = 0; i < splitDocuments.size(); i++) {
|
String outputPath = outputDir + "split_" + (i + 1) + ".pdf";
|
splitDocuments.get(i).save(outputPath);
|
splitDocuments.get(i).close();
|
File file = new File(outputPath);
|
fileList.add(file);
|
System.out.println("生成文件: " + outputPath);
|
}
|
|
System.out.println("拆分完成,共生成" + splitDocuments.size() + "个文件");
|
|
} catch (IOException e) {
|
e.printStackTrace();
|
}
|
return fileList;
|
}
|
|
private void getPDFText(Client client, File oneSubFile) throws FileNotFoundException {
|
com.aliyun.ocr20191230.models.RecognizePdfAdvanceRequest recognizePdfRequest = new com.aliyun.ocr20191230.models.RecognizePdfAdvanceRequest();
|
InputStream inputStream = new FileInputStream(oneSubFile);
|
recognizePdfRequest.setFileURLObject(inputStream);
|
com.aliyun.teautil.models.RuntimeOptions runtime = new com.aliyun.teautil.models.RuntimeOptions();
|
try {
|
com.aliyun.ocr20191230.models.RecognizePdfResponse resp = client.recognizePdfAdvance(recognizePdfRequest, runtime);
|
RecognizePdfResponseBody body = resp.getBody();
|
RecognizePdfResponseBody.RecognizePdfResponseBodyData data = body.getData();
|
List<RecognizePdfResponseBody.RecognizePdfResponseBodyDataWordsInfo> wordsInfo = data.getWordsInfo();
|
for (RecognizePdfResponseBody.RecognizePdfResponseBodyDataWordsInfo recognizePdfResponseBodyDataWordsInfo : wordsInfo) {
|
String word = recognizePdfResponseBodyDataWordsInfo.word;
|
logger.info("文字:{}", word);
|
}
|
|
|
com.aliyun.teaconsole.Client.log(com.aliyun.teautil.Common.toJSONString(resp));
|
} catch (TeaException error) {
|
error.printStackTrace();
|
logger.info(error.getMessage());
|
// 此处仅做打印展示,请谨慎对待异常处理,在工程项目中切勿直接忽略异常。
|
// 错误 message
|
System.out.println(error.getMessage());
|
// 诊断地址
|
System.out.println(error.getData().get("Recommend"));
|
com.aliyun.teautil.Common.assertAsString(error.message);
|
} catch (Exception _error) {
|
TeaException error = new TeaException(_error.getMessage(), _error);
|
// 此处仅做打印展示,请谨慎对待异常处理,在工程项目中切勿直接忽略异常。
|
// 错误 message
|
System.out.println(error.getMessage());
|
// 诊断地址
|
System.out.println(error.getData().get("Recommend"));
|
com.aliyun.teautil.Common.assertAsString(error.message);
|
}
|
}
|
|
|
|
public static com.aliyun.ocr20191230.Client createClient() throws Exception {
|
// 工程代码建议使用更安全的无AK方式,凭据配置方式请参见:https://help.aliyun.com/document_detail/378657.html。
|
com.aliyun.teaopenapi.models.Config config = new com.aliyun.teaopenapi.models.Config()
|
.setAccessKeyId("LTAI5tCSkZYYhkUCsk4v4CCu")
|
.setAccessKeySecret("vhJBGvKQKmKFIpUq6WQndYYMwwRaP7");
|
// Endpoint 请参考 https://api.aliyun.com/product/ocr
|
config.endpoint = "ocr.cn-shanghai.aliyuncs.com";
|
return new com.aliyun.ocr20191230.Client(config);
|
}
|
|
}
|