From 03d08d0c9e898594bfe27b816e0eb03924fcccf8 Mon Sep 17 00:00:00 2001 From: David Edwards Date: Tue, 13 Apr 2021 09:59:42 +0100 Subject: [PATCH] MNT-22082 transformation of pdf to text hang (#367) A new constructor has been added to the TikaController to provide the new spring config. The creation of the TikaExecutor has been moved to "singleton pattern" as the injection of the @Value happens after the instantiation of the TikaJavaExecutor and does not pass the value correctly. The instantiation is now done once, on the first transform request. Param has been added to the AIO beans. --- .../alfresco/transformer/AIOCustomConfig.java | 5 +- .../main/resources/application-default.yaml | 3 + .../alfresco/transformer/TikaController.java | 17 ++- .../main/resources/application-default.yaml | 5 +- .../transformer/TikaControllerTest.java | 4 +- .../alfresco-transform-tika/pom.xml | 11 ++ .../executors/TikaJavaExecutor.java | 24 +++- .../executors/TikaJavaExecutorTest.java | 126 ++++++++++++++++++ .../transformer/util/RequestParamMap.java | 2 +- docs/external-engine-configuration.md | 2 + 10 files changed, 187 insertions(+), 12 deletions(-) create mode 100644 alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/executors/TikaJavaExecutorTest.java diff --git a/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/main/java/org/alfresco/transformer/AIOCustomConfig.java b/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/main/java/org/alfresco/transformer/AIOCustomConfig.java index 65431ac3..a15857d2 100644 --- a/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/main/java/org/alfresco/transformer/AIOCustomConfig.java +++ b/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/main/java/org/alfresco/transformer/AIOCustomConfig.java @@ -76,6 +76,9 @@ public class AIOCustomConfig @Value("${transform.core.imagemagick.config}") private String imageMagickConfigPath; + @Value("${transform.core.tika.pdfBox.notExtractBookmarksTextDefault:false}") + private boolean notExtractBookmarksTextDefault; + /** * * @return Override the TransformRegistryImpl used in {@link AbstractTransformerController} @@ -86,7 +89,7 @@ public class AIOCustomConfig { AIOTransformRegistry aioTransformRegistry = new AIOTransformRegistry(); aioTransformRegistry.registerTransformer(new SelectingTransformer()); - aioTransformRegistry.registerTransformer(new TikaJavaExecutor()); + aioTransformRegistry.registerTransformer(new TikaJavaExecutor(notExtractBookmarksTextDefault)); aioTransformRegistry.registerTransformer(new ImageMagickCommandExecutor(imageMagickExePath, imageMagickDynPath, imageMagickRootPath, imageMagickCodersPath, imageMagickConfigPath)); aioTransformRegistry.registerTransformer(new LibreOfficeJavaExecutor(libreofficePath, libreofficeMaxTasksPerProcess, libreofficeTimeout, libreofficePortNumbers, libreofficeTemplateProfileDir, libreofficeIsEnabled)); aioTransformRegistry.registerTransformer(new PdfRendererCommandExecutor(pdfRendererPath)); diff --git a/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/main/resources/application-default.yaml b/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/main/resources/application-default.yaml index fbb650a0..10f46c27 100644 --- a/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/main/resources/application-default.yaml +++ b/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/main/resources/application-default.yaml @@ -17,3 +17,6 @@ transform: exe: ${IMAGEMAGICK_EXE:/usr/bin/convert} coders: ${IMAGEMAGICK_CODERS:} config: ${IMAGEMAGICK_CONFIG:} + tika: + pdfBox: + notExtractBookmarksTextDefault: ${PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT:false} \ No newline at end of file diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/main/java/org/alfresco/transformer/TikaController.java b/alfresco-transform-tika/alfresco-transform-tika-boot/src/main/java/org/alfresco/transformer/TikaController.java index 82323940..d6d39b36 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/main/java/org/alfresco/transformer/TikaController.java +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/main/java/org/alfresco/transformer/TikaController.java @@ -30,6 +30,7 @@ import org.alfresco.transformer.executors.TikaJavaExecutor; import org.alfresco.transformer.probes.ProbeTestTransform; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Controller; import java.io.File; @@ -67,7 +68,10 @@ public class TikaController extends AbstractTransformerController { private static final Logger logger = LoggerFactory.getLogger(TikaController.class); - private TikaJavaExecutor javaExecutor = new TikaJavaExecutor(); + @Value("${transform.core.tika.pdfBox.notExtractBookmarksTextDefault:false}") + private boolean notExtractBookmarksTextDefault; + + private TikaJavaExecutor javaExecutor; @Override public String getTransformerName() @@ -102,6 +106,15 @@ public class TikaController extends AbstractTransformerController Map transformOptions, File sourceFile, File targetFile) { transformOptions.put(TRANSFORM_NAME_PARAMETER, transformName); - javaExecutor.transform(sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile); + getJavaExecutor().transform(sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile); + } + + private TikaJavaExecutor getJavaExecutor() + { + if(javaExecutor==null) + { + javaExecutor = new TikaJavaExecutor(notExtractBookmarksTextDefault); + } + return javaExecutor; } } diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/main/resources/application-default.yaml b/alfresco-transform-tika/alfresco-transform-tika-boot/src/main/resources/application-default.yaml index 0a4b2765..559c09c3 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/main/resources/application-default.yaml +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/main/resources/application-default.yaml @@ -3,4 +3,7 @@ queue: transform: core: config: - location: classpath:tika_engine_config.json \ No newline at end of file + location: classpath:tika_engine_config.json + tika: + pdfBox: + notExtractBookmarksTextDefault: ${PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT:false} \ No newline at end of file diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaControllerTest.java b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaControllerTest.java index bba86431..164b3376 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaControllerTest.java +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaControllerTest.java @@ -61,7 +61,7 @@ import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_XHTML; import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_XML; import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_ZIP; import static org.alfresco.transformer.util.RequestParamMap.INCLUDE_CONTENTS; -import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARK_TEXT; +import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT; import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -585,7 +585,7 @@ public class TikaControllerTest extends AbstractTransformerControllerTest mockTransformCommand(PDF, TXT, MIMETYPE_PDF, true); mockMvc.perform( mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension).param( - NOT_EXTRACT_BOOKMARK_TEXT, "true")) + NOT_EXTRACT_BOOKMARKS_TEXT, "true")) .andExpect(status().is(OK.value())) .andExpect(header().string("Content-Disposition", "attachment; filename*= UTF-8''quick." + targetExtension)); diff --git a/alfresco-transform-tika/alfresco-transform-tika/pom.xml b/alfresco-transform-tika/alfresco-transform-tika/pom.xml index e03c13bd..8cc5b1ae 100644 --- a/alfresco-transform-tika/alfresco-transform-tika/pom.xml +++ b/alfresco-transform-tika/alfresco-transform-tika/pom.xml @@ -104,6 +104,17 @@ org.apache.pdfbox pdfbox-tools + + + org.junit.jupiter + junit-jupiter + test + + + org.mockito + mockito-junit-jupiter + test + diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/executors/TikaJavaExecutor.java b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/executors/TikaJavaExecutor.java index ad517c4c..9817cc83 100644 --- a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/executors/TikaJavaExecutor.java +++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/executors/TikaJavaExecutor.java @@ -40,6 +40,7 @@ import org.alfresco.transformer.metadataExtractors.TikaAudioMetadataExtractor; import org.alfresco.transformer.metadataExtractors.TikaAutoMetadataExtractor; import org.alfresco.transformer.util.RequestParamMap; import org.apache.tika.exception.TikaException; +import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; import java.io.File; @@ -53,7 +54,7 @@ import static org.alfresco.transformer.executors.Tika.INCLUDE_CONTENTS; import static org.alfresco.transformer.executors.Tika.NOT_EXTRACT_BOOKMARKS_TEXT; import static org.alfresco.transformer.executors.Tika.TARGET_ENCODING; import static org.alfresco.transformer.executors.Tika.TARGET_MIMETYPE; -import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARK_TEXT; +import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT; /** * JavaExecutor implementation for running TIKA transformations. It loads the @@ -61,6 +62,8 @@ import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARK */ public class TikaJavaExecutor implements JavaExecutor { + private boolean notExtractBookmarksTextDefault; + private static final String ID = "tika"; public static final String LICENCE = "This transformer uses Tika from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0. or in /Apache\\ 2.0.txt"; @@ -83,8 +86,9 @@ public class TikaJavaExecutor implements JavaExecutor .put("SamplePoiMetadataEmbedder", new PoiMetadataExtractor()) .build(); - public TikaJavaExecutor() + public TikaJavaExecutor(boolean notExtractBookmarksTextDefault) { + this.notExtractBookmarksTextDefault = notExtractBookmarksTextDefault; try { tika = new Tika(); @@ -95,6 +99,11 @@ public class TikaJavaExecutor implements JavaExecutor } } + public TikaJavaExecutor() + { + this(false); + } + @Override public String getTransformerId() { @@ -109,12 +118,17 @@ public class TikaJavaExecutor implements JavaExecutor final boolean includeContents = parseBoolean( transformOptions.getOrDefault(RequestParamMap.INCLUDE_CONTENTS, "false")); final boolean notExtractBookmarksText = parseBoolean( - transformOptions.getOrDefault(NOT_EXTRACT_BOOKMARK_TEXT, "false")); + transformOptions.getOrDefault(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT, String.valueOf(notExtractBookmarksTextDefault))); final String targetEncoding = transformOptions.getOrDefault("targetEncoding", "UTF-8"); - + if(transformOptions.get(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT)==null && notExtractBookmarksTextDefault) + { + LoggerFactory.getLogger(TikaJavaExecutor.class).trace( + "notExtractBookmarksText default value has been overridden to {}", + notExtractBookmarksTextDefault); + } call(sourceFile, targetFile, transformName, includeContents ? INCLUDE_CONTENTS : null, - notExtractBookmarksText ? NOT_EXTRACT_BOOKMARKS_TEXT : null, + notExtractBookmarksText ? Tika.NOT_EXTRACT_BOOKMARKS_TEXT : null, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + targetEncoding); } diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/executors/TikaJavaExecutorTest.java b/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/executors/TikaJavaExecutorTest.java new file mode 100644 index 00000000..509a2be7 --- /dev/null +++ b/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/executors/TikaJavaExecutorTest.java @@ -0,0 +1,126 @@ +/* + * #%L + * Alfresco Transform Core + * %% + * Copyright (C) 2005 - 2021 Alfresco Software Limited + * %% + * This file is part of the Alfresco software. + * - + * If the software was purchased under a paid Alfresco license, the terms of + * the paid license agreement will prevail. Otherwise, the software is + * provided under the following open source license terms: + * - + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * - + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * - + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see . + * #L% + */ +package org.alfresco.transformer.executors; + +import static org.alfresco.transformer.executors.Tika.NOT_EXTRACT_BOOKMARKS_TEXT; +import static org.alfresco.transformer.executors.Tika.TARGET_ENCODING; +import static org.alfresco.transformer.executors.Tika.TARGET_MIMETYPE; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.clearInvocations; +import static org.mockito.Mockito.lenient; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; + +import java.io.File; +import java.util.HashMap; +import java.util.Map; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.junit.jupiter.MockitoExtension; + +@ExtendWith(MockitoExtension.class) +public class TikaJavaExecutorTest { + + + @Test + public void testNotExtractBookmarkTextDefault() throws Exception + { + TikaJavaExecutor executorSpyDefaultTrue = spy(new TikaJavaExecutor(true)); + TikaJavaExecutor executorSpyDefaultFalse = spy(new TikaJavaExecutor(false)); + + File mockSourceFile = mock(File.class); + File mockTargetFile = mock(File.class); + String transformName = "transformName"; + String sourceMimetype = "sourceMimetype"; + String targetMimetype = "targetMimetype"; + String defaultEncoding = "UTF-8"; + + // no need to continue execution passed here or check values as we're checking the correct params passed to this method later. + lenient().doNothing().when(executorSpyDefaultTrue).call(any(), any(), any(), any(), any(), any(), any()); + lenient().doNothing().when(executorSpyDefaultFalse).call(any(), any(), any(), any(), any(), any(), any()); + + Map transformOptions = new HashMap(); + + // use empty transformOptions to test defaults + executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions, + mockSourceFile, mockTargetFile); + executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions, + mockSourceFile, mockTargetFile); + + // when default set to true, with no options passed we should get a call method with NOT_EXTRACT_BOOKMARKS_TEXT + verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, + NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding); + + // when default set to false, with no options passed we should get a call method without NOT_EXTRACT_BOOKMARKS_TEXT + verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null, + TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding); + + // use transforms with notExtractBookmarksText set to true + clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse); + transformOptions.put("notExtractBookmarksText", "true"); + executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions, + mockSourceFile, mockTargetFile); + executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions, + mockSourceFile, mockTargetFile); + + // both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT + verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, + NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding); + + verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, + NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding); + + // use transforms with notExtractBookmarksText set to false + clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse); + transformOptions.replace("notExtractBookmarksText", "true", "false"); + executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile); + executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile); + + // both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT + verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null, + TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding); + + verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null, + TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding); + + // use full set of pdfbox transformOptions just to be safe + clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse); + transformOptions.put("targetEncoding", "anyEncoding"); + executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile); + executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile); + + // both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT but the encoding will change + verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null, + TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + "anyEncoding"); + + verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null, + TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + "anyEncoding"); + } +} diff --git a/alfresco-transformer-base/src/main/java/org/alfresco/transformer/util/RequestParamMap.java b/alfresco-transformer-base/src/main/java/org/alfresco/transformer/util/RequestParamMap.java index 2dd44310..544fb31d 100644 --- a/alfresco-transformer-base/src/main/java/org/alfresco/transformer/util/RequestParamMap.java +++ b/alfresco-transformer-base/src/main/java/org/alfresco/transformer/util/RequestParamMap.java @@ -70,6 +70,6 @@ public interface RequestParamMap String COMMAND_OPTIONS = "commandOptions"; String TIMEOUT = "timeout"; String INCLUDE_CONTENTS = "includeContents"; - String NOT_EXTRACT_BOOKMARK_TEXT = "notExtractBookmarksText"; + String NOT_EXTRACT_BOOKMARKS_TEXT = "notExtractBookmarksText"; String PAGE_LIMIT = "pageLimit"; } diff --git a/docs/external-engine-configuration.md b/docs/external-engine-configuration.md index 404fad0d..fa9050ce 100644 --- a/docs/external-engine-configuration.md +++ b/docs/external-engine-configuration.md @@ -13,6 +13,7 @@ The following externalized T-engines properties are available: | ACTIVEMQ_USER | ActiveMQ User. | admin | | ACTIVEMQ_PASSWORD | ActiveMQ Password. | admin | | FILE_STORE_URL | T-Engine Port. | http://localhost:8099/alfresco/api/-default-/private/sfs/versions/1/file | +| PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT | The default behaviour for notExtractBookmarksText when this request param is omitted from a request. | false | | TRANSFORM_ENGINE_REQUEST_QUEUE | T-Engine queue used for receiving async requests. | org.alfresco.transform.engine.tika.acs | @@ -81,6 +82,7 @@ The following externalized T-engines properties are available: | ACTIVEMQ_USER | ActiveMQ User. | admin | | ACTIVEMQ_PASSWORD | ActiveMQ Password. | admin | | FILE_STORE_URL | T-Engine Port. | http://localhost:8099/alfresco/api/-default-/private/sfs/versions/1/file | +| PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT | The default behaviour for notExtractBookmarksText when this request param is omitted from a request. | false | | TRANSFORM_ENGINE_REQUEST_QUEUE | T-Engine queue used for async requests. | org.alfresco.transform.engine.aio.acs | | PDFRENDERER_EXE | Path to Pdf-renderer EXE. | /usr/bin/alfresco-pdf-renderer | | TRANSFORM_ENGINE_REQUEST_QUEUE | T-Engine queue used for async requests. | org.alfresco.transform.engine.libreoffice.acs |