diff --git a/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/main/java/org/alfresco/transformer/AIOCustomConfig.java b/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/main/java/org/alfresco/transformer/AIOCustomConfig.java index 65431ac3..a15857d2 100644 --- a/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/main/java/org/alfresco/transformer/AIOCustomConfig.java +++ b/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/main/java/org/alfresco/transformer/AIOCustomConfig.java @@ -76,6 +76,9 @@ public class AIOCustomConfig @Value("${transform.core.imagemagick.config}") private String imageMagickConfigPath; + @Value("${transform.core.tika.pdfBox.notExtractBookmarksTextDefault:false}") + private boolean notExtractBookmarksTextDefault; + /** * * @return Override the TransformRegistryImpl used in {@link AbstractTransformerController} @@ -86,7 +89,7 @@ public class AIOCustomConfig { AIOTransformRegistry aioTransformRegistry = new AIOTransformRegistry(); aioTransformRegistry.registerTransformer(new SelectingTransformer()); - aioTransformRegistry.registerTransformer(new TikaJavaExecutor()); + aioTransformRegistry.registerTransformer(new TikaJavaExecutor(notExtractBookmarksTextDefault)); aioTransformRegistry.registerTransformer(new ImageMagickCommandExecutor(imageMagickExePath, imageMagickDynPath, imageMagickRootPath, imageMagickCodersPath, imageMagickConfigPath)); aioTransformRegistry.registerTransformer(new LibreOfficeJavaExecutor(libreofficePath, libreofficeMaxTasksPerProcess, libreofficeTimeout, libreofficePortNumbers, libreofficeTemplateProfileDir, libreofficeIsEnabled)); aioTransformRegistry.registerTransformer(new PdfRendererCommandExecutor(pdfRendererPath)); diff --git a/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/main/resources/application-default.yaml b/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/main/resources/application-default.yaml index fbb650a0..10f46c27 100644 --- a/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/main/resources/application-default.yaml +++ b/alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/main/resources/application-default.yaml @@ -17,3 +17,6 @@ transform: exe: ${IMAGEMAGICK_EXE:/usr/bin/convert} coders: ${IMAGEMAGICK_CODERS:} config: ${IMAGEMAGICK_CONFIG:} + tika: + pdfBox: + notExtractBookmarksTextDefault: ${PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT:false} \ No newline at end of file diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/main/java/org/alfresco/transformer/TikaController.java b/alfresco-transform-tika/alfresco-transform-tika-boot/src/main/java/org/alfresco/transformer/TikaController.java index 82323940..d6d39b36 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/main/java/org/alfresco/transformer/TikaController.java +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/main/java/org/alfresco/transformer/TikaController.java @@ -30,6 +30,7 @@ import org.alfresco.transformer.executors.TikaJavaExecutor; import org.alfresco.transformer.probes.ProbeTestTransform; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Controller; import java.io.File; @@ -67,7 +68,10 @@ public class TikaController extends AbstractTransformerController { private static final Logger logger = LoggerFactory.getLogger(TikaController.class); - private TikaJavaExecutor javaExecutor = new TikaJavaExecutor(); + @Value("${transform.core.tika.pdfBox.notExtractBookmarksTextDefault:false}") + private boolean notExtractBookmarksTextDefault; + + private TikaJavaExecutor javaExecutor; @Override public String getTransformerName() @@ -102,6 +106,15 @@ public class TikaController extends AbstractTransformerController Map transformOptions, File sourceFile, File targetFile) { transformOptions.put(TRANSFORM_NAME_PARAMETER, transformName); - javaExecutor.transform(sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile); + getJavaExecutor().transform(sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile); + } + + private TikaJavaExecutor getJavaExecutor() + { + if(javaExecutor==null) + { + javaExecutor = new TikaJavaExecutor(notExtractBookmarksTextDefault); + } + return javaExecutor; } } diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/main/resources/application-default.yaml b/alfresco-transform-tika/alfresco-transform-tika-boot/src/main/resources/application-default.yaml index 0a4b2765..559c09c3 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/main/resources/application-default.yaml +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/main/resources/application-default.yaml @@ -3,4 +3,7 @@ queue: transform: core: config: - location: classpath:tika_engine_config.json \ No newline at end of file + location: classpath:tika_engine_config.json + tika: + pdfBox: + notExtractBookmarksTextDefault: ${PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT:false} \ No newline at end of file diff --git a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaControllerTest.java b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaControllerTest.java index bba86431..164b3376 100644 --- a/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaControllerTest.java +++ b/alfresco-transform-tika/alfresco-transform-tika-boot/src/test/java/org/alfresco/transformer/TikaControllerTest.java @@ -61,7 +61,7 @@ import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_XHTML; import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_XML; import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_ZIP; import static org.alfresco.transformer.util.RequestParamMap.INCLUDE_CONTENTS; -import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARK_TEXT; +import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT; import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -585,7 +585,7 @@ public class TikaControllerTest extends AbstractTransformerControllerTest mockTransformCommand(PDF, TXT, MIMETYPE_PDF, true); mockMvc.perform( mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension).param( - NOT_EXTRACT_BOOKMARK_TEXT, "true")) + NOT_EXTRACT_BOOKMARKS_TEXT, "true")) .andExpect(status().is(OK.value())) .andExpect(header().string("Content-Disposition", "attachment; filename*= UTF-8''quick." + targetExtension)); diff --git a/alfresco-transform-tika/alfresco-transform-tika/pom.xml b/alfresco-transform-tika/alfresco-transform-tika/pom.xml index e03c13bd..8cc5b1ae 100644 --- a/alfresco-transform-tika/alfresco-transform-tika/pom.xml +++ b/alfresco-transform-tika/alfresco-transform-tika/pom.xml @@ -104,6 +104,17 @@ org.apache.pdfbox pdfbox-tools + + + org.junit.jupiter + junit-jupiter + test + + + org.mockito + mockito-junit-jupiter + test + diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/executors/TikaJavaExecutor.java b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/executors/TikaJavaExecutor.java index ad517c4c..9817cc83 100644 --- a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/executors/TikaJavaExecutor.java +++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/executors/TikaJavaExecutor.java @@ -40,6 +40,7 @@ import org.alfresco.transformer.metadataExtractors.TikaAudioMetadataExtractor; import org.alfresco.transformer.metadataExtractors.TikaAutoMetadataExtractor; import org.alfresco.transformer.util.RequestParamMap; import org.apache.tika.exception.TikaException; +import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; import java.io.File; @@ -53,7 +54,7 @@ import static org.alfresco.transformer.executors.Tika.INCLUDE_CONTENTS; import static org.alfresco.transformer.executors.Tika.NOT_EXTRACT_BOOKMARKS_TEXT; import static org.alfresco.transformer.executors.Tika.TARGET_ENCODING; import static org.alfresco.transformer.executors.Tika.TARGET_MIMETYPE; -import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARK_TEXT; +import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT; /** * JavaExecutor implementation for running TIKA transformations. It loads the @@ -61,6 +62,8 @@ import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARK */ public class TikaJavaExecutor implements JavaExecutor { + private boolean notExtractBookmarksTextDefault; + private static final String ID = "tika"; public static final String LICENCE = "This transformer uses Tika from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0. or in /Apache\\ 2.0.txt"; @@ -83,8 +86,9 @@ public class TikaJavaExecutor implements JavaExecutor .put("SamplePoiMetadataEmbedder", new PoiMetadataExtractor()) .build(); - public TikaJavaExecutor() + public TikaJavaExecutor(boolean notExtractBookmarksTextDefault) { + this.notExtractBookmarksTextDefault = notExtractBookmarksTextDefault; try { tika = new Tika(); @@ -95,6 +99,11 @@ public class TikaJavaExecutor implements JavaExecutor } } + public TikaJavaExecutor() + { + this(false); + } + @Override public String getTransformerId() { @@ -109,12 +118,17 @@ public class TikaJavaExecutor implements JavaExecutor final boolean includeContents = parseBoolean( transformOptions.getOrDefault(RequestParamMap.INCLUDE_CONTENTS, "false")); final boolean notExtractBookmarksText = parseBoolean( - transformOptions.getOrDefault(NOT_EXTRACT_BOOKMARK_TEXT, "false")); + transformOptions.getOrDefault(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT, String.valueOf(notExtractBookmarksTextDefault))); final String targetEncoding = transformOptions.getOrDefault("targetEncoding", "UTF-8"); - + if(transformOptions.get(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT)==null && notExtractBookmarksTextDefault) + { + LoggerFactory.getLogger(TikaJavaExecutor.class).trace( + "notExtractBookmarksText default value has been overridden to {}", + notExtractBookmarksTextDefault); + } call(sourceFile, targetFile, transformName, includeContents ? INCLUDE_CONTENTS : null, - notExtractBookmarksText ? NOT_EXTRACT_BOOKMARKS_TEXT : null, + notExtractBookmarksText ? Tika.NOT_EXTRACT_BOOKMARKS_TEXT : null, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + targetEncoding); } diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/executors/TikaJavaExecutorTest.java b/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/executors/TikaJavaExecutorTest.java new file mode 100644 index 00000000..509a2be7 --- /dev/null +++ b/alfresco-transform-tika/alfresco-transform-tika/src/test/java/org/alfresco/transformer/executors/TikaJavaExecutorTest.java @@ -0,0 +1,126 @@ +/* + * #%L + * Alfresco Transform Core + * %% + * Copyright (C) 2005 - 2021 Alfresco Software Limited + * %% + * This file is part of the Alfresco software. + * - + * If the software was purchased under a paid Alfresco license, the terms of + * the paid license agreement will prevail. Otherwise, the software is + * provided under the following open source license terms: + * - + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * - + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * - + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see . + * #L% + */ +package org.alfresco.transformer.executors; + +import static org.alfresco.transformer.executors.Tika.NOT_EXTRACT_BOOKMARKS_TEXT; +import static org.alfresco.transformer.executors.Tika.TARGET_ENCODING; +import static org.alfresco.transformer.executors.Tika.TARGET_MIMETYPE; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.clearInvocations; +import static org.mockito.Mockito.lenient; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; + +import java.io.File; +import java.util.HashMap; +import java.util.Map; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.junit.jupiter.MockitoExtension; + +@ExtendWith(MockitoExtension.class) +public class TikaJavaExecutorTest { + + + @Test + public void testNotExtractBookmarkTextDefault() throws Exception + { + TikaJavaExecutor executorSpyDefaultTrue = spy(new TikaJavaExecutor(true)); + TikaJavaExecutor executorSpyDefaultFalse = spy(new TikaJavaExecutor(false)); + + File mockSourceFile = mock(File.class); + File mockTargetFile = mock(File.class); + String transformName = "transformName"; + String sourceMimetype = "sourceMimetype"; + String targetMimetype = "targetMimetype"; + String defaultEncoding = "UTF-8"; + + // no need to continue execution passed here or check values as we're checking the correct params passed to this method later. + lenient().doNothing().when(executorSpyDefaultTrue).call(any(), any(), any(), any(), any(), any(), any()); + lenient().doNothing().when(executorSpyDefaultFalse).call(any(), any(), any(), any(), any(), any(), any()); + + Map transformOptions = new HashMap(); + + // use empty transformOptions to test defaults + executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions, + mockSourceFile, mockTargetFile); + executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions, + mockSourceFile, mockTargetFile); + + // when default set to true, with no options passed we should get a call method with NOT_EXTRACT_BOOKMARKS_TEXT + verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, + NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding); + + // when default set to false, with no options passed we should get a call method without NOT_EXTRACT_BOOKMARKS_TEXT + verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null, + TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding); + + // use transforms with notExtractBookmarksText set to true + clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse); + transformOptions.put("notExtractBookmarksText", "true"); + executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions, + mockSourceFile, mockTargetFile); + executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions, + mockSourceFile, mockTargetFile); + + // both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT + verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, + NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding); + + verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, + NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding); + + // use transforms with notExtractBookmarksText set to false + clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse); + transformOptions.replace("notExtractBookmarksText", "true", "false"); + executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile); + executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile); + + // both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT + verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null, + TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding); + + verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null, + TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding); + + // use full set of pdfbox transformOptions just to be safe + clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse); + transformOptions.put("targetEncoding", "anyEncoding"); + executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile); + executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile); + + // both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT but the encoding will change + verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null, + TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + "anyEncoding"); + + verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null, + TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + "anyEncoding"); + } +} diff --git a/alfresco-transformer-base/src/main/java/org/alfresco/transformer/util/RequestParamMap.java b/alfresco-transformer-base/src/main/java/org/alfresco/transformer/util/RequestParamMap.java index 2dd44310..544fb31d 100644 --- a/alfresco-transformer-base/src/main/java/org/alfresco/transformer/util/RequestParamMap.java +++ b/alfresco-transformer-base/src/main/java/org/alfresco/transformer/util/RequestParamMap.java @@ -70,6 +70,6 @@ public interface RequestParamMap String COMMAND_OPTIONS = "commandOptions"; String TIMEOUT = "timeout"; String INCLUDE_CONTENTS = "includeContents"; - String NOT_EXTRACT_BOOKMARK_TEXT = "notExtractBookmarksText"; + String NOT_EXTRACT_BOOKMARKS_TEXT = "notExtractBookmarksText"; String PAGE_LIMIT = "pageLimit"; } diff --git a/docs/external-engine-configuration.md b/docs/external-engine-configuration.md index 404fad0d..fa9050ce 100644 --- a/docs/external-engine-configuration.md +++ b/docs/external-engine-configuration.md @@ -13,6 +13,7 @@ The following externalized T-engines properties are available: | ACTIVEMQ_USER | ActiveMQ User. | admin | | ACTIVEMQ_PASSWORD | ActiveMQ Password. | admin | | FILE_STORE_URL | T-Engine Port. | http://localhost:8099/alfresco/api/-default-/private/sfs/versions/1/file | +| PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT | The default behaviour for notExtractBookmarksText when this request param is omitted from a request. | false | | TRANSFORM_ENGINE_REQUEST_QUEUE | T-Engine queue used for receiving async requests. | org.alfresco.transform.engine.tika.acs | @@ -81,6 +82,7 @@ The following externalized T-engines properties are available: | ACTIVEMQ_USER | ActiveMQ User. | admin | | ACTIVEMQ_PASSWORD | ActiveMQ Password. | admin | | FILE_STORE_URL | T-Engine Port. | http://localhost:8099/alfresco/api/-default-/private/sfs/versions/1/file | +| PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT | The default behaviour for notExtractBookmarksText when this request param is omitted from a request. | false | | TRANSFORM_ENGINE_REQUEST_QUEUE | T-Engine queue used for async requests. | org.alfresco.transform.engine.aio.acs | | PDFRENDERER_EXE | Path to Pdf-renderer EXE. | /usr/bin/alfresco-pdf-renderer | | TRANSFORM_ENGINE_REQUEST_QUEUE | T-Engine queue used for async requests. | org.alfresco.transform.engine.libreoffice.acs |