MNT-22082 transformation of pdf to text hang (#367)

A new constructor has been added to the TikaController to provide
the new spring config.
The creation of the TikaExecutor has been moved to "singleton pattern" as
the injection of the @Value happens after the instantiation of the
TikaJavaExecutor and does not pass the value correctly. The
instantiation is now done once, on the first transform request.
Param has been added to the AIO beans.
This commit is contained in:
David Edwards 2021-04-13 09:59:42 +01:00 committed by GitHub
parent bc2306d2bb
commit 03d08d0c9e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 187 additions and 12 deletions

View File

@ -76,6 +76,9 @@ public class AIOCustomConfig
@Value("${transform.core.imagemagick.config}")
private String imageMagickConfigPath;
@Value("${transform.core.tika.pdfBox.notExtractBookmarksTextDefault:false}")
private boolean notExtractBookmarksTextDefault;
/**
*
* @return Override the TransformRegistryImpl used in {@link AbstractTransformerController}
@ -86,7 +89,7 @@ public class AIOCustomConfig
{
AIOTransformRegistry aioTransformRegistry = new AIOTransformRegistry();
aioTransformRegistry.registerTransformer(new SelectingTransformer());
aioTransformRegistry.registerTransformer(new TikaJavaExecutor());
aioTransformRegistry.registerTransformer(new TikaJavaExecutor(notExtractBookmarksTextDefault));
aioTransformRegistry.registerTransformer(new ImageMagickCommandExecutor(imageMagickExePath, imageMagickDynPath, imageMagickRootPath, imageMagickCodersPath, imageMagickConfigPath));
aioTransformRegistry.registerTransformer(new LibreOfficeJavaExecutor(libreofficePath, libreofficeMaxTasksPerProcess, libreofficeTimeout, libreofficePortNumbers, libreofficeTemplateProfileDir, libreofficeIsEnabled));
aioTransformRegistry.registerTransformer(new PdfRendererCommandExecutor(pdfRendererPath));

View File

@ -17,3 +17,6 @@ transform:
exe: ${IMAGEMAGICK_EXE:/usr/bin/convert}
coders: ${IMAGEMAGICK_CODERS:}
config: ${IMAGEMAGICK_CONFIG:}
tika:
pdfBox:
notExtractBookmarksTextDefault: ${PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT:false}

View File

@ -30,6 +30,7 @@ import org.alfresco.transformer.executors.TikaJavaExecutor;
import org.alfresco.transformer.probes.ProbeTestTransform;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Controller;
import java.io.File;
@ -67,7 +68,10 @@ public class TikaController extends AbstractTransformerController
{
private static final Logger logger = LoggerFactory.getLogger(TikaController.class);
private TikaJavaExecutor javaExecutor = new TikaJavaExecutor();
@Value("${transform.core.tika.pdfBox.notExtractBookmarksTextDefault:false}")
private boolean notExtractBookmarksTextDefault;
private TikaJavaExecutor javaExecutor;
@Override
public String getTransformerName()
@ -102,6 +106,15 @@ public class TikaController extends AbstractTransformerController
Map<String, String> transformOptions, File sourceFile, File targetFile)
{
transformOptions.put(TRANSFORM_NAME_PARAMETER, transformName);
javaExecutor.transform(sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile);
getJavaExecutor().transform(sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile);
}
private TikaJavaExecutor getJavaExecutor()
{
if(javaExecutor==null)
{
javaExecutor = new TikaJavaExecutor(notExtractBookmarksTextDefault);
}
return javaExecutor;
}
}

View File

@ -3,4 +3,7 @@ queue:
transform:
core:
config:
location: classpath:tika_engine_config.json
location: classpath:tika_engine_config.json
tika:
pdfBox:
notExtractBookmarksTextDefault: ${PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT:false}

View File

@ -61,7 +61,7 @@ import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_XHTML;
import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_XML;
import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_ZIP;
import static org.alfresco.transformer.util.RequestParamMap.INCLUDE_CONTENTS;
import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARK_TEXT;
import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
@ -585,7 +585,7 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
mockTransformCommand(PDF, TXT, MIMETYPE_PDF, true);
mockMvc.perform(
mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension).param(
NOT_EXTRACT_BOOKMARK_TEXT, "true"))
NOT_EXTRACT_BOOKMARKS_TEXT, "true"))
.andExpect(status().is(OK.value()))
.andExpect(header().string("Content-Disposition",
"attachment; filename*= UTF-8''quick." + targetExtension));

View File

@ -104,6 +104,17 @@
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox-tools</artifactId>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-junit-jupiter</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<build>

View File

@ -40,6 +40,7 @@ import org.alfresco.transformer.metadataExtractors.TikaAudioMetadataExtractor;
import org.alfresco.transformer.metadataExtractors.TikaAutoMetadataExtractor;
import org.alfresco.transformer.util.RequestParamMap;
import org.apache.tika.exception.TikaException;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import java.io.File;
@ -53,7 +54,7 @@ import static org.alfresco.transformer.executors.Tika.INCLUDE_CONTENTS;
import static org.alfresco.transformer.executors.Tika.NOT_EXTRACT_BOOKMARKS_TEXT;
import static org.alfresco.transformer.executors.Tika.TARGET_ENCODING;
import static org.alfresco.transformer.executors.Tika.TARGET_MIMETYPE;
import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARK_TEXT;
import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT;
/**
* JavaExecutor implementation for running TIKA transformations. It loads the
@ -61,6 +62,8 @@ import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARK
*/
public class TikaJavaExecutor implements JavaExecutor
{
private boolean notExtractBookmarksTextDefault;
private static final String ID = "tika";
public static final String LICENCE = "This transformer uses Tika from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0. or in /Apache\\ 2.0.txt";
@ -83,8 +86,9 @@ public class TikaJavaExecutor implements JavaExecutor
.put("SamplePoiMetadataEmbedder", new PoiMetadataExtractor())
.build();
public TikaJavaExecutor()
public TikaJavaExecutor(boolean notExtractBookmarksTextDefault)
{
this.notExtractBookmarksTextDefault = notExtractBookmarksTextDefault;
try
{
tika = new Tika();
@ -95,6 +99,11 @@ public class TikaJavaExecutor implements JavaExecutor
}
}
public TikaJavaExecutor()
{
this(false);
}
@Override
public String getTransformerId()
{
@ -109,12 +118,17 @@ public class TikaJavaExecutor implements JavaExecutor
final boolean includeContents = parseBoolean(
transformOptions.getOrDefault(RequestParamMap.INCLUDE_CONTENTS, "false"));
final boolean notExtractBookmarksText = parseBoolean(
transformOptions.getOrDefault(NOT_EXTRACT_BOOKMARK_TEXT, "false"));
transformOptions.getOrDefault(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT, String.valueOf(notExtractBookmarksTextDefault)));
final String targetEncoding = transformOptions.getOrDefault("targetEncoding", "UTF-8");
if(transformOptions.get(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT)==null && notExtractBookmarksTextDefault)
{
LoggerFactory.getLogger(TikaJavaExecutor.class).trace(
"notExtractBookmarksText default value has been overridden to {}",
notExtractBookmarksTextDefault);
}
call(sourceFile, targetFile, transformName,
includeContents ? INCLUDE_CONTENTS : null,
notExtractBookmarksText ? NOT_EXTRACT_BOOKMARKS_TEXT : null,
notExtractBookmarksText ? Tika.NOT_EXTRACT_BOOKMARKS_TEXT : null,
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + targetEncoding);
}

View File

@ -0,0 +1,126 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.executors;
import static org.alfresco.transformer.executors.Tika.NOT_EXTRACT_BOOKMARKS_TEXT;
import static org.alfresco.transformer.executors.Tika.TARGET_ENCODING;
import static org.alfresco.transformer.executors.Tika.TARGET_MIMETYPE;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.clearInvocations;
import static org.mockito.Mockito.lenient;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import java.io.File;
import java.util.HashMap;
import java.util.Map;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.junit.jupiter.MockitoExtension;
@ExtendWith(MockitoExtension.class)
public class TikaJavaExecutorTest {
@Test
public void testNotExtractBookmarkTextDefault() throws Exception
{
TikaJavaExecutor executorSpyDefaultTrue = spy(new TikaJavaExecutor(true));
TikaJavaExecutor executorSpyDefaultFalse = spy(new TikaJavaExecutor(false));
File mockSourceFile = mock(File.class);
File mockTargetFile = mock(File.class);
String transformName = "transformName";
String sourceMimetype = "sourceMimetype";
String targetMimetype = "targetMimetype";
String defaultEncoding = "UTF-8";
// no need to continue execution passed here or check values as we're checking the correct params passed to this method later.
lenient().doNothing().when(executorSpyDefaultTrue).call(any(), any(), any(), any(), any(), any(), any());
lenient().doNothing().when(executorSpyDefaultFalse).call(any(), any(), any(), any(), any(), any(), any());
Map<String, String> transformOptions = new HashMap<String,String>();
// use empty transformOptions to test defaults
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions,
mockSourceFile, mockTargetFile);
executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions,
mockSourceFile, mockTargetFile);
// when default set to true, with no options passed we should get a call method with NOT_EXTRACT_BOOKMARKS_TEXT
verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null,
NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
// when default set to false, with no options passed we should get a call method without NOT_EXTRACT_BOOKMARKS_TEXT
verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
// use transforms with notExtractBookmarksText set to true
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
transformOptions.put("notExtractBookmarksText", "true");
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions,
mockSourceFile, mockTargetFile);
executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions,
mockSourceFile, mockTargetFile);
// both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT
verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null,
NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null,
NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
// use transforms with notExtractBookmarksText set to false
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
transformOptions.replace("notExtractBookmarksText", "true", "false");
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
// both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT
verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
// use full set of pdfbox transformOptions just to be safe
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
transformOptions.put("targetEncoding", "anyEncoding");
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
// both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT but the encoding will change
verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + "anyEncoding");
verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + "anyEncoding");
}
}

View File

@ -70,6 +70,6 @@ public interface RequestParamMap
String COMMAND_OPTIONS = "commandOptions";
String TIMEOUT = "timeout";
String INCLUDE_CONTENTS = "includeContents";
String NOT_EXTRACT_BOOKMARK_TEXT = "notExtractBookmarksText";
String NOT_EXTRACT_BOOKMARKS_TEXT = "notExtractBookmarksText";
String PAGE_LIMIT = "pageLimit";
}

View File

@ -13,6 +13,7 @@ The following externalized T-engines properties are available:
| ACTIVEMQ_USER | ActiveMQ User. | admin |
| ACTIVEMQ_PASSWORD | ActiveMQ Password. | admin |
| FILE_STORE_URL | T-Engine Port. | http://localhost:8099/alfresco/api/-default-/private/sfs/versions/1/file |
| PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT | The default behaviour for notExtractBookmarksText when this request param is omitted from a request. | false |
| TRANSFORM_ENGINE_REQUEST_QUEUE | T-Engine queue used for receiving async requests. | org.alfresco.transform.engine.tika.acs |
@ -81,6 +82,7 @@ The following externalized T-engines properties are available:
| ACTIVEMQ_USER | ActiveMQ User. | admin |
| ACTIVEMQ_PASSWORD | ActiveMQ Password. | admin |
| FILE_STORE_URL | T-Engine Port. | http://localhost:8099/alfresco/api/-default-/private/sfs/versions/1/file |
| PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT | The default behaviour for notExtractBookmarksText when this request param is omitted from a request. | false |
| TRANSFORM_ENGINE_REQUEST_QUEUE | T-Engine queue used for async requests. | org.alfresco.transform.engine.aio.acs |
| PDFRENDERER_EXE | Path to Pdf-renderer EXE. | /usr/bin/alfresco-pdf-renderer |
| TRANSFORM_ENGINE_REQUEST_QUEUE | T-Engine queue used for async requests. | org.alfresco.transform.engine.libreoffice.acs |