mirror of
https://github.com/Alfresco/alfresco-transform-core.git
synced 2025-05-12 17:04:48 +00:00
MNT-22082 transformation of pdf to text hang (#367)
A new constructor has been added to the TikaController to provide the new spring config. The creation of the TikaExecutor has been moved to "singleton pattern" as the injection of the @Value happens after the instantiation of the TikaJavaExecutor and does not pass the value correctly. The instantiation is now done once, on the first transform request. Param has been added to the AIO beans.
This commit is contained in:
parent
bc2306d2bb
commit
03d08d0c9e
@ -76,6 +76,9 @@ public class AIOCustomConfig
|
||||
@Value("${transform.core.imagemagick.config}")
|
||||
private String imageMagickConfigPath;
|
||||
|
||||
@Value("${transform.core.tika.pdfBox.notExtractBookmarksTextDefault:false}")
|
||||
private boolean notExtractBookmarksTextDefault;
|
||||
|
||||
/**
|
||||
*
|
||||
* @return Override the TransformRegistryImpl used in {@link AbstractTransformerController}
|
||||
@ -86,7 +89,7 @@ public class AIOCustomConfig
|
||||
{
|
||||
AIOTransformRegistry aioTransformRegistry = new AIOTransformRegistry();
|
||||
aioTransformRegistry.registerTransformer(new SelectingTransformer());
|
||||
aioTransformRegistry.registerTransformer(new TikaJavaExecutor());
|
||||
aioTransformRegistry.registerTransformer(new TikaJavaExecutor(notExtractBookmarksTextDefault));
|
||||
aioTransformRegistry.registerTransformer(new ImageMagickCommandExecutor(imageMagickExePath, imageMagickDynPath, imageMagickRootPath, imageMagickCodersPath, imageMagickConfigPath));
|
||||
aioTransformRegistry.registerTransformer(new LibreOfficeJavaExecutor(libreofficePath, libreofficeMaxTasksPerProcess, libreofficeTimeout, libreofficePortNumbers, libreofficeTemplateProfileDir, libreofficeIsEnabled));
|
||||
aioTransformRegistry.registerTransformer(new PdfRendererCommandExecutor(pdfRendererPath));
|
||||
|
@ -17,3 +17,6 @@ transform:
|
||||
exe: ${IMAGEMAGICK_EXE:/usr/bin/convert}
|
||||
coders: ${IMAGEMAGICK_CODERS:}
|
||||
config: ${IMAGEMAGICK_CONFIG:}
|
||||
tika:
|
||||
pdfBox:
|
||||
notExtractBookmarksTextDefault: ${PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT:false}
|
@ -30,6 +30,7 @@ import org.alfresco.transformer.executors.TikaJavaExecutor;
|
||||
import org.alfresco.transformer.probes.ProbeTestTransform;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Controller;
|
||||
|
||||
import java.io.File;
|
||||
@ -67,7 +68,10 @@ public class TikaController extends AbstractTransformerController
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(TikaController.class);
|
||||
|
||||
private TikaJavaExecutor javaExecutor = new TikaJavaExecutor();
|
||||
@Value("${transform.core.tika.pdfBox.notExtractBookmarksTextDefault:false}")
|
||||
private boolean notExtractBookmarksTextDefault;
|
||||
|
||||
private TikaJavaExecutor javaExecutor;
|
||||
|
||||
@Override
|
||||
public String getTransformerName()
|
||||
@ -102,6 +106,15 @@ public class TikaController extends AbstractTransformerController
|
||||
Map<String, String> transformOptions, File sourceFile, File targetFile)
|
||||
{
|
||||
transformOptions.put(TRANSFORM_NAME_PARAMETER, transformName);
|
||||
javaExecutor.transform(sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile);
|
||||
getJavaExecutor().transform(sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile);
|
||||
}
|
||||
|
||||
private TikaJavaExecutor getJavaExecutor()
|
||||
{
|
||||
if(javaExecutor==null)
|
||||
{
|
||||
javaExecutor = new TikaJavaExecutor(notExtractBookmarksTextDefault);
|
||||
}
|
||||
return javaExecutor;
|
||||
}
|
||||
}
|
||||
|
@ -3,4 +3,7 @@ queue:
|
||||
transform:
|
||||
core:
|
||||
config:
|
||||
location: classpath:tika_engine_config.json
|
||||
location: classpath:tika_engine_config.json
|
||||
tika:
|
||||
pdfBox:
|
||||
notExtractBookmarksTextDefault: ${PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT:false}
|
@ -61,7 +61,7 @@ import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_XHTML;
|
||||
import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_XML;
|
||||
import static org.alfresco.transformer.util.MimetypeMap.MIMETYPE_ZIP;
|
||||
import static org.alfresco.transformer.util.RequestParamMap.INCLUDE_CONTENTS;
|
||||
import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARK_TEXT;
|
||||
import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT;
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
@ -585,7 +585,7 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
|
||||
mockTransformCommand(PDF, TXT, MIMETYPE_PDF, true);
|
||||
mockMvc.perform(
|
||||
mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension).param(
|
||||
NOT_EXTRACT_BOOKMARK_TEXT, "true"))
|
||||
NOT_EXTRACT_BOOKMARKS_TEXT, "true"))
|
||||
.andExpect(status().is(OK.value()))
|
||||
.andExpect(header().string("Content-Disposition",
|
||||
"attachment; filename*= UTF-8''quick." + targetExtension));
|
||||
|
@ -104,6 +104,17 @@
|
||||
<groupId>org.apache.pdfbox</groupId>
|
||||
<artifactId>pdfbox-tools</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.mockito</groupId>
|
||||
<artifactId>mockito-junit-jupiter</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
|
@ -40,6 +40,7 @@ import org.alfresco.transformer.metadataExtractors.TikaAudioMetadataExtractor;
|
||||
import org.alfresco.transformer.metadataExtractors.TikaAutoMetadataExtractor;
|
||||
import org.alfresco.transformer.util.RequestParamMap;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import java.io.File;
|
||||
@ -53,7 +54,7 @@ import static org.alfresco.transformer.executors.Tika.INCLUDE_CONTENTS;
|
||||
import static org.alfresco.transformer.executors.Tika.NOT_EXTRACT_BOOKMARKS_TEXT;
|
||||
import static org.alfresco.transformer.executors.Tika.TARGET_ENCODING;
|
||||
import static org.alfresco.transformer.executors.Tika.TARGET_MIMETYPE;
|
||||
import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARK_TEXT;
|
||||
import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT;
|
||||
|
||||
/**
|
||||
* JavaExecutor implementation for running TIKA transformations. It loads the
|
||||
@ -61,6 +62,8 @@ import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARK
|
||||
*/
|
||||
public class TikaJavaExecutor implements JavaExecutor
|
||||
{
|
||||
private boolean notExtractBookmarksTextDefault;
|
||||
|
||||
private static final String ID = "tika";
|
||||
|
||||
public static final String LICENCE = "This transformer uses Tika from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0. or in /Apache\\ 2.0.txt";
|
||||
@ -83,8 +86,9 @@ public class TikaJavaExecutor implements JavaExecutor
|
||||
.put("SamplePoiMetadataEmbedder", new PoiMetadataExtractor())
|
||||
.build();
|
||||
|
||||
public TikaJavaExecutor()
|
||||
public TikaJavaExecutor(boolean notExtractBookmarksTextDefault)
|
||||
{
|
||||
this.notExtractBookmarksTextDefault = notExtractBookmarksTextDefault;
|
||||
try
|
||||
{
|
||||
tika = new Tika();
|
||||
@ -95,6 +99,11 @@ public class TikaJavaExecutor implements JavaExecutor
|
||||
}
|
||||
}
|
||||
|
||||
public TikaJavaExecutor()
|
||||
{
|
||||
this(false);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getTransformerId()
|
||||
{
|
||||
@ -109,12 +118,17 @@ public class TikaJavaExecutor implements JavaExecutor
|
||||
final boolean includeContents = parseBoolean(
|
||||
transformOptions.getOrDefault(RequestParamMap.INCLUDE_CONTENTS, "false"));
|
||||
final boolean notExtractBookmarksText = parseBoolean(
|
||||
transformOptions.getOrDefault(NOT_EXTRACT_BOOKMARK_TEXT, "false"));
|
||||
transformOptions.getOrDefault(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT, String.valueOf(notExtractBookmarksTextDefault)));
|
||||
final String targetEncoding = transformOptions.getOrDefault("targetEncoding", "UTF-8");
|
||||
|
||||
if(transformOptions.get(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT)==null && notExtractBookmarksTextDefault)
|
||||
{
|
||||
LoggerFactory.getLogger(TikaJavaExecutor.class).trace(
|
||||
"notExtractBookmarksText default value has been overridden to {}",
|
||||
notExtractBookmarksTextDefault);
|
||||
}
|
||||
call(sourceFile, targetFile, transformName,
|
||||
includeContents ? INCLUDE_CONTENTS : null,
|
||||
notExtractBookmarksText ? NOT_EXTRACT_BOOKMARKS_TEXT : null,
|
||||
notExtractBookmarksText ? Tika.NOT_EXTRACT_BOOKMARKS_TEXT : null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + targetEncoding);
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,126 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.executors;
|
||||
|
||||
import static org.alfresco.transformer.executors.Tika.NOT_EXTRACT_BOOKMARKS_TEXT;
|
||||
import static org.alfresco.transformer.executors.Tika.TARGET_ENCODING;
|
||||
import static org.alfresco.transformer.executors.Tika.TARGET_MIMETYPE;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.Mockito.clearInvocations;
|
||||
import static org.mockito.Mockito.lenient;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.spy;
|
||||
import static org.mockito.Mockito.times;
|
||||
import static org.mockito.Mockito.verify;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
public class TikaJavaExecutorTest {
|
||||
|
||||
|
||||
@Test
|
||||
public void testNotExtractBookmarkTextDefault() throws Exception
|
||||
{
|
||||
TikaJavaExecutor executorSpyDefaultTrue = spy(new TikaJavaExecutor(true));
|
||||
TikaJavaExecutor executorSpyDefaultFalse = spy(new TikaJavaExecutor(false));
|
||||
|
||||
File mockSourceFile = mock(File.class);
|
||||
File mockTargetFile = mock(File.class);
|
||||
String transformName = "transformName";
|
||||
String sourceMimetype = "sourceMimetype";
|
||||
String targetMimetype = "targetMimetype";
|
||||
String defaultEncoding = "UTF-8";
|
||||
|
||||
// no need to continue execution passed here or check values as we're checking the correct params passed to this method later.
|
||||
lenient().doNothing().when(executorSpyDefaultTrue).call(any(), any(), any(), any(), any(), any(), any());
|
||||
lenient().doNothing().when(executorSpyDefaultFalse).call(any(), any(), any(), any(), any(), any(), any());
|
||||
|
||||
Map<String, String> transformOptions = new HashMap<String,String>();
|
||||
|
||||
// use empty transformOptions to test defaults
|
||||
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions,
|
||||
mockSourceFile, mockTargetFile);
|
||||
executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions,
|
||||
mockSourceFile, mockTargetFile);
|
||||
|
||||
// when default set to true, with no options passed we should get a call method with NOT_EXTRACT_BOOKMARKS_TEXT
|
||||
verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null,
|
||||
NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
// when default set to false, with no options passed we should get a call method without NOT_EXTRACT_BOOKMARKS_TEXT
|
||||
verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
// use transforms with notExtractBookmarksText set to true
|
||||
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
|
||||
transformOptions.put("notExtractBookmarksText", "true");
|
||||
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions,
|
||||
mockSourceFile, mockTargetFile);
|
||||
executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions,
|
||||
mockSourceFile, mockTargetFile);
|
||||
|
||||
// both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT
|
||||
verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null,
|
||||
NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null,
|
||||
NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
// use transforms with notExtractBookmarksText set to false
|
||||
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
|
||||
transformOptions.replace("notExtractBookmarksText", "true", "false");
|
||||
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
|
||||
executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
|
||||
|
||||
// both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT
|
||||
verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
// use full set of pdfbox transformOptions just to be safe
|
||||
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
|
||||
transformOptions.put("targetEncoding", "anyEncoding");
|
||||
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
|
||||
executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
|
||||
|
||||
// both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT but the encoding will change
|
||||
verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + "anyEncoding");
|
||||
|
||||
verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + "anyEncoding");
|
||||
}
|
||||
}
|
@ -70,6 +70,6 @@ public interface RequestParamMap
|
||||
String COMMAND_OPTIONS = "commandOptions";
|
||||
String TIMEOUT = "timeout";
|
||||
String INCLUDE_CONTENTS = "includeContents";
|
||||
String NOT_EXTRACT_BOOKMARK_TEXT = "notExtractBookmarksText";
|
||||
String NOT_EXTRACT_BOOKMARKS_TEXT = "notExtractBookmarksText";
|
||||
String PAGE_LIMIT = "pageLimit";
|
||||
}
|
||||
|
@ -13,6 +13,7 @@ The following externalized T-engines properties are available:
|
||||
| ACTIVEMQ_USER | ActiveMQ User. | admin |
|
||||
| ACTIVEMQ_PASSWORD | ActiveMQ Password. | admin |
|
||||
| FILE_STORE_URL | T-Engine Port. | http://localhost:8099/alfresco/api/-default-/private/sfs/versions/1/file |
|
||||
| PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT | The default behaviour for notExtractBookmarksText when this request param is omitted from a request. | false |
|
||||
| TRANSFORM_ENGINE_REQUEST_QUEUE | T-Engine queue used for receiving async requests. | org.alfresco.transform.engine.tika.acs |
|
||||
|
||||
|
||||
@ -81,6 +82,7 @@ The following externalized T-engines properties are available:
|
||||
| ACTIVEMQ_USER | ActiveMQ User. | admin |
|
||||
| ACTIVEMQ_PASSWORD | ActiveMQ Password. | admin |
|
||||
| FILE_STORE_URL | T-Engine Port. | http://localhost:8099/alfresco/api/-default-/private/sfs/versions/1/file |
|
||||
| PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT | The default behaviour for notExtractBookmarksText when this request param is omitted from a request. | false |
|
||||
| TRANSFORM_ENGINE_REQUEST_QUEUE | T-Engine queue used for async requests. | org.alfresco.transform.engine.aio.acs |
|
||||
| PDFRENDERER_EXE | Path to Pdf-renderer EXE. | /usr/bin/alfresco-pdf-renderer |
|
||||
| TRANSFORM_ENGINE_REQUEST_QUEUE | T-Engine queue used for async requests. | org.alfresco.transform.engine.libreoffice.acs |
|
||||
|
Loading…
x
Reference in New Issue
Block a user