Save point: [skip ci]

* cleaning up TransformController - more to do
* wire up all transforms
This commit is contained in:
alandavis
2022-07-06 17:38:15 +01:00
parent 0eb8d9e142
commit 2e17c3ec53
29 changed files with 862 additions and 512 deletions

View File

@@ -26,9 +26,8 @@
*/
package org.alfresco.transform.tika.metadataExtractors;
import org.alfresco.transform.base.CustomTransformer;
import org.alfresco.transform.base.TransformManager;
import org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor;
import org.alfresco.transform.common.TransformException;
import org.apache.tika.embedder.Embedder;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.metadata.DublinCore;
@@ -54,7 +53,6 @@ import org.xml.sax.Locator;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
@@ -84,7 +82,7 @@ import java.util.stream.Stream;
* @author Nick Burch
* @author adavis
*/
public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtractor implements CustomTransformer
public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtractor
{
protected static final String KEY_AUTHOR = "author";
protected static final String KEY_TITLE = "title";
@@ -310,22 +308,15 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
return rawProperties;
}
public void embedMetadata(String sourceMimetype, Map<String, String> transformOptions,
String sourceEncoding, InputStream inputStream,
String targetEncoding, OutputStream outputStream) throws Exception
{
// TODO
throw new TransformException(500, "TODO embedMetadata");
}
/**
* @deprecated The content repository's TikaPoweredMetadataExtracter has no non-test implementations.
* This code exists in case there are custom implementations that need to be converted to T-Engines.
* It is simply a copy and paste from the content repository and has received limited testing.
*/
@Override
public void embedMetadata(String sourceMimetype, String targetMimetype, Map<String, String> transformOptions,
File sourceFile, File targetFile) throws Exception
public void embedMetadata(String sourceMimetype, InputStream inputStream,
String targetMimetype, OutputStream outputStream,
Map<String, String> transformOptions, TransformManager transformManager) throws Exception
{
Embedder embedder = getEmbedder();
if (embedder == null)
@@ -334,12 +325,7 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
}
Metadata metadataToEmbed = getTikaMetadata(transformOptions);
try (InputStream inputStream = new FileInputStream(sourceFile);
OutputStream outputStream = new FileOutputStream(targetFile))
{
embedder.embed(metadataToEmbed, inputStream, outputStream, null);
}
embedder.embed(metadataToEmbed, inputStream, outputStream, null);
}
private Metadata getTikaMetadata(Map<String, String> transformOptions)

View File

@@ -26,10 +26,9 @@
*/
package org.alfresco.transform.tika.transformers;
import org.alfresco.transform.base.CustomTransformer;
import org.alfresco.transform.base.logging.LogEntry;
import org.alfresco.transform.base.util.CustomTransformerFileAdaptor;
import org.alfresco.transform.common.RequestParamMap;
import org.alfresco.transform.common.TransformException;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.parser.Parser;
import org.slf4j.Logger;
@@ -38,21 +37,18 @@ import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import java.io.File;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Map;
import java.util.StringJoiner;
import static java.lang.Boolean.parseBoolean;
public abstract class GenericTikaTransformer implements CustomTransformer
public abstract class GenericTikaTransformer implements CustomTransformerFileAdaptor
{
private static final Logger logger = LoggerFactory.getLogger(GenericTikaTransformer.class);
@Value("${transform.core.tika.pdfBox.notExtractBookmarksTextDefault:false}")
boolean notExtractBookmarksTextDefault;
@Autowired
protected Tika tika;
@@ -71,15 +67,7 @@ public abstract class GenericTikaTransformer implements CustomTransformer
}
@Override
public void transform(String sourceMimetype, String sourceEncoding, InputStream inputStream,
String targetMimetype, String targetEncoding, OutputStream outputStream,
Map<String, String> transformOptions) throws Exception
{
// TODO
throw new TransformException(500, "TODO GenericTikaTransformer transform with InputStreams");
}
public void transform(String transformName, String sourceMimetype, String targetMimetype,
public void transform(String sourceMimetype, String targetMimetype,
Map<String, String> transformOptions, File sourceFile, File targetFile)
throws Exception
{
@@ -92,7 +80,8 @@ public abstract class GenericTikaTransformer implements CustomTransformer
{
logger.trace("notExtractBookmarksText default value has been overridden to {}", notExtractBookmarksTextDefault);
}
call(sourceFile, targetFile, transformName,
String transformerName = getTransformerName();
call(sourceFile, targetFile, transformerName,
includeContents ? Tika.INCLUDE_CONTENTS : null,
notExtractBookmarksText ? Tika.NOT_EXTRACT_BOOKMARKS_TEXT : null,
Tika.TARGET_MIMETYPE + targetMimetype, Tika.TARGET_ENCODING + targetEncoding);

View File

@@ -49,7 +49,7 @@ import org.mockito.junit.jupiter.MockitoExtension;
@ExtendWith(MockitoExtension.class)
public class GenericTikaTransformerTest
{
private class TikaTestTransformer extends GenericTikaTransformer
private static class TikaTestTransformer extends GenericTikaTransformer
{
@Override
protected Parser getParser()
@@ -61,7 +61,7 @@ public class GenericTikaTransformerTest
{
this.notExtractBookmarksTextDefault = notExtractBookmarksTextDefault;
}
};
}
@Test
public void testNotExtractBookmarkTextDefault() throws Exception
@@ -83,9 +83,9 @@ public class GenericTikaTransformerTest
Map<String, String> transformOptions = new HashMap<>();
// use empty transformOptions to test defaults
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions,
executorSpyDefaultTrue.transform(sourceMimetype, targetMimetype, transformOptions,
mockSourceFile, mockTargetFile);
executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions,
executorSpyDefaultFalse.transform(sourceMimetype, targetMimetype, transformOptions,
mockSourceFile, mockTargetFile);
// when default set to true, with no options passed we should get a call method with NOT_EXTRACT_BOOKMARKS_TEXT
@@ -99,9 +99,9 @@ public class GenericTikaTransformerTest
// use transforms with notExtractBookmarksText set to true
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
transformOptions.put("notExtractBookmarksText", "true");
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions,
executorSpyDefaultTrue.transform(sourceMimetype, targetMimetype, transformOptions,
mockSourceFile, mockTargetFile);
executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions,
executorSpyDefaultFalse.transform(sourceMimetype, targetMimetype, transformOptions,
mockSourceFile, mockTargetFile);
// both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT
@@ -114,8 +114,8 @@ public class GenericTikaTransformerTest
// use transforms with notExtractBookmarksText set to false
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
transformOptions.replace("notExtractBookmarksText", "true", "false");
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
executorSpyDefaultTrue.transform(sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
executorSpyDefaultFalse.transform(sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
// neither call method should have NOT_EXTRACT_BOOKMARKS_TEXT
verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,
@@ -124,11 +124,11 @@ public class GenericTikaTransformerTest
verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
// use full set of pdfbox transformOptions just to be safe
// useful set of pdfbox transformOptions just to be safe
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
transformOptions.put("targetEncoding", "anyEncoding");
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
executorSpyDefaultTrue.transform(sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
executorSpyDefaultFalse.transform(sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
// neither call method should have NOT_EXTRACT_BOOKMARKS_TEXT but the encoding will change
verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,