MNT-23047: exifTool command defined as properties and dependant of ru… (#655)

* MNT-23047: exifTool command defined as properties and dependent on the running OS
This commit is contained in:
Vítor Moreira 2022-10-11 11:06:33 +01:00 committed by GitHub
parent 514d03f81a
commit 6849854f4e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 60 additions and 8 deletions

View File

@ -20,4 +20,7 @@ transform:
config: ${IMAGEMAGICK_CONFIG:}
tika:
pdfBox:
notExtractBookmarksTextDefault: ${PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT:false}
notExtractBookmarksTextDefault: ${PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT:false}
exifTool:
windowsOS: 'exiftool -args -G1 -sep "|||" #{"$"}{INPUT}'
unixOS: 'env FOO=#{"$"}{OUTPUT} exiftool -args -G1 -sep "|||" #{"$"}{INPUT}'

View File

@ -26,6 +26,7 @@
*/
package org.alfresco.transform.tika.metadata.extractors;
import org.alfresco.transform.base.executors.RuntimeExec;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.alfresco.transform.tika.parsers.ExifToolParser;
import org.apache.commons.lang3.StringUtils;
@ -55,17 +56,18 @@ public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
private static final Pattern YEAR_IPTC = Pattern.compile("(\\d{4}[:|-]\\d{2}[:|-]\\d{2})");
private ExifToolParser parser;
private RuntimeExec exifRuntimeExec;
public IPTCMetadataExtractor()
{
public IPTCMetadataExtractor(RuntimeExec exifRuntimeExec) {
super(EXTRACTOR, logger);
this.exifRuntimeExec = exifRuntimeExec;
}
/**
 * Returns the {@link ExifToolParser} used by this extractor.
 * The parser is created on the first call and cached in the {@code parser} field;
 * later calls return the cached instance.
 *
 * @return the cached (or newly created) parser
 */
@Override
protected Parser getParser()
{
if (this.parser == null) {
// NOTE(review): the two assignments below appear to be the pre-change and post-change
// lines of this diff hunk shown together. As written, the first instance is overwritten
// immediately by the second - confirm that only the RuntimeExec-based construction remains.
this.parser = new ExifToolParser();
// Pass on the RuntimeExec received in the constructor (may be null).
this.parser = new ExifToolParser(exifRuntimeExec);
}
return this.parser;
}

View File

@ -44,6 +44,7 @@ import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.alfresco.transform.base.executors.RuntimeExec;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.NullOutputStream;
import org.apache.tika.exception.TikaException;
@ -76,13 +77,33 @@ public class ExifToolParser extends ExternalParser {
private String separator;
/**
 * Creates a parser without an externally supplied {@code RuntimeExec}.
 * Delegates to the single-argument constructor with {@code null}.
 */
public ExifToolParser()
{
    this(null);
}
public ExifToolParser(RuntimeExec exifRuntimeExec) {
super();
try {
List<ExternalParser> eParsers = ExternalParsersFactory.create(getExternalParserConfigURL());
// if ExifTool is not installed then no parsers are returned
if (eParsers.size() > 0) {
ExternalParser eParser = eParsers.get(0);
this.setCommand(eParser.getCommand());
String[] commandToBeExecuted;
if (exifRuntimeExec==null) {
logger.debug("Command to be executed determined from Tika ExternalParser");
commandToBeExecuted = eParser.getCommand();
} else {
logger.debug("Command to be executed determined from RuntimeExec");
commandToBeExecuted = exifRuntimeExec.getCommand();
}
if (commandToBeExecuted==null || commandToBeExecuted.length==0) {
commandToBeExecuted = eParser.getCommand();
}
String commandToBeExecutedAsString = String.join( " ", commandToBeExecuted);
logger.debug("Command to be executed: " + commandToBeExecutedAsString );
this.setCommand(commandToBeExecutedAsString);
this.setIgnoredLineConsumer(eParser.getIgnoredLineConsumer());
this.setMetadataExtractionPatterns(eParser.getMetadataExtractionPatterns());
this.setSupportedTypes(eParser.getSupportedTypes());
@ -153,9 +174,11 @@ public class ExifToolParser extends ExternalParser {
TemporaryResources tmp = new TemporaryResources();
try {
TikaInputStream tis = TikaInputStream.get(stream, tmp);
if (this.getSupportedTypes().contains(mediaType)) {
parse(tis, xhtml, metadata, tmp);
}
}
switch (mediaType.getType()+"/"+mediaType.getSubtype()) {
case MIMETYPE_IMAGE_JPEG:
parseAdditional(new JpegParser(), tis, handler, metadata, context, mediaType);
@ -299,6 +322,7 @@ public class ExifToolParser extends ExternalParser {
try {
IOUtils.copy(stream, stdin);
} catch (IOException e) {
logger.error( e.getMessage());
}
}
};
@ -306,6 +330,7 @@ public class ExifToolParser extends ExternalParser {
try {
t.join();
} catch (InterruptedException ignore) {
logger.error(ignore.getMessage());
}
}

View File

@ -28,6 +28,7 @@ package org.alfresco.transform.tika.transformers;
import org.alfresco.transform.base.CustomTransformer;
import org.alfresco.transform.base.TransformManager;
import org.alfresco.transform.base.executors.RuntimeExec;
import org.alfresco.transform.base.logging.LogEntry;
import org.alfresco.transform.common.RequestParamMap;
import org.apache.tika.extractor.DocumentSelector;
@ -36,10 +37,12 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.StringJoiner;
@ -52,11 +55,27 @@ public abstract class AbstractTikaTransformer implements CustomTransformer
@Value("${transform.core.tika.pdfBox.notExtractBookmarksTextDefault:false}")
boolean notExtractBookmarksTextDefault;
@Value("${transform.core.tika.exifTool.windowsOS}")
private String exifToolCommandOnWindows;
@Value("${transform.core.tika.exifTool.unixOS}")
private String exifToolCommandOnUnix;
@Autowired
protected Tika tika;
protected abstract Parser getParser();
/**
 * Builds the {@link RuntimeExec} registered as the Spring bean {@code "exifTool"}.
 * <p>
 * The two configured command lines ({@code exifToolCommandOnWindows} and
 * {@code exifToolCommandOnUnix}) are each split on single spaces into an argument
 * array and registered under an OS key.
 *
 * @return a {@code RuntimeExec} configured with the per-OS ExifTool commands
 */
@Bean("exifTool")
public RuntimeExec exifRuntimeExec()
{
    final RuntimeExec exec = new RuntimeExec();

    final String[] windowsArgs = exifToolCommandOnWindows.split(" ");
    final String[] unixArgs = exifToolCommandOnUnix.split(" ");

    final Map<String, String[]> commandsByOsKey = new HashMap<>();
    // Key is a pattern for OS names starting with "win" / "Win".
    commandsByOsKey.put("[wW]in.*", windowsArgs);
    // NOTE(review): "*" is presumably RuntimeExec's catch-all key for every other OS - confirm.
    commandsByOsKey.put("*", unixArgs);

    exec.setCommandsAndArguments(commandsByOsKey);
    return exec;
}
protected DocumentSelector getDocumentSelector()
{
return null;

View File

@ -4,4 +4,7 @@ transform:
core:
tika:
pdfBox:
notExtractBookmarksTextDefault: ${PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT:false}
notExtractBookmarksTextDefault: ${PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT:false}
exifTool:
windowsOS: 'exiftool -args -G1 -sep "|||" #{"$"}{INPUT}'
unixOS: 'env FOO=#{"$"}{OUTPUT} exiftool -args -G1 -sep "|||" #{"$"}{INPUT}'

View File

@ -32,7 +32,7 @@ import static org.junit.jupiter.api.Assertions.assertArrayEquals;
public class IPTCMetadataExtractorTest
{
IPTCMetadataExtractor extractor = new IPTCMetadataExtractor();
IPTCMetadataExtractor extractor = new IPTCMetadataExtractor(null);
@Test
public void testIptcToIso8601DateStrings() {