mirror of
https://github.com/Alfresco/alfresco-transform-core.git
synced 2025-10-01 14:41:17 +00:00
ACS-9835-Improve code quality in alfresco-transform-core (#1116)
This commit is contained in:
@@ -26,19 +26,20 @@
|
||||
*/
|
||||
package org.alfresco.transform.tika;
|
||||
|
||||
import org.alfresco.transform.base.TransformEngine;
|
||||
import org.alfresco.transform.base.probes.ProbeTransform;
|
||||
import org.alfresco.transform.config.reader.TransformConfigResourceReader;
|
||||
import org.alfresco.transform.config.TransformConfig;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.Collections;
|
||||
|
||||
import static org.alfresco.transform.base.logging.StandardMessages.COMMUNITY_LICENCE;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_PDF;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
|
||||
import java.util.Collections;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import org.alfresco.transform.base.TransformEngine;
|
||||
import org.alfresco.transform.base.probes.ProbeTransform;
|
||||
import org.alfresco.transform.config.TransformConfig;
|
||||
import org.alfresco.transform.config.reader.TransformConfigResourceReader;
|
||||
|
||||
@Component
|
||||
public class TikaTransformEngine implements TransformEngine
|
||||
{
|
||||
|
@@ -26,8 +26,20 @@
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadata;
|
||||
|
||||
import org.alfresco.transform.base.TransformManager;
|
||||
import org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.tika.embedder.Embedder;
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.metadata.DublinCore;
|
||||
@@ -51,24 +63,12 @@ import org.xml.sax.Attributes;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.Locator;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import org.alfresco.transform.base.TransformManager;
|
||||
import org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder;
|
||||
|
||||
/**
|
||||
* The parent of all Metadata Extractors which use Apache Tika under the hood. This handles all the
|
||||
* common parts of processing the files, and the common mappings.
|
||||
|
||||
* The parent of all Metadata Extractors which use Apache Tika under the hood. This handles all the common parts of processing the files, and the common mappings.
|
||||
*
|
||||
* <pre>
|
||||
* <b>author:</b> -- cm:author
|
||||
* <b>title:</b> -- cm:title
|
||||
@@ -117,8 +117,7 @@ public abstract class AbstractTikaMetadataExtractorEmbeddor extends AbstractMeta
|
||||
}
|
||||
|
||||
/**
|
||||
* Version which also tries the ISO-8601 formats (in order..),
|
||||
* and similar formats, which Tika makes use of
|
||||
* Version which also tries the ISO-8601 formats (in order..), and similar formats, which Tika makes use of
|
||||
*/
|
||||
protected Serializable makeDate(String dateStr)
|
||||
{
|
||||
@@ -127,25 +126,29 @@ public abstract class AbstractTikaMetadataExtractorEmbeddor extends AbstractMeta
|
||||
{
|
||||
return this.tikaUTCDateFormater.parseDateTime(dateStr).toDate();
|
||||
}
|
||||
catch (IllegalArgumentException ignore) {}
|
||||
catch (IllegalArgumentException ignore)
|
||||
{}
|
||||
|
||||
try
|
||||
{
|
||||
return this.tikaUTCDateFormater.withLocale(Locale.US).parseDateTime(dateStr).toDate();
|
||||
}
|
||||
catch (IllegalArgumentException ignore) {}
|
||||
catch (IllegalArgumentException ignore)
|
||||
{}
|
||||
|
||||
try
|
||||
{
|
||||
return this.tikaDateFormater.parseDateTime(dateStr).toDate();
|
||||
}
|
||||
catch (IllegalArgumentException ignore) {}
|
||||
catch (IllegalArgumentException ignore)
|
||||
{}
|
||||
|
||||
try
|
||||
{
|
||||
return this.tikaDateFormater.withLocale(Locale.US).parseDateTime(dateStr).toDate();
|
||||
}
|
||||
catch (IllegalArgumentException ignore) {}
|
||||
catch (IllegalArgumentException ignore)
|
||||
{}
|
||||
|
||||
// Fall back to the normal ones: We just return the String as AbstractMappingMetadataExtracter
|
||||
// convertSystemPropertyValues in the repo will do the conversion that was previously done here.
|
||||
@@ -155,8 +158,7 @@ public abstract class AbstractTikaMetadataExtractorEmbeddor extends AbstractMeta
|
||||
protected abstract Parser getParser();
|
||||
|
||||
/**
|
||||
* Returns the Tika Embedder to modify
|
||||
* the document.
|
||||
* Returns the Tika Embedder to modify the document.
|
||||
*
|
||||
* @return the Tika embedder
|
||||
*/
|
||||
@@ -166,8 +168,7 @@ public abstract class AbstractTikaMetadataExtractorEmbeddor extends AbstractMeta
|
||||
}
|
||||
|
||||
/**
|
||||
* Do we care about the contents of the
|
||||
* extracted header, or nothing at all?
|
||||
* Do we care about the contents of the extracted header, or nothing at all?
|
||||
*/
|
||||
protected boolean needHeaderContents()
|
||||
{
|
||||
@@ -178,14 +179,13 @@ public abstract class AbstractTikaMetadataExtractorEmbeddor extends AbstractMeta
|
||||
* Allows implementation specific mappings to be done.
|
||||
*/
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties, Map<String,String> headers)
|
||||
Map<String, Serializable> properties, Map<String, String> headers)
|
||||
{
|
||||
return properties;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the document selector, used for determining whether to parse embedded resources,
|
||||
* null by default so parse all.
|
||||
* Gets the document selector, used for determining whether to parse embedded resources, null by default so parse all.
|
||||
*/
|
||||
protected DocumentSelector getDocumentSelector(Metadata metadata, String targetMimeType)
|
||||
{
|
||||
@@ -221,11 +221,10 @@ public abstract class AbstractTikaMetadataExtractorEmbeddor extends AbstractMeta
|
||||
ParseContext context = buildParseContext(metadata, sourceMimetype);
|
||||
|
||||
ContentHandler handler;
|
||||
Map<String,String> headers = null;
|
||||
Map<String, String> headers = null;
|
||||
if (needHeaderContents())
|
||||
{
|
||||
MapCaptureContentHandler headerCapture =
|
||||
new MapCaptureContentHandler();
|
||||
MapCaptureContentHandler headerCapture = new MapCaptureContentHandler();
|
||||
headers = headerCapture.tags;
|
||||
handler = new HeadContentHandler(headerCapture);
|
||||
}
|
||||
@@ -238,7 +237,7 @@ public abstract class AbstractTikaMetadataExtractorEmbeddor extends AbstractMeta
|
||||
|
||||
// First up, copy all the Tika metadata over
|
||||
// This allows people to map any of the Tika
|
||||
// keys onto their own content model
|
||||
// keys onto their own content model
|
||||
for (String tikaKey : metadata.names())
|
||||
{
|
||||
// TODO review this change (part of MNT-15267) - should we really force string concatenation here !?
|
||||
@@ -246,9 +245,9 @@ public abstract class AbstractTikaMetadataExtractorEmbeddor extends AbstractMeta
|
||||
}
|
||||
|
||||
// Now, map the common Tika metadata keys onto
|
||||
// the common Alfresco metadata keys. This allows
|
||||
// existing mapping properties files to continue
|
||||
// to work without needing any changes
|
||||
// the common Alfresco metadata keys. This allows
|
||||
// existing mapping properties files to continue
|
||||
// to work without needing any changes
|
||||
|
||||
// The simple ones
|
||||
putRawValue(KEY_AUTHOR, getMetadataValue(metadata, TikaCoreProperties.CREATOR), rawProperties);
|
||||
@@ -259,7 +258,7 @@ public abstract class AbstractTikaMetadataExtractorEmbeddor extends AbstractMeta
|
||||
putRawValue(KEY_TAGS, getMetadataValues(metadata, KEY_TAGS), rawProperties);
|
||||
|
||||
// Get the subject and description, despite things not
|
||||
// being nearly as consistent as one might hope
|
||||
// being nearly as consistent as one might hope
|
||||
String subject = getMetadataValue(metadata, TikaCoreProperties.SUBJECT);
|
||||
String description = getMetadataValue(metadata, TikaCoreProperties.DESCRIPTION);
|
||||
if (subject != null && description != null)
|
||||
@@ -289,19 +288,17 @@ public abstract class AbstractTikaMetadataExtractorEmbeddor extends AbstractMeta
|
||||
}
|
||||
|
||||
// If people created a specific instance
|
||||
// (eg OfficeMetadataExtractor), then allow that
|
||||
// instance to map the Tika keys onto its
|
||||
// existing namespace so that older properties
|
||||
// files continue to map correctly
|
||||
// (eg OfficeMetadataExtractor), then allow that
|
||||
// instance to map the Tika keys onto its
|
||||
// existing namespace so that older properties
|
||||
// files continue to map correctly
|
||||
rawProperties = extractSpecific(metadata, rawProperties, headers);
|
||||
|
||||
return rawProperties;
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated The content repository's TikaPoweredMetadataExtracter provides no non test implementations.
|
||||
* This code exists in case there are custom implementations, that need to be converted to T-Engines.
|
||||
* It is simply a copy and paste from the content repository and has received limited testing.
|
||||
* @deprecated The content repository's TikaPoweredMetadataExtracter provides no non test implementations. This code exists in case there are custom implementations, that need to be converted to T-Engines. It is simply a copy and paste from the content repository and has received limited testing.
|
||||
*/
|
||||
@Override
|
||||
public void embedMetadata(String sourceMimetype, InputStream inputStream,
|
||||
@@ -335,7 +332,7 @@ public abstract class AbstractTikaMetadataExtractorEmbeddor extends AbstractMeta
|
||||
{
|
||||
try
|
||||
{
|
||||
metadataToEmbed.add(metadataKey, (String)singleValue);
|
||||
metadataToEmbed.add(metadataKey, (String) singleValue);
|
||||
}
|
||||
catch (ClassCastException e)
|
||||
{
|
||||
@@ -347,7 +344,7 @@ public abstract class AbstractTikaMetadataExtractorEmbeddor extends AbstractMeta
|
||||
{
|
||||
try
|
||||
{
|
||||
metadataToEmbed.add(metadataKey, (String)value);
|
||||
metadataToEmbed.add(metadataKey, (String) value);
|
||||
}
|
||||
catch (ClassCastException e)
|
||||
{
|
||||
@@ -395,47 +392,41 @@ public abstract class AbstractTikaMetadataExtractorEmbeddor extends AbstractMeta
|
||||
protected static Stream<String> distinct(final String[] strings)
|
||||
{
|
||||
return Stream.of(strings)
|
||||
.filter(Objects::nonNull)
|
||||
.map(String::strip)
|
||||
.filter(s -> !s.isEmpty())
|
||||
.distinct();
|
||||
.filter(Objects::nonNull)
|
||||
.map(String::strip)
|
||||
.filter(s -> !s.isEmpty())
|
||||
.distinct();
|
||||
}
|
||||
|
||||
/**
|
||||
* This content handler will capture entries from within
|
||||
* the header of the Tika content XHTML, but ignore the
|
||||
* rest.
|
||||
* This content handler will capture entries from within the header of the Tika content XHTML, but ignore the rest.
|
||||
*/
|
||||
protected static class HeadContentHandler extends ContentHandlerDecorator
|
||||
{
|
||||
/**
|
||||
* XHTML XPath parser.
|
||||
*/
|
||||
private static final XPathParser PARSER =
|
||||
new XPathParser("xhtml", XHTMLContentHandler.XHTML);
|
||||
private static final XPathParser PARSER = new XPathParser("xhtml", XHTMLContentHandler.XHTML);
|
||||
|
||||
/**
|
||||
* The XPath matcher used to select the XHTML body contents.
|
||||
*/
|
||||
private static final Matcher MATCHER =
|
||||
PARSER.parse("/xhtml:html/xhtml:head/descendant:node()");
|
||||
private static final Matcher MATCHER = PARSER.parse("/xhtml:html/xhtml:head/descendant:node()");
|
||||
|
||||
/**
|
||||
* Creates a content handler that passes all XHTML body events to the
|
||||
* given underlying content handler.
|
||||
* Creates a content handler that passes all XHTML body events to the given underlying content handler.
|
||||
*
|
||||
* @param handler content handler
|
||||
* @param handler
|
||||
* content handler
|
||||
*/
|
||||
protected HeadContentHandler(ContentHandler handler)
|
||||
{
|
||||
super(new MatchingContentHandler(handler, MATCHER));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This content handler will grab all tags and attributes,
|
||||
* and record the textual content of the last seen one
|
||||
* of them.
|
||||
* Normally only used with {@link HeadContentHandler}
|
||||
* This content handler will grab all tags and attributes, and record the textual content of the last seen one of them. Normally only used with {@link HeadContentHandler}
|
||||
*/
|
||||
protected static class MapCaptureContentHandler implements ContentHandler
|
||||
{
|
||||
@@ -461,41 +452,75 @@ public abstract class AbstractTikaMetadataExtractorEmbeddor extends AbstractMeta
|
||||
|
||||
public void startElement(String namespace, String localname, String qname, Attributes attrs)
|
||||
{
|
||||
for(int i=0; i<attrs.getLength(); i++)
|
||||
for (int i = 0; i < attrs.getLength(); i++)
|
||||
{
|
||||
tags.put(attrs.getQName(i), attrs.getValue(i));
|
||||
}
|
||||
text = new StringBuffer();
|
||||
}
|
||||
|
||||
public void endDocument() {}
|
||||
public void endPrefixMapping(String paramString) {}
|
||||
public void ignorableWhitespace(char[] paramArrayOfChar, int paramInt1, int paramInt2) {}
|
||||
public void processingInstruction(String paramString1, String paramString2) {}
|
||||
public void setDocumentLocator(Locator paramLocator) {}
|
||||
public void skippedEntity(String paramString) {}
|
||||
public void startDocument() {}
|
||||
public void startPrefixMapping(String paramString1, String paramString2) {}
|
||||
public void endDocument()
|
||||
{}
|
||||
|
||||
public void endPrefixMapping(String paramString)
|
||||
{}
|
||||
|
||||
public void ignorableWhitespace(char[] paramArrayOfChar, int paramInt1, int paramInt2)
|
||||
{}
|
||||
|
||||
public void processingInstruction(String paramString1, String paramString2)
|
||||
{}
|
||||
|
||||
public void setDocumentLocator(Locator paramLocator)
|
||||
{}
|
||||
|
||||
public void skippedEntity(String paramString)
|
||||
{}
|
||||
|
||||
public void startDocument()
|
||||
{}
|
||||
|
||||
public void startPrefixMapping(String paramString1, String paramString2)
|
||||
{}
|
||||
}
|
||||
|
||||
/**
|
||||
* A content handler that ignores all the content it finds.
|
||||
* Normally used when we only want the metadata, and don't
|
||||
* care about the file contents.
|
||||
* A content handler that ignores all the content it finds. Normally used when we only want the metadata, and don't care about the file contents.
|
||||
*/
|
||||
protected static class NullContentHandler implements ContentHandler
|
||||
{
|
||||
public void characters(char[] paramArrayOfChar, int paramInt1, int paramInt2) {}
|
||||
public void endDocument() {}
|
||||
public void endElement(String paramString1, String paramString2, String paramString3) {}
|
||||
public void endPrefixMapping(String paramString) {}
|
||||
public void ignorableWhitespace(char[] paramArrayOfChar, int paramInt1, int paramInt2) {}
|
||||
public void processingInstruction(String paramString1, String paramString2) {}
|
||||
public void setDocumentLocator(Locator paramLocator) {}
|
||||
public void skippedEntity(String paramString) {}
|
||||
public void startDocument() {}
|
||||
public void characters(char[] paramArrayOfChar, int paramInt1, int paramInt2)
|
||||
{}
|
||||
|
||||
public void endDocument()
|
||||
{}
|
||||
|
||||
public void endElement(String paramString1, String paramString2, String paramString3)
|
||||
{}
|
||||
|
||||
public void endPrefixMapping(String paramString)
|
||||
{}
|
||||
|
||||
public void ignorableWhitespace(char[] paramArrayOfChar, int paramInt1, int paramInt2)
|
||||
{}
|
||||
|
||||
public void processingInstruction(String paramString1, String paramString2)
|
||||
{}
|
||||
|
||||
public void setDocumentLocator(Locator paramLocator)
|
||||
{}
|
||||
|
||||
public void skippedEntity(String paramString)
|
||||
{}
|
||||
|
||||
public void startDocument()
|
||||
{}
|
||||
|
||||
public void startElement(String paramString1, String paramString2,
|
||||
String paramString3, Attributes paramAttributes) {}
|
||||
public void startPrefixMapping(String paramString1, String paramString2) {}
|
||||
String paramString3, Attributes paramAttributes)
|
||||
{}
|
||||
|
||||
public void startPrefixMapping(String paramString1, String paramString2)
|
||||
{}
|
||||
}
|
||||
}
|
||||
|
@@ -26,7 +26,11 @@
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
import org.apache.tika.parser.Parser;
|
||||
@@ -35,15 +39,12 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
|
||||
/**
|
||||
* {@code "application/dwg"} and {@code "image/vnd.dwg"} metadata extractor.
|
||||
*
|
||||
* Configuration: (see DWGMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
* Configuration: (see DWGMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>title:</b> -- cm:title
|
||||
@@ -72,7 +73,7 @@ public class DWGMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
|
||||
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties, Map<String,String> headers)
|
||||
Map<String, Serializable> properties, Map<String, String> headers)
|
||||
{
|
||||
putRawValue(KEY_KEYWORD, metadata.get(TikaCoreProperties.SUBJECT), properties);
|
||||
putRawValue(KEY_LAST_AUTHOR, metadata.get(TikaCoreProperties.MODIFIED), properties);
|
||||
|
@@ -1,165 +1,174 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.alfresco.transform.base.executors.RuntimeExec;
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import org.alfresco.transform.tika.parsers.ExifToolParser;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
|
||||
@Component
|
||||
public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
|
||||
{
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(IPTCMetadataExtractor.class);
|
||||
|
||||
private static Set<String> IPTC_DATE_KEYS = Set.of("XMP-photoshop:DateCreated", "XMP-iptcExt:ArtworkDateCreated");
|
||||
|
||||
private static final Pattern YEAR_IPTC = Pattern.compile("(\\d{4}[:|-]\\d{2}[:|-]\\d{2})");
|
||||
|
||||
private ExifToolParser parser;
|
||||
private RuntimeExec exifRuntimeExec;
|
||||
|
||||
public IPTCMetadataExtractor(RuntimeExec exifRuntimeExec) {
|
||||
super(EXTRACTOR, logger);
|
||||
this.exifRuntimeExec = exifRuntimeExec;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
if (this.parser == null) {
|
||||
this.parser = new ExifToolParser(exifRuntimeExec);
|
||||
}
|
||||
return this.parser;
|
||||
}
|
||||
|
||||
/**
|
||||
* Because some of the mimetypes that IPTCMetadataExtractor now parse, were previously handled
|
||||
* by TikaAutoMetadataExtractor we call the TikaAutoMetadataExtractor.extractSpecific method to
|
||||
* ensure that the returned properties contains the expected entries.
|
||||
*/
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata, Map<String, Serializable> properties,
|
||||
Map<String, String> headers)
|
||||
{
|
||||
properties = new TikaAutoMetadataExtractor().extractSpecific(metadata, properties, headers);
|
||||
ExifToolParser etParser = (ExifToolParser)this.getParser();
|
||||
if (etParser.getSeparator()!=null)
|
||||
{
|
||||
for (String key : properties.keySet())
|
||||
{
|
||||
if (properties.get(key) instanceof String)
|
||||
{
|
||||
String value = (String) properties.get(key);
|
||||
String separator = etParser.getSeparator();
|
||||
if (value.contains(separator))
|
||||
{
|
||||
if (value.contains(String.format("\"%s\"",separator)))
|
||||
{
|
||||
separator = String.format("\"%s\"",separator);
|
||||
}
|
||||
String [] values = StringUtils.splitByWholeSeparator(value, separator);
|
||||
// Change dateTime format. MM converted ':' to '-'
|
||||
if (IPTC_DATE_KEYS.contains(key)){
|
||||
values = iptcToIso8601DateStrings(values);
|
||||
}
|
||||
putRawValue(key, (Serializable) Arrays.asList(values), properties);
|
||||
}
|
||||
else if (IPTC_DATE_KEYS.contains(key)) {
|
||||
// Handle property with a single date string
|
||||
putRawValue(key, (Serializable) iptcToIso8601DateString(value), properties);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return properties;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a date or date time strings into Iso8601 format <p>
|
||||
*
|
||||
* @param dateStrings
|
||||
* @return dateStrings in Iso8601 format
|
||||
* @see #iptcToIso8601DateString
|
||||
*/
|
||||
public String[] iptcToIso8601DateStrings(String[] dateStrings)
|
||||
{
|
||||
for (int i = 0; i < dateStrings.length; i++)
|
||||
{
|
||||
dateStrings[i] = iptcToIso8601DateString(dateStrings[i]);
|
||||
}
|
||||
return dateStrings;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a date or date time string into Iso8601 format <p>
|
||||
* Converts any ':' in the year portion of a date string characters to '-'. <p>
|
||||
* Expects the year in the format YYYY:MM:DD or YYYY-MM-DD <p>
|
||||
* Will add the correct delimiter, 'T', to any dateTime strings, where | can be any char other than ,'T':
|
||||
* YYYY:MM:DD|HH:mm:ss.... or YYYY-MM-DD|HH:mm:ss....
|
||||
* <p>
|
||||
* Examples: <p><ul>
|
||||
* <li>"1919:10:16" will convert to "1919-10-16"</li>
|
||||
* <li>"1901:02:01 00:00:00.000Z" will convert to "1901-02-01T00:00:00.000Z"</li>
|
||||
* <li>"2001:02:01 16:15+00:00" will convert to "2001-02-01T16:15+00:00"</li>
|
||||
* <li>"2021-06-11 05:36-01:00" will convert to "2021-06-11T05:36-01:00"</li>
|
||||
* </ul>
|
||||
* @param dateStr
|
||||
* @return dateStr in Iso8601 format
|
||||
*/
|
||||
protected String iptcToIso8601DateString(String dateStr)
|
||||
{
|
||||
char timeSeparator = 'T';
|
||||
Matcher yearMatcher = YEAR_IPTC.matcher(dateStr);
|
||||
if (yearMatcher.find())
|
||||
{
|
||||
String year = yearMatcher.group(1);
|
||||
dateStr = yearMatcher.replaceFirst(year.replaceAll(":", "-"));
|
||||
if (dateStr.length()>year.length() && dateStr.charAt(year.length())!=timeSeparator)
|
||||
{
|
||||
dateStr = dateStr.replace(dateStr.charAt(year.length()), timeSeparator);
|
||||
}
|
||||
}
|
||||
return dateStr;
|
||||
}
|
||||
|
||||
}
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import org.alfresco.transform.base.executors.RuntimeExec;
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import org.alfresco.transform.tika.parsers.ExifToolParser;
|
||||
|
||||
@Component
|
||||
public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
|
||||
{
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(IPTCMetadataExtractor.class);
|
||||
|
||||
private static Set<String> IPTC_DATE_KEYS = Set.of("XMP-photoshop:DateCreated", "XMP-iptcExt:ArtworkDateCreated");
|
||||
|
||||
private static final Pattern YEAR_IPTC = Pattern.compile("(\\d{4}[:|-]\\d{2}[:|-]\\d{2})");
|
||||
|
||||
private ExifToolParser parser;
|
||||
private RuntimeExec exifRuntimeExec;
|
||||
|
||||
public IPTCMetadataExtractor(RuntimeExec exifRuntimeExec)
|
||||
{
|
||||
super(EXTRACTOR, logger);
|
||||
this.exifRuntimeExec = exifRuntimeExec;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
if (this.parser == null)
|
||||
{
|
||||
this.parser = new ExifToolParser(exifRuntimeExec);
|
||||
}
|
||||
return this.parser;
|
||||
}
|
||||
|
||||
/**
|
||||
* Because some of the mimetypes that IPTCMetadataExtractor now parse, were previously handled by TikaAutoMetadataExtractor we call the TikaAutoMetadataExtractor.extractSpecific method to ensure that the returned properties contains the expected entries.
|
||||
*/
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata, Map<String, Serializable> properties,
|
||||
Map<String, String> headers)
|
||||
{
|
||||
properties = new TikaAutoMetadataExtractor().extractSpecific(metadata, properties, headers);
|
||||
ExifToolParser etParser = (ExifToolParser) this.getParser();
|
||||
if (etParser.getSeparator() != null)
|
||||
{
|
||||
for (String key : properties.keySet())
|
||||
{
|
||||
if (properties.get(key) instanceof String)
|
||||
{
|
||||
String value = (String) properties.get(key);
|
||||
String separator = etParser.getSeparator();
|
||||
if (value.contains(separator))
|
||||
{
|
||||
if (value.contains(String.format("\"%s\"", separator)))
|
||||
{
|
||||
separator = String.format("\"%s\"", separator);
|
||||
}
|
||||
String[] values = StringUtils.splitByWholeSeparator(value, separator);
|
||||
// Change dateTime format. MM converted ':' to '-'
|
||||
if (IPTC_DATE_KEYS.contains(key))
|
||||
{
|
||||
values = iptcToIso8601DateStrings(values);
|
||||
}
|
||||
putRawValue(key, (Serializable) Arrays.asList(values), properties);
|
||||
}
|
||||
else if (IPTC_DATE_KEYS.contains(key))
|
||||
{
|
||||
// Handle property with a single date string
|
||||
putRawValue(key, (Serializable) iptcToIso8601DateString(value), properties);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return properties;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a date or date time strings into Iso8601 format
|
||||
* <p>
|
||||
*
|
||||
* @param dateStrings
|
||||
* @return dateStrings in Iso8601 format
|
||||
* @see #iptcToIso8601DateString
|
||||
*/
|
||||
public String[] iptcToIso8601DateStrings(String[] dateStrings)
|
||||
{
|
||||
for (int i = 0; i < dateStrings.length; i++)
|
||||
{
|
||||
dateStrings[i] = iptcToIso8601DateString(dateStrings[i]);
|
||||
}
|
||||
return dateStrings;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a date or date time string into Iso8601 format
|
||||
* <p>
|
||||
* Converts any ':' in the year portion of a date string characters to '-'.
|
||||
* <p>
|
||||
* Expects the year in the format YYYY:MM:DD or YYYY-MM-DD
|
||||
* <p>
|
||||
* Will add the correct delimiter, 'T', to any dateTime strings, where | can be any char other than ,'T': YYYY:MM:DD|HH:mm:ss.... or YYYY-MM-DD|HH:mm:ss....
|
||||
* <p>
|
||||
* Examples:
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>"1919:10:16" will convert to "1919-10-16"</li>
|
||||
* <li>"1901:02:01 00:00:00.000Z" will convert to "1901-02-01T00:00:00.000Z"</li>
|
||||
* <li>"2001:02:01 16:15+00:00" will convert to "2001-02-01T16:15+00:00"</li>
|
||||
* <li>"2021-06-11 05:36-01:00" will convert to "2021-06-11T05:36-01:00"</li>
|
||||
* </ul>
|
||||
*
|
||||
* @param dateStr
|
||||
* @return dateStr in Iso8601 format
|
||||
*/
|
||||
protected String iptcToIso8601DateString(String dateStr)
|
||||
{
|
||||
char timeSeparator = 'T';
|
||||
Matcher yearMatcher = YEAR_IPTC.matcher(dateStr);
|
||||
if (yearMatcher.find())
|
||||
{
|
||||
String year = yearMatcher.group(1);
|
||||
dateStr = yearMatcher.replaceFirst(year.replaceAll(":", "-"));
|
||||
if (dateStr.length() > year.length() && dateStr.charAt(year.length()) != timeSeparator)
|
||||
{
|
||||
dateStr = dateStr.replace(dateStr.charAt(year.length()), timeSeparator);
|
||||
}
|
||||
}
|
||||
return dateStr;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -26,6 +26,9 @@
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
import org.apache.tika.metadata.XMPDM;
|
||||
@@ -35,13 +38,10 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* MP3 file metadata extractor.
|
||||
*
|
||||
* Configuration: (see MP3MetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
* Configuration: (see MP3MetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>songTitle:</b> -- cm:title
|
||||
@@ -56,8 +56,7 @@ import java.util.Map;
|
||||
* <b>lyrics:</b> --
|
||||
* </pre>
|
||||
*
|
||||
* Note - XMPDM metadata keys are also emitted, in common with
|
||||
* the other Tika powered extracters
|
||||
* Note - XMPDM metadata keys are also emitted, in common with the other Tika powered extractors
|
||||
*
|
||||
* Uses Apache Tika
|
||||
*
|
||||
@@ -91,14 +90,14 @@ public class MP3MetadataExtractor extends TikaAudioMetadataExtractor
|
||||
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties, Map<String,String> headers)
|
||||
Map<String, Serializable> properties, Map<String, String> headers)
|
||||
{
|
||||
// Do the normal Audio mappings
|
||||
super.extractSpecific(metadata, properties, headers);
|
||||
|
||||
// Now do the compatibility ones
|
||||
// We only need these for people who had pre-existing mapping
|
||||
// properties from before the proper audio model was added
|
||||
// properties from before the proper audio model was added
|
||||
putRawValue(KEY_ALBUM_TITLE, metadata.get(XMPDM.ALBUM), properties);
|
||||
putRawValue(KEY_SONG_TITLE, metadata.get(TikaCoreProperties.TITLE), properties);
|
||||
putRawValue(KEY_ARTIST, metadata.get(XMPDM.ARTIST), properties);
|
||||
|
@@ -26,7 +26,11 @@
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.tika.metadata.Message;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
@@ -36,15 +40,12 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
|
||||
/**
|
||||
* Outlook MAPI format email metadata extractor.
|
||||
*
|
||||
* Configuration: (see MailMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
* Configuration: (see MailMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>sentDate:</b> -- cm:sentdate
|
||||
@@ -57,8 +58,7 @@ import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbe
|
||||
* <b>bccNames:</b> --
|
||||
* </pre>
|
||||
*
|
||||
* TIKA note - to/cc/bcc go into the html part, not the metadata.
|
||||
* Also, email addresses not included as yet.
|
||||
* TIKA note - to/cc/bcc go into the html part, not the metadata. Also, email addresses not included as yet.
|
||||
*
|
||||
* @author Kevin Roast
|
||||
* @author adavis
|
||||
@@ -91,7 +91,7 @@ public class MailMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
|
||||
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties, Map<String,String> headers)
|
||||
Map<String, Serializable> properties, Map<String, String> headers)
|
||||
{
|
||||
putRawValue(KEY_ORIGINATOR, metadata.get(TikaCoreProperties.CREATOR), properties);
|
||||
putRawValue(KEY_SUBJECT, metadata.get(TikaCoreProperties.TITLE), properties);
|
||||
|
@@ -26,7 +26,11 @@
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.Office;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
@@ -36,17 +40,15 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
|
||||
/**
|
||||
* Office file format metadata extractor.
|
||||
*
|
||||
* Configuration: (see OfficeMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
* Configuration: (see OfficeMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* This extractor uses the POI library to extract the following:
|
||||
*
|
||||
* <pre>
|
||||
* <b>author:</b> -- cm:author
|
||||
* <b>title:</b> -- cm:title
|
||||
@@ -100,7 +102,7 @@ public class OfficeMetadataExtractor extends AbstractTikaMetadataExtractorEmbedd
|
||||
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties, Map<String,String> headers)
|
||||
Map<String, Serializable> properties, Map<String, String> headers)
|
||||
{
|
||||
putRawValue(KEY_CREATE_DATETIME, metadata.get(TikaCoreProperties.CREATED), properties);
|
||||
putRawValue(KEY_LAST_SAVE_DATETIME, metadata.get(TikaCoreProperties.MODIFIED), properties);
|
||||
|
@@ -26,7 +26,17 @@
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import static org.apache.tika.metadata.DublinCore.NAMESPACE_URI_DC;
|
||||
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
@@ -42,20 +52,12 @@ import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.xml.sax.ContentHandler;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
import static org.apache.tika.metadata.DublinCore.NAMESPACE_URI_DC;
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
|
||||
/**
|
||||
* {@code "application/vnd.oasis.opendocument..."} and {@code "applicationvnd.oasis.opendocument..."} metadata extractor.
|
||||
*
|
||||
* Configuration: (see OpenDocumentMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
* Configuration: (see OpenDocumentMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>creationDate:</b> -- cm:created
|
||||
@@ -120,7 +122,7 @@ public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties, Map<String, String> headers)
|
||||
Map<String, Serializable> properties, Map<String, String> headers)
|
||||
{
|
||||
putRawValue(KEY_CREATION_DATE, getDateOrNull(metadata.get(TikaCoreProperties.CREATED)), properties);
|
||||
final String creator = getCreator(metadata);
|
||||
@@ -167,8 +169,7 @@ public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
return dateFormatter.parseDateTime(dateString).toDate();
|
||||
}
|
||||
catch (IllegalArgumentException ignore)
|
||||
{
|
||||
}
|
||||
{}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
@@ -26,8 +26,8 @@
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import org.alfresco.transform.tika.transformers.Tika;
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.Parser;
|
||||
@@ -36,12 +36,13 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import org.alfresco.transform.tika.transformers.Tika;
|
||||
|
||||
/**
|
||||
* Metadata extractor for the PDF documents.
|
||||
*
|
||||
* Configuration: (see PdfBoxMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
* Configuration: (see PdfBoxMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>author:</b> -- cm:author
|
||||
|
@@ -26,19 +26,20 @@
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
|
||||
/**
|
||||
* POI-based metadata extractor for Office 07 documents. See http://poi.apache.org/ for information on POI.
|
||||
*
|
||||
* Configuration: (see PoiMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
* Configuration: (see PoiMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>author:</b> -- cm:author
|
||||
@@ -47,7 +48,7 @@ import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbe
|
||||
* <b>created:</b> -- cm:created
|
||||
* <b>Any custom property:</b> -- [not mapped]
|
||||
* </pre>
|
||||
|
||||
*
|
||||
* @author Nick Burch
|
||||
* @author Neil McErlean
|
||||
* @author Dmitry Velichkevich
|
||||
|
@@ -26,7 +26,13 @@
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.readTikaConfig;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Calendar;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
@@ -40,19 +46,12 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Calendar;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.readTikaConfig;
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
|
||||
/**
|
||||
* A Metadata Extractor which makes use of the Apache Tika Audio Parsers to extract metadata from media files.
|
||||
* For backwards compatibility reasons, this doesn't handle the MP3 format, which has its own dedicated extractor
|
||||
* in {@link MP3MetadataExtractor}
|
||||
* A Metadata Extractor which makes use of the Apache Tika Audio Parsers to extract metadata from media files. For backwards compatibility reasons, this doesn't handle the MP3 format, which has its own dedicated extractor in {@link MP3MetadataExtractor}
|
||||
*
|
||||
* Configuration: (see TikaAudioMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
* Configuration: (see TikaAudioMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>author:</b> -- cm:author
|
||||
@@ -75,7 +74,7 @@ public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractorEmb
|
||||
private static final Logger logger = LoggerFactory.getLogger(TikaAudioMetadataExtractor.class);
|
||||
|
||||
// The Audio related parsers we use
|
||||
private static final Parser[] parsers = new Parser[] {
|
||||
private static final Parser[] parsers = new Parser[]{
|
||||
new VorbisParser(),
|
||||
new FlacParser(),
|
||||
new MP4Parser()
|
||||
@@ -102,7 +101,7 @@ public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractorEmb
|
||||
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties, Map<String,String> headers)
|
||||
Map<String, Serializable> properties, Map<String, String> headers)
|
||||
{
|
||||
// Most things can go with the default Tika -> Alfresco Mapping
|
||||
// Handle the few special cases here
|
||||
@@ -124,21 +123,20 @@ public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractorEmb
|
||||
private Serializable generateReleaseDate(Metadata metadata)
|
||||
{
|
||||
String date = metadata.get(XMPDM.RELEASE_DATE);
|
||||
if(date == null || date.length() == 0)
|
||||
if (date == null || date.length() == 0)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
// Is it just a year?
|
||||
if(date.matches("\\d\\d\\d\\d"))
|
||||
if (date.matches("\\d\\d\\d\\d"))
|
||||
{
|
||||
// Just a year, we need a full date
|
||||
// Go for the 1st of the 1st
|
||||
Calendar c = Calendar.getInstance();
|
||||
c.set(
|
||||
Integer.parseInt(date), Calendar.JANUARY, 1,
|
||||
0, 0, 0
|
||||
);
|
||||
0, 0, 0);
|
||||
c.set(Calendar.MILLISECOND, 0);
|
||||
return c.getTime();
|
||||
}
|
||||
@@ -150,8 +148,9 @@ public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractorEmb
|
||||
/**
|
||||
* Generate the description
|
||||
*
|
||||
* @param metadata the metadata extracted from the file
|
||||
* @return the description
|
||||
* @param metadata
|
||||
* the metadata extracted from the file
|
||||
* @return the description
|
||||
*/
|
||||
private String generateDescription(Metadata metadata)
|
||||
{
|
||||
|
@@ -26,7 +26,13 @@
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.readTikaConfig;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TIFF;
|
||||
@@ -36,19 +42,12 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EXTRACTOR;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.readTikaConfig;
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
|
||||
/**
|
||||
* A Metadata Extractor which makes use of the Apache Tika auto-detection to select the best parser to extract the
|
||||
* metadata from a document. This will be used for all files which Tika can handle, but where no other more explicit
|
||||
* extractor is defined.
|
||||
* A Metadata Extractor which makes use of the Apache Tika auto-detection to select the best parser to extract the metadata from a document. This will be used for all files which Tika can handle, but where no other more explicit extractor is defined.
|
||||
*
|
||||
* Configuration: (see TikaAutoMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
* Configuration: (see TikaAutoMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>author:</b> -- cm:author
|
||||
@@ -92,27 +91,21 @@ public class TikaAutoMetadataExtractor extends AbstractTikaMetadataExtractorEmbe
|
||||
}
|
||||
|
||||
/**
|
||||
* Because some editors use JPEG_IMAGE_HEIGHT_TAG when
|
||||
* saving JPEG images , a more reliable source for
|
||||
* image size are the values provided by Tika
|
||||
* and not the exif/tiff metadata read from the file
|
||||
* This will override the tiff:Image size
|
||||
* which gets embedded into the alfresco node properties
|
||||
* for jpeg files that contain such exif information
|
||||
* Because some editors use JPEG_IMAGE_HEIGHT_TAG when saving JPEG images , a more reliable source for image size are the values provided by Tika and not the exif/tiff metadata read from the file This will override the tiff:Image size which gets embedded into the alfresco node properties for jpeg files that contain such exif information
|
||||
*/
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties, Map<String,String> headers)
|
||||
Map<String, Serializable> properties, Map<String, String> headers)
|
||||
{
|
||||
if (MIMETYPE_IMAGE_JPEG.equals(metadata.get(Metadata.CONTENT_TYPE)))
|
||||
{
|
||||
//check if the image has exif information
|
||||
// check if the image has exif information
|
||||
if (metadata.get(EXIF_IMAGE_WIDTH_TAG) != null
|
||||
&& metadata.get(EXIF_IMAGE_HEIGHT_TAG) != null
|
||||
&& metadata.get(COMPRESSION_TAG) != null)
|
||||
&& metadata.get(EXIF_IMAGE_HEIGHT_TAG) != null
|
||||
&& metadata.get(COMPRESSION_TAG) != null)
|
||||
{
|
||||
//replace the exif size properties that will be embedded in the node with
|
||||
//the guessed dimensions from Tika
|
||||
// replace the exif size properties that will be embedded in the node with
|
||||
// the guessed dimensions from Tika
|
||||
putRawValue(TIFF.IMAGE_LENGTH.getName(), extractSize(metadata.get(EXIF_IMAGE_HEIGHT_TAG)), properties);
|
||||
putRawValue(TIFF.IMAGE_WIDTH.getName(), extractSize(metadata.get(EXIF_IMAGE_WIDTH_TAG)), properties);
|
||||
putRawValue(JPEG_IMAGE_HEIGHT_TAG, metadata.get(EXIF_IMAGE_HEIGHT_TAG), properties);
|
||||
@@ -123,18 +116,18 @@ public class TikaAutoMetadataExtractor extends AbstractTikaMetadataExtractorEmbe
|
||||
}
|
||||
|
||||
/**
|
||||
* Exif metadata for size also returns the string "pixels"
|
||||
* after the number value , this function will
|
||||
* stop at the first non digit character found in the text
|
||||
* @param sizeText string text
|
||||
* Exif metadata for size also returns the string "pixels" after the number value , this function will stop at the first non digit character found in the text
|
||||
*
|
||||
* @param sizeText
|
||||
* string text
|
||||
* @return the size value
|
||||
*/
|
||||
private String extractSize(String sizeText)
|
||||
{
|
||||
StringBuilder sizeValue = new StringBuilder();
|
||||
for(char c : sizeText.toCharArray())
|
||||
for (char c : sizeText.toCharArray())
|
||||
{
|
||||
if(Character.isDigit(c))
|
||||
if (Character.isDigit(c))
|
||||
{
|
||||
sizeValue.append(c);
|
||||
}
|
||||
|
@@ -1,397 +1,494 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.parsers;
|
||||
|
||||
import static java.nio.charset.StandardCharsets.UTF_8;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_TIFF;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStream;
|
||||
import java.io.Reader;
|
||||
import java.net.URL;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.alfresco.transform.base.executors.RuntimeExec;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.io.output.NullOutputStream;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.io.TemporaryResources;
|
||||
import org.apache.tika.io.TikaInputStream;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.external.ExternalParser;
|
||||
import org.apache.tika.parser.external.ExternalParsersFactory;
|
||||
import org.apache.tika.parser.image.ImageParser;
|
||||
import org.apache.tika.parser.image.TiffParser;
|
||||
import org.apache.tika.parser.image.JpegParser;
|
||||
import org.apache.tika.sax.XHTMLContentHandler;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
public class ExifToolParser extends ExternalParser {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ExifToolParser.class);
|
||||
|
||||
private static final String EXIFTOOL_PARSER_CONFIG = "parsers/external/config/exiftool-parser.xml";
|
||||
|
||||
protected static final String DEFAULT_SEPARATOR = ", ";
|
||||
protected static final String SEPARATOR_SETTING = "-sep";
|
||||
|
||||
private String separator;
|
||||
|
||||
/**
 * Creates a parser without a {@link RuntimeExec}; simply delegates to {@link #ExifToolParser(RuntimeExec)} with {@code null}.
 */
public ExifToolParser() {
    this((RuntimeExec) null);
}
|
||||
|
||||
public ExifToolParser(RuntimeExec exifRuntimeExec) {
|
||||
super();
|
||||
try {
|
||||
List<ExternalParser> eParsers = ExternalParsersFactory.create(getExternalParserConfigURL());
|
||||
// if ExifTool is not installed then no parsers are returned
|
||||
if (eParsers.size() > 0) {
|
||||
ExternalParser eParser = eParsers.get(0);
|
||||
|
||||
String[] commandToBeExecuted;
|
||||
if (exifRuntimeExec==null) {
|
||||
logger.debug("Command to be executed determined from Tika ExternalParser");
|
||||
commandToBeExecuted = eParser.getCommand();
|
||||
} else {
|
||||
logger.debug("Command to be executed determined from RuntimeExec");
|
||||
commandToBeExecuted = exifRuntimeExec.getCommand();
|
||||
}
|
||||
if (commandToBeExecuted==null || commandToBeExecuted.length==0) {
|
||||
commandToBeExecuted = eParser.getCommand();
|
||||
}
|
||||
|
||||
String commandToBeExecutedAsString = String.join( " ", commandToBeExecuted);
|
||||
logger.debug("Command to be executed: " + commandToBeExecutedAsString );
|
||||
|
||||
this.setCommand(commandToBeExecutedAsString);
|
||||
this.setIgnoredLineConsumer(eParser.getIgnoredLineConsumer());
|
||||
this.setMetadataExtractionPatterns(eParser.getMetadataExtractionPatterns());
|
||||
this.setSupportedTypes(eParser.getSupportedTypes());
|
||||
} else {
|
||||
logger.error(
|
||||
"Error creating ExifToolParser from config, ExifToolExtractions not enabled. Please check ExifTool is installed correctly.");
|
||||
}
|
||||
} catch (IOException | TikaException e) {
|
||||
logger.error("Error creating ExifToolParser from config, ExifToolExtractions not enabled: ", e);
|
||||
}
|
||||
}
|
||||
|
||||
private URL getExternalParserConfigURL(){
|
||||
ClassLoader classLoader = ExifToolParser.class.getClassLoader();
|
||||
return classLoader.getResource(EXIFTOOL_PARSER_CONFIG);
|
||||
}
|
||||
|
||||
public void setSeparator(String sep) {
|
||||
this.separator = sep;
|
||||
}
|
||||
|
||||
public String getSeparator() {
|
||||
return this.separator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setCommand(String... command){
|
||||
super.setCommand(command);
|
||||
if (command.length==1) {
|
||||
setSeparator(findSeparator(command[0]));
|
||||
}
|
||||
else {
|
||||
setSeparator(DEFAULT_SEPARATOR);
|
||||
}
|
||||
}
|
||||
|
||||
protected String findSeparator(String command) {
|
||||
if (command.contains(SEPARATOR_SETTING)) {
|
||||
int start = command.indexOf(SEPARATOR_SETTING)+SEPARATOR_SETTING.length()+1;
|
||||
String separator = DEFAULT_SEPARATOR;
|
||||
if (command.charAt(start)=='\"') {
|
||||
//get all chars up to the next \"
|
||||
int end = command.indexOf("\"", start+1);
|
||||
separator = command.substring(start+1, end);
|
||||
}
|
||||
else {
|
||||
int end = command.indexOf(" ", start);
|
||||
separator = command.substring(start, end);
|
||||
}
|
||||
return separator;
|
||||
}
|
||||
return DEFAULT_SEPARATOR;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adapted from {@link org.apache.tika.parser.external.ExternalParser}
|
||||
* due to errors attempting to {@link #extractMetadata} from the errorStream in original implementation. <p>
|
||||
* Executes the configured external command and passes the given document
|
||||
* stream as a simple XHTML document to the given SAX content handler.
|
||||
* Metadata is only extracted if {@link #setMetadataExtractionPatterns(Map)}
|
||||
* has been called to set patterns.
|
||||
*/
|
||||
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
|
||||
throws IOException, SAXException, TikaException {
|
||||
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
|
||||
|
||||
MediaType mediaType = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
|
||||
TemporaryResources tmp = new TemporaryResources();
|
||||
try {
|
||||
TikaInputStream tis = TikaInputStream.get(stream, tmp);
|
||||
|
||||
if (this.getSupportedTypes().contains(mediaType)) {
|
||||
parse(tis, xhtml, metadata, tmp);
|
||||
}
|
||||
|
||||
switch (mediaType.getType()+"/"+mediaType.getSubtype()) {
|
||||
case MIMETYPE_IMAGE_JPEG:
|
||||
parseAdditional(new JpegParser(), tis, handler, metadata, context, mediaType);
|
||||
break;
|
||||
case MIMETYPE_IMAGE_TIFF:
|
||||
parseAdditional(new TiffParser(), tis, handler, metadata, context, mediaType);
|
||||
break;
|
||||
default:
|
||||
parseAdditional(new ImageParser(), tis, handler, metadata, context, mediaType);
|
||||
}
|
||||
} finally {
|
||||
tmp.dispose();
|
||||
}
|
||||
}
|
||||
|
||||
private void parseAdditional(Parser parser, TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext context,
|
||||
MediaType mediaType) throws IOException, SAXException, TikaException {
|
||||
if (parser.getSupportedTypes(context).contains(mediaType)) {
|
||||
parser.parse(tis, handler, metadata, context);
|
||||
}
|
||||
}
|
||||
|
||||
private void parse(TikaInputStream stream, XHTMLContentHandler xhtml, Metadata metadata, TemporaryResources tmp)
|
||||
throws IOException, SAXException, TikaException {
|
||||
boolean inputToStdIn = true;
|
||||
boolean outputFromStdOut = true;
|
||||
boolean hasPatterns = (getMetadataExtractionPatterns() != null && !getMetadataExtractionPatterns().isEmpty());
|
||||
|
||||
File output = null;
|
||||
|
||||
// Build our getCommand()
|
||||
String[] cmd;
|
||||
if (getCommand().length == 1) {
|
||||
cmd = getCommand()[0].split(" ");
|
||||
} else {
|
||||
cmd = new String[getCommand().length];
|
||||
System.arraycopy(getCommand(), 0, cmd, 0, getCommand().length);
|
||||
}
|
||||
for (int i = 0; i < cmd.length; i++) {
|
||||
if (cmd[i].indexOf(INPUT_FILE_TOKEN) != -1) {
|
||||
cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN, stream.getFile().getPath());
|
||||
inputToStdIn = false;
|
||||
}
|
||||
if (cmd[i].indexOf(OUTPUT_FILE_TOKEN) != -1) {
|
||||
output = tmp.createTemporaryFile();
|
||||
outputFromStdOut = false;
|
||||
cmd[i] = cmd[i].replace(OUTPUT_FILE_TOKEN, output.getPath());
|
||||
}
|
||||
}
|
||||
|
||||
// Execute
|
||||
Process process = null;
|
||||
try {
|
||||
if (cmd.length == 1) {
|
||||
process = Runtime.getRuntime().exec(cmd[0]);
|
||||
} else {
|
||||
process = Runtime.getRuntime().exec(cmd);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
try {
|
||||
if (inputToStdIn) {
|
||||
sendInput(process, stream);
|
||||
} else {
|
||||
process.getOutputStream().close();
|
||||
}
|
||||
|
||||
InputStream out = process.getInputStream();
|
||||
InputStream err = process.getErrorStream();
|
||||
|
||||
if (hasPatterns) {
|
||||
|
||||
if (outputFromStdOut) {
|
||||
extractOutput(out, xhtml);
|
||||
} else {
|
||||
extractMetadata(out, metadata);
|
||||
}
|
||||
} else {
|
||||
ignoreStream(err);
|
||||
|
||||
if (outputFromStdOut) {
|
||||
extractOutput(out, xhtml);
|
||||
} else {
|
||||
ignoreStream(out);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
try {
|
||||
process.waitFor();
|
||||
} catch (InterruptedException ignore) {
|
||||
}
|
||||
}
|
||||
|
||||
// Grab the output if we haven't already
|
||||
if (!outputFromStdOut) {
|
||||
extractOutput(new FileInputStream(output), xhtml);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adapted from {@link org.apache.tika.parser.external.ExternalParser}<p>
|
||||
* Starts a thread that extracts the contents of the standard output
|
||||
* stream of the given process to the given XHTML content handler.
|
||||
* The standard output stream is closed once fully processed.
|
||||
*
|
||||
* @param stream stream
|
||||
* @param xhtml XHTML content handler
|
||||
* @throws SAXException if the XHTML SAX events could not be handled
|
||||
* @throws IOException if an input error occurred
|
||||
*/
|
||||
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
|
||||
try (Reader reader = new InputStreamReader(stream, UTF_8)) {
|
||||
xhtml.startDocument();
|
||||
xhtml.startElement("p");
|
||||
char[] buffer = new char[1024];
|
||||
for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
|
||||
xhtml.characters(buffer, 0, n);
|
||||
}
|
||||
xhtml.endElement("p");
|
||||
xhtml.endDocument();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adapted from {@link org.apache.tika.parser.external.ExternalParser}<p>
|
||||
* Starts a thread that sends the contents of the given input stream
|
||||
* to the standard input stream of the given process. Potential
|
||||
* exceptions are ignored, and the standard input stream is closed
|
||||
* once fully processed. Note that the given input stream is <em>not</em>
|
||||
* closed by this method.
|
||||
*
|
||||
* @param process process
|
||||
* @param stream input stream
|
||||
*/
|
||||
private void sendInput(final Process process, final InputStream stream) {
|
||||
Thread t = new Thread() {
|
||||
public void run() {
|
||||
OutputStream stdin = process.getOutputStream();
|
||||
try {
|
||||
IOUtils.copy(stream, stdin);
|
||||
} catch (IOException e) {
|
||||
logger.error( e.getMessage());
|
||||
}
|
||||
}
|
||||
};
|
||||
t.start();
|
||||
try {
|
||||
t.join();
|
||||
} catch (InterruptedException ignore) {
|
||||
logger.error(ignore.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adapted from {@link org.apache.tika.parser.external.ExternalParser}<p>
|
||||
* Starts a thread that reads and discards the contents of the
|
||||
* standard stream of the given process. Potential exceptions
|
||||
* are ignored, and the stream is closed once fully processed.
|
||||
*
|
||||
* @param stream stream
|
||||
*/
|
||||
private void ignoreStream(final InputStream stream) {
|
||||
Thread t = new Thread() {
|
||||
public void run() {
|
||||
try {
|
||||
IOUtils.copy(stream, NullOutputStream.NULL_OUTPUT_STREAM);
|
||||
} catch (IOException e) {
|
||||
} finally {
|
||||
IOUtils.closeQuietly(stream);
|
||||
}
|
||||
}
|
||||
};
|
||||
t.start();
|
||||
try {
|
||||
t.join();
|
||||
} catch (InterruptedException ignore) {
|
||||
}
|
||||
}
|
||||
|
||||
private void extractMetadata(final InputStream stream, final Metadata metadata) {
|
||||
Thread t = new Thread() {
|
||||
public void run() {
|
||||
BufferedReader reader;
|
||||
reader = new BufferedReader(new InputStreamReader(stream, UTF_8));
|
||||
try {
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
for (Pattern p : getMetadataExtractionPatterns().keySet()) {
|
||||
Matcher m = p.matcher(line);
|
||||
if (m.find()) {
|
||||
if (getMetadataExtractionPatterns().get(p) != null
|
||||
&& !getMetadataExtractionPatterns().get(p).equals("")) {
|
||||
metadata.add(getMetadataExtractionPatterns().get(p), m.group(1));
|
||||
} else {
|
||||
metadata.add(m.group(1), m.group(2));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// Ignore
|
||||
} finally {
|
||||
IOUtils.closeQuietly(reader);
|
||||
IOUtils.closeQuietly(stream);
|
||||
}
|
||||
}
|
||||
};
|
||||
t.start();
|
||||
try {
|
||||
t.join();
|
||||
} catch (InterruptedException ignore) {
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.parsers;
|
||||
|
||||
import static java.nio.charset.StandardCharsets.UTF_8;
|
||||
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_TIFF;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStream;
|
||||
import java.io.Reader;
|
||||
import java.net.URL;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.io.output.NullOutputStream;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.io.TemporaryResources;
|
||||
import org.apache.tika.io.TikaInputStream;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.external.ExternalParser;
|
||||
import org.apache.tika.parser.external.ExternalParsersFactory;
|
||||
import org.apache.tika.parser.image.ImageParser;
|
||||
import org.apache.tika.parser.image.JpegParser;
|
||||
import org.apache.tika.parser.image.TiffParser;
|
||||
import org.apache.tika.sax.XHTMLContentHandler;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import org.alfresco.transform.base.executors.RuntimeExec;
|
||||
|
||||
public class ExifToolParser extends ExternalParser
|
||||
{
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ExifToolParser.class);
|
||||
|
||||
private static final String EXIFTOOL_PARSER_CONFIG = "parsers/external/config/exiftool-parser.xml";
|
||||
|
||||
protected static final String DEFAULT_SEPARATOR = ", ";
|
||||
protected static final String SEPARATOR_SETTING = "-sep";
|
||||
|
||||
private String separator;
|
||||
|
||||
/**
 * Creates a parser without a {@code RuntimeExec}, so the command is taken from the Tika external parser configuration.
 */
public ExifToolParser()
{
    this((RuntimeExec) null);
}
|
||||
|
||||
public ExifToolParser(RuntimeExec exifRuntimeExec)
|
||||
{
|
||||
super();
|
||||
try
|
||||
{
|
||||
List<ExternalParser> eParsers = ExternalParsersFactory.create(getExternalParserConfigURL());
|
||||
// if ExifTool is not installed then no parsers are returned
|
||||
if (eParsers.size() > 0)
|
||||
{
|
||||
ExternalParser eParser = eParsers.get(0);
|
||||
|
||||
String[] commandToBeExecuted;
|
||||
if (exifRuntimeExec == null)
|
||||
{
|
||||
logger.debug("Command to be executed determined from Tika ExternalParser");
|
||||
commandToBeExecuted = eParser.getCommand();
|
||||
}
|
||||
else
|
||||
{
|
||||
logger.debug("Command to be executed determined from RuntimeExec");
|
||||
commandToBeExecuted = exifRuntimeExec.getCommand();
|
||||
}
|
||||
if (commandToBeExecuted == null || commandToBeExecuted.length == 0)
|
||||
{
|
||||
commandToBeExecuted = eParser.getCommand();
|
||||
}
|
||||
|
||||
String commandToBeExecutedAsString = String.join(" ", commandToBeExecuted);
|
||||
logger.debug("Command to be executed: " + commandToBeExecutedAsString);
|
||||
|
||||
this.setCommand(commandToBeExecutedAsString);
|
||||
this.setIgnoredLineConsumer(eParser.getIgnoredLineConsumer());
|
||||
this.setMetadataExtractionPatterns(eParser.getMetadataExtractionPatterns());
|
||||
this.setSupportedTypes(eParser.getSupportedTypes());
|
||||
}
|
||||
else
|
||||
{
|
||||
logger.error(
|
||||
"Error creating ExifToolParser from config, ExifToolExtractions not enabled. Please check ExifTool is installed correctly.");
|
||||
}
|
||||
}
|
||||
catch (IOException | TikaException e)
|
||||
{
|
||||
logger.error("Error creating ExifToolParser from config, ExifToolExtractions not enabled: ", e);
|
||||
}
|
||||
}
|
||||
|
||||
private URL getExternalParserConfigURL()
|
||||
{
|
||||
ClassLoader classLoader = ExifToolParser.class.getClassLoader();
|
||||
return classLoader.getResource(EXIFTOOL_PARSER_CONFIG);
|
||||
}
|
||||
|
||||
public void setSeparator(String sep)
|
||||
{
|
||||
this.separator = sep;
|
||||
}
|
||||
|
||||
public String getSeparator()
|
||||
{
|
||||
return this.separator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setCommand(String... command)
|
||||
{
|
||||
super.setCommand(command);
|
||||
if (command.length == 1)
|
||||
{
|
||||
setSeparator(findSeparator(command[0]));
|
||||
}
|
||||
else
|
||||
{
|
||||
setSeparator(DEFAULT_SEPARATOR);
|
||||
}
|
||||
}
|
||||
|
||||
protected String findSeparator(String command)
|
||||
{
|
||||
if (command.contains(SEPARATOR_SETTING))
|
||||
{
|
||||
int start = command.indexOf(SEPARATOR_SETTING) + SEPARATOR_SETTING.length() + 1;
|
||||
String separator = DEFAULT_SEPARATOR;
|
||||
if (command.charAt(start) == '\"')
|
||||
{
|
||||
// get all chars up to the next \"
|
||||
int end = command.indexOf("\"", start + 1);
|
||||
separator = command.substring(start + 1, end);
|
||||
}
|
||||
else
|
||||
{
|
||||
int end = command.indexOf(" ", start);
|
||||
separator = command.substring(start, end);
|
||||
}
|
||||
return separator;
|
||||
}
|
||||
return DEFAULT_SEPARATOR;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adapted from {@link org.apache.tika.parser.external.ExternalParser} due to errors attempting to {@link #extractMetadata} from the errorStream in original implementation.
|
||||
* <p>
|
||||
* Executes the configured external command and passes the given document stream as a simple XHTML document to the given SAX content handler. Metadata is only extracted if {@link #setMetadataExtractionPatterns(Map)} has been called to set patterns.
|
||||
*/
|
||||
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
|
||||
throws IOException, SAXException, TikaException
|
||||
{
|
||||
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
|
||||
|
||||
MediaType mediaType = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
|
||||
TemporaryResources tmp = new TemporaryResources();
|
||||
try
|
||||
{
|
||||
TikaInputStream tis = TikaInputStream.get(stream, tmp);
|
||||
|
||||
if (this.getSupportedTypes().contains(mediaType))
|
||||
{
|
||||
parse(tis, xhtml, metadata, tmp);
|
||||
}
|
||||
|
||||
switch (mediaType.getType() + "/" + mediaType.getSubtype())
|
||||
{
|
||||
case MIMETYPE_IMAGE_JPEG:
|
||||
parseAdditional(new JpegParser(), tis, handler, metadata, context, mediaType);
|
||||
break;
|
||||
case MIMETYPE_IMAGE_TIFF:
|
||||
parseAdditional(new TiffParser(), tis, handler, metadata, context, mediaType);
|
||||
break;
|
||||
default:
|
||||
parseAdditional(new ImageParser(), tis, handler, metadata, context, mediaType);
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
tmp.dispose();
|
||||
}
|
||||
}
|
||||
|
||||
private void parseAdditional(Parser parser, TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext context,
|
||||
MediaType mediaType) throws IOException, SAXException, TikaException
|
||||
{
|
||||
if (parser.getSupportedTypes(context).contains(mediaType))
|
||||
{
|
||||
parser.parse(tis, handler, metadata, context);
|
||||
}
|
||||
}
|
||||
|
||||
private void parse(TikaInputStream stream, XHTMLContentHandler xhtml, Metadata metadata, TemporaryResources tmp)
|
||||
throws IOException, SAXException, TikaException
|
||||
{
|
||||
boolean inputToStdIn = true;
|
||||
boolean outputFromStdOut = true;
|
||||
boolean hasPatterns = (getMetadataExtractionPatterns() != null && !getMetadataExtractionPatterns().isEmpty());
|
||||
|
||||
File output = null;
|
||||
|
||||
// Build our getCommand()
|
||||
String[] cmd;
|
||||
if (getCommand().length == 1)
|
||||
{
|
||||
cmd = getCommand()[0].split(" ");
|
||||
}
|
||||
else
|
||||
{
|
||||
cmd = new String[getCommand().length];
|
||||
System.arraycopy(getCommand(), 0, cmd, 0, getCommand().length);
|
||||
}
|
||||
for (int i = 0; i < cmd.length; i++)
|
||||
{
|
||||
if (cmd[i].indexOf(INPUT_FILE_TOKEN) != -1)
|
||||
{
|
||||
cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN, stream.getFile().getPath());
|
||||
inputToStdIn = false;
|
||||
}
|
||||
if (cmd[i].indexOf(OUTPUT_FILE_TOKEN) != -1)
|
||||
{
|
||||
output = tmp.createTemporaryFile();
|
||||
outputFromStdOut = false;
|
||||
cmd[i] = cmd[i].replace(OUTPUT_FILE_TOKEN, output.getPath());
|
||||
}
|
||||
}
|
||||
|
||||
// Execute
|
||||
Process process = null;
|
||||
try
|
||||
{
|
||||
if (cmd.length == 1)
|
||||
{
|
||||
process = Runtime.getRuntime().exec(cmd[0]);
|
||||
}
|
||||
else
|
||||
{
|
||||
process = Runtime.getRuntime().exec(cmd);
|
||||
}
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
if (inputToStdIn)
|
||||
{
|
||||
sendInput(process, stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
process.getOutputStream().close();
|
||||
}
|
||||
|
||||
InputStream out = process.getInputStream();
|
||||
InputStream err = process.getErrorStream();
|
||||
|
||||
if (hasPatterns)
|
||||
{
|
||||
|
||||
if (outputFromStdOut)
|
||||
{
|
||||
extractOutput(out, xhtml);
|
||||
}
|
||||
else
|
||||
{
|
||||
extractMetadata(out, metadata);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ignoreStream(err);
|
||||
|
||||
if (outputFromStdOut)
|
||||
{
|
||||
extractOutput(out, xhtml);
|
||||
}
|
||||
else
|
||||
{
|
||||
ignoreStream(out);
|
||||
}
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
try
|
||||
{
|
||||
process.waitFor();
|
||||
}
|
||||
catch (InterruptedException ignore)
|
||||
{}
|
||||
}
|
||||
|
||||
// Grab the output if we haven't already
|
||||
if (!outputFromStdOut)
|
||||
{
|
||||
extractOutput(new FileInputStream(output), xhtml);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adapted from {@link org.apache.tika.parser.external.ExternalParser}
|
||||
* <p>
|
||||
* Starts a thread that extracts the contents of the standard output stream of the given process to the given XHTML content handler. The standard output stream is closed once fully processed.
|
||||
*
|
||||
* @param stream
|
||||
* stream
|
||||
* @param xhtml
|
||||
* XHTML content handler
|
||||
* @throws SAXException
|
||||
* if the XHTML SAX events could not be handled
|
||||
* @throws IOException
|
||||
* if an input error occurred
|
||||
*/
|
||||
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException
|
||||
{
|
||||
try (Reader reader = new InputStreamReader(stream, UTF_8))
|
||||
{
|
||||
xhtml.startDocument();
|
||||
xhtml.startElement("p");
|
||||
char[] buffer = new char[1024];
|
||||
for (int n = reader.read(buffer); n != -1; n = reader.read(buffer))
|
||||
{
|
||||
xhtml.characters(buffer, 0, n);
|
||||
}
|
||||
xhtml.endElement("p");
|
||||
xhtml.endDocument();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adapted from {@link org.apache.tika.parser.external.ExternalParser}
|
||||
* <p>
|
||||
* Starts a thread that sends the contents of the given input stream to the standard input stream of the given process. Potential exceptions are ignored, and the standard input stream is closed once fully processed. Note that the given input stream is <em>not</em> closed by this method.
|
||||
*
|
||||
* @param process
|
||||
* process
|
||||
* @param stream
|
||||
* input stream
|
||||
*/
|
||||
private void sendInput(final Process process, final InputStream stream)
|
||||
{
|
||||
Thread t = new Thread() {
|
||||
public void run()
|
||||
{
|
||||
OutputStream stdin = process.getOutputStream();
|
||||
try
|
||||
{
|
||||
IOUtils.copy(stream, stdin);
|
||||
}
|
||||
catch (IOException e)
|
||||
{
|
||||
logger.error(e.getMessage());
|
||||
}
|
||||
}
|
||||
};
|
||||
t.start();
|
||||
try
|
||||
{
|
||||
t.join();
|
||||
}
|
||||
catch (InterruptedException ignore)
|
||||
{
|
||||
logger.error(ignore.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adapted from {@link org.apache.tika.parser.external.ExternalParser}
|
||||
* <p>
|
||||
* Starts a thread that reads and discards the contents of the standard stream of the given process. Potential exceptions are ignored, and the stream is closed once fully processed.
|
||||
*
|
||||
* @param stream
|
||||
* stream
|
||||
*/
|
||||
private void ignoreStream(final InputStream stream)
|
||||
{
|
||||
Thread t = new Thread() {
|
||||
public void run()
|
||||
{
|
||||
try
|
||||
{
|
||||
IOUtils.copy(stream, NullOutputStream.NULL_OUTPUT_STREAM);
|
||||
}
|
||||
catch (IOException e)
|
||||
{}
|
||||
finally
|
||||
{
|
||||
IOUtils.closeQuietly(stream);
|
||||
}
|
||||
}
|
||||
};
|
||||
t.start();
|
||||
try
|
||||
{
|
||||
t.join();
|
||||
}
|
||||
catch (InterruptedException ignore)
|
||||
{}
|
||||
}
|
||||
|
||||
private void extractMetadata(final InputStream stream, final Metadata metadata)
|
||||
{
|
||||
Thread t = new Thread() {
|
||||
public void run()
|
||||
{
|
||||
BufferedReader reader;
|
||||
reader = new BufferedReader(new InputStreamReader(stream, UTF_8));
|
||||
try
|
||||
{
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null)
|
||||
{
|
||||
for (Pattern p : getMetadataExtractionPatterns().keySet())
|
||||
{
|
||||
Matcher m = p.matcher(line);
|
||||
if (m.find())
|
||||
{
|
||||
if (getMetadataExtractionPatterns().get(p) != null
|
||||
&& !getMetadataExtractionPatterns().get(p).equals(""))
|
||||
{
|
||||
metadata.add(getMetadataExtractionPatterns().get(p), m.group(1));
|
||||
}
|
||||
else
|
||||
{
|
||||
metadata.add(m.group(1), m.group(2));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (IOException e)
|
||||
{
|
||||
// Ignore
|
||||
}
|
||||
finally
|
||||
{
|
||||
IOUtils.closeQuietly(reader);
|
||||
IOUtils.closeQuietly(stream);
|
||||
}
|
||||
}
|
||||
};
|
||||
t.start();
|
||||
try
|
||||
{
|
||||
t.join();
|
||||
}
|
||||
catch (InterruptedException ignore)
|
||||
{}
|
||||
}
|
||||
}
|
||||
|
@@ -47,14 +47,7 @@ import org.xml.sax.SAXException;
|
||||
///////// THIS FILE WAS A COPY OF THE CODE IN alfresco-repository /////////////
|
||||
|
||||
/**
|
||||
* <a href="http://tika.apache.org/Apache Tika">Apache Tika</a> assumes that
|
||||
* you either know exactly what your content is, or that
|
||||
* you'll leave it to auto-detection.
|
||||
* Within Alfresco, we usually do know. However, from time
|
||||
* to time, we don't know if we have one of the old or one
|
||||
* of the new office files (eg .xls and .xlsx).
|
||||
* This class automatically selects the appropriate
|
||||
* old (OLE2) or new (OOXML) Tika parser as required.
|
||||
* <a href="http://tika.apache.org/Apache Tika">Apache Tika</a> assumes that you either know exactly what your content is, or that you'll leave it to auto-detection. Within Alfresco, we usually do know. However, from time to time, we don't know if we have one of the old or one of the new office files (eg .xls and .xlsx). This class automatically selects the appropriate old (OLE2) or new (OOXML) Tika parser as required.
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
@@ -72,9 +65,9 @@ public class TikaOfficeDetectParser implements Parser
|
||||
}
|
||||
|
||||
public void parse(InputStream stream,
|
||||
ContentHandler handler, Metadata metadata,
|
||||
ParseContext parseContext) throws IOException, SAXException,
|
||||
TikaException
|
||||
ContentHandler handler, Metadata metadata,
|
||||
ParseContext parseContext) throws IOException, SAXException,
|
||||
TikaException
|
||||
{
|
||||
byte[] initial4 = new byte[4];
|
||||
InputStream wrapped;
|
||||
@@ -109,8 +102,8 @@ public class TikaOfficeDetectParser implements Parser
|
||||
* @deprecated This method will be removed in Apache Tika 1.0.
|
||||
*/
|
||||
public void parse(InputStream stream,
|
||||
ContentHandler handler, Metadata metadata)
|
||||
throws IOException, SAXException, TikaException
|
||||
ContentHandler handler, Metadata metadata)
|
||||
throws IOException, SAXException, TikaException
|
||||
{
|
||||
parse(stream, handler, metadata, new ParseContext());
|
||||
}
|
||||
|
@@ -26,18 +26,7 @@
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.alfresco.transform.base.CustomTransformer;
|
||||
import org.alfresco.transform.base.TransformManager;
|
||||
import org.alfresco.transform.base.executors.RuntimeExec;
|
||||
import org.alfresco.transform.base.logging.LogEntry;
|
||||
import org.alfresco.transform.common.RequestParamMap;
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import static java.lang.Boolean.parseBoolean;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
@@ -47,7 +36,19 @@ import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
import static java.lang.Boolean.parseBoolean;
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
|
||||
import org.alfresco.transform.base.CustomTransformer;
|
||||
import org.alfresco.transform.base.TransformManager;
|
||||
import org.alfresco.transform.base.executors.RuntimeExec;
|
||||
import org.alfresco.transform.base.logging.LogEntry;
|
||||
import org.alfresco.transform.common.RequestParamMap;
|
||||
|
||||
public abstract class AbstractTikaTransformer implements CustomTransformer
|
||||
{
|
||||
@@ -85,7 +86,7 @@ public abstract class AbstractTikaTransformer implements CustomTransformer
|
||||
public String getTransformerName()
|
||||
{
|
||||
String simpleClassName = getClass().getSimpleName();
|
||||
return simpleClassName.substring(0, simpleClassName.length()-"Transformer".length());
|
||||
return simpleClassName.substring(0, simpleClassName.length() - "Transformer".length());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@@ -26,8 +26,33 @@
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_PNG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_TIFF;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_CSV;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XHTML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XML;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.io.Writer;
|
||||
import java.net.URL;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.regex.Pattern;
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.TransformerConfigurationException;
|
||||
import javax.xml.transform.sax.SAXTransformerFactory;
|
||||
import javax.xml.transform.sax.TransformerHandler;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import org.alfresco.transform.tika.parsers.TikaOfficeDetectParser;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
@@ -49,31 +74,7 @@ import org.xml.sax.Attributes;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.TransformerConfigurationException;
|
||||
import javax.xml.transform.sax.SAXTransformerFactory;
|
||||
import javax.xml.transform.sax.TransformerHandler;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.io.Writer;
|
||||
import java.net.URL;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_PNG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_TIFF;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_CSV;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XHTML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XML;
|
||||
import org.alfresco.transform.tika.parsers.TikaOfficeDetectParser;
|
||||
|
||||
@Component
|
||||
public class Tika
|
||||
@@ -113,10 +114,9 @@ public class Tika
|
||||
public static final Parser tikaOfficeDetectParser = new TikaOfficeDetectParser();
|
||||
public final PDFParserConfig pdfParserConfig = new PDFParserConfig();
|
||||
|
||||
public static final DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector()
|
||||
{
|
||||
public static final DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector() {
|
||||
private final List<String> disabledMediaTypes = ImmutableList.of(MIMETYPE_IMAGE_JPEG,
|
||||
MIMETYPE_IMAGE_TIFF, MIMETYPE_IMAGE_PNG);
|
||||
MIMETYPE_IMAGE_TIFF, MIMETYPE_IMAGE_PNG);
|
||||
|
||||
@Override
|
||||
public boolean select(Metadata metadata)
|
||||
@@ -198,7 +198,7 @@ public class Tika
|
||||
notExtractBookmarksText = notExtractBookmarksText == null ? false : notExtractBookmarksText;
|
||||
|
||||
transform(parser, documentSelector, includeContents, notExtractBookmarksText, inputStream,
|
||||
outputStream, targetMimetype, targetEncoding);
|
||||
outputStream, targetMimetype, targetEncoding);
|
||||
}
|
||||
|
||||
private String getValue(String arg, boolean valueExpected, Object value, String optionName)
|
||||
@@ -220,23 +220,23 @@ public class Tika
|
||||
}
|
||||
|
||||
private void transform(Parser parser, DocumentSelector documentSelector,
|
||||
Boolean includeContents,
|
||||
Boolean notExtractBookmarksText,
|
||||
InputStream inputStream,
|
||||
OutputStream outputStream, String targetMimetype, String targetEncoding)
|
||||
Boolean includeContents,
|
||||
Boolean notExtractBookmarksText,
|
||||
InputStream inputStream,
|
||||
OutputStream outputStream, String targetMimetype, String targetEncoding)
|
||||
{
|
||||
try (Writer ow = new BufferedWriter(new OutputStreamWriter(outputStream, targetEncoding)))
|
||||
{
|
||||
Metadata metadata = new Metadata();
|
||||
ParseContext context = buildParseContext(documentSelector, includeContents,
|
||||
notExtractBookmarksText);
|
||||
notExtractBookmarksText);
|
||||
ContentHandler handler = getContentHandler(targetMimetype, ow);
|
||||
|
||||
parser.parse(inputStream, handler, metadata, context);
|
||||
}
|
||||
catch (UnsupportedEncodingException e)
|
||||
{
|
||||
throw new IllegalArgumentException("Unsupported encoding "+e.getMessage(), e);
|
||||
throw new IllegalArgumentException("Unsupported encoding " + e.getMessage(), e);
|
||||
}
|
||||
catch (SAXException | TikaException | IOException e)
|
||||
{
|
||||
@@ -268,7 +268,7 @@ public class Tika
|
||||
return new ExpandedTitleContentHandler(transformerHandler);
|
||||
}
|
||||
else if (MIMETYPE_XHTML.equals(targetMimetype) ||
|
||||
MIMETYPE_XML.equals(targetMimetype))
|
||||
MIMETYPE_XML.equals(targetMimetype))
|
||||
{
|
||||
transformerHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, XML);
|
||||
}
|
||||
@@ -307,7 +307,7 @@ public class Tika
|
||||
|
||||
@Override
|
||||
public void ignorableWhitespace(char[] ch, int start, int length)
|
||||
throws SAXException
|
||||
throws SAXException
|
||||
{
|
||||
if (length == 1 && ch[0] == '\t')
|
||||
{
|
||||
@@ -321,7 +321,7 @@ public class Tika
|
||||
|
||||
@Override
|
||||
public void characters(char[] ch, int start, int length)
|
||||
throws SAXException
|
||||
throws SAXException
|
||||
{
|
||||
if (inCell)
|
||||
{
|
||||
@@ -357,7 +357,7 @@ public class Tika
|
||||
|
||||
@Override
|
||||
public void startElement(String uri, String localName, String name,
|
||||
Attributes atts) throws SAXException
|
||||
Attributes atts) throws SAXException
|
||||
{
|
||||
if (localName.equals("td"))
|
||||
{
|
||||
@@ -376,7 +376,7 @@ public class Tika
|
||||
|
||||
@Override
|
||||
public void endElement(String uri, String localName, String name)
|
||||
throws SAXException
|
||||
throws SAXException
|
||||
{
|
||||
if (localName.equals("td"))
|
||||
{
|
||||
@@ -395,7 +395,7 @@ public class Tika
|
||||
}
|
||||
|
||||
private ParseContext buildParseContext(DocumentSelector documentSelector,
|
||||
Boolean includeContents, Boolean notExtractBookmarksText)
|
||||
Boolean includeContents, Boolean notExtractBookmarksText)
|
||||
{
|
||||
ParseContext context = new ParseContext();
|
||||
|
||||
|
@@ -29,10 +29,11 @@ package org.alfresco.transform.tika;
|
||||
|
||||
import org.alfresco.transform.base.LivenessReadinessProbeTest;
|
||||
|
||||
|
||||
public class TikaLivenessReadinessProbeIT extends LivenessReadinessProbeTest {
|
||||
public class TikaLivenessReadinessProbeIT extends LivenessReadinessProbeTest
|
||||
{
|
||||
@Override
|
||||
protected ImagesForTests getImageForTest() {
|
||||
protected ImagesForTests getImageForTest()
|
||||
{
|
||||
return new ImagesForTests("alfresco-tika", "text/plain", "text/plain", "original.txt");
|
||||
}
|
||||
}
|
||||
|
@@ -26,13 +26,6 @@
|
||||
*/
|
||||
package org.alfresco.transform.tika;
|
||||
|
||||
import org.alfresco.transform.base.clients.FileInfo;
|
||||
import org.alfresco.transform.base.metadata.AbstractMetadataExtractsIT;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static org.alfresco.transform.base.clients.FileInfo.testFile;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_APP_DWG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_AUDIO_MP4;
|
||||
@@ -79,6 +72,14 @@ import static org.alfresco.transform.common.Mimetype.MIMETYPE_WORD;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_ZIP;
|
||||
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
|
||||
import org.alfresco.transform.base.clients.FileInfo;
|
||||
import org.alfresco.transform.base.metadata.AbstractMetadataExtractsIT;
|
||||
|
||||
/**
|
||||
* Metadata integration tests in the Tika T-Engine.
|
||||
*
|
||||
@@ -105,7 +106,7 @@ public class TikaMetadataExtractsIT extends AbstractMetadataExtractsIT
|
||||
// either no quick file or the target extension has not been registered.
|
||||
|
||||
return Stream.of(
|
||||
//IPTCMetadataExtractor
|
||||
// IPTCMetadataExtractor
|
||||
testFile(MIMETYPE_IMAGE_JPEG, "jpg", "quick.jpg"),
|
||||
testFile(MIMETYPE_IMAGE_JPEG, "jpg", "quickIPTC-EXT.jpg"),
|
||||
testFile(MIMETYPE_IMAGE_JPEG, "jpg", "quickIPTC-multi-creator.jpg"),
|
||||
@@ -117,7 +118,7 @@ public class TikaMetadataExtractsIT extends AbstractMetadataExtractsIT
|
||||
testFile(MIMETYPE_IMAGE_RAW_RW2, "rw2", "20140629_145035_Flower.RW2"),
|
||||
testFile(MIMETYPE_IMAGE_RAW_NEF, "nef", "20150408_074941_Bush.NEF"),
|
||||
testFile(MIMETYPE_IMAGE_RAW_RAF, "raf", "20160502_190928_London_Underground.RAF"),
|
||||
|
||||
|
||||
// DWGMetadataExtractor
|
||||
testFile(MIMETYPE_APP_DWG, "dwg", "quick2010CustomProps.dwg"),
|
||||
|
||||
@@ -129,97 +130,97 @@ public class TikaMetadataExtractsIT extends AbstractMetadataExtractsIT
|
||||
|
||||
// OfficeMetadataExtractor
|
||||
testFile(MIMETYPE_WORD, "doc", "quick.doc"),
|
||||
//testFile("application/x-tika-msoffice-embedded; format=ole10_native", "", ""),
|
||||
// testFile("application/x-tika-msoffice-embedded; format=ole10_native", "", ""),
|
||||
testFile(MIMETYPE_VISIO, "vsd", "quick.vsd"),
|
||||
//testFile("application/vnd.ms-project", "mpp", ""),
|
||||
//testFile("application/x-tika-msworks-spreadsheet", "", ""),
|
||||
//testFile("application/x-mspublisher", "", ""),
|
||||
// testFile("application/vnd.ms-project", "mpp", ""),
|
||||
// testFile("application/x-tika-msworks-spreadsheet", "", ""),
|
||||
// testFile("application/x-mspublisher", "", ""),
|
||||
testFile(MIMETYPE_PPT, "ppt", "quick.ppt"),
|
||||
//testFile("application/x-tika-msoffice", "", ""),
|
||||
//testFile(MIMETYPE_VISIO_2013, "vsdx", ""),
|
||||
//testFile("application/sldworks", "", ""),
|
||||
//testFile(MIMETYPE_ENCRYPTED_OFFICE, "", ""),
|
||||
// testFile("application/x-tika-msoffice", "", ""),
|
||||
// testFile(MIMETYPE_VISIO_2013, "vsdx", ""),
|
||||
// testFile("application/sldworks", "", ""),
|
||||
// testFile(MIMETYPE_ENCRYPTED_OFFICE, "", ""),
|
||||
testFile(MIMETYPE_EXCEL, "xls", "quick.xls"),
|
||||
|
||||
// OpenDocumentMetadataExtractor
|
||||
//testFile("application/x-vnd.oasis.opendocument.presentation", "", ""),
|
||||
//testFile(MIMETYPE_OPENDOCUMENT_CHART, "odc", ""),
|
||||
//testFile(MIMETYPE_OPENDOCUMENT_IMAGE_TEMPLATE, "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.text-web", "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.image", "", ""),
|
||||
// testFile("application/x-vnd.oasis.opendocument.presentation", "", ""),
|
||||
// testFile(MIMETYPE_OPENDOCUMENT_CHART, "odc", ""),
|
||||
// testFile(MIMETYPE_OPENDOCUMENT_IMAGE_TEMPLATE, "", ""),
|
||||
// testFile("application/x-vnd.oasis.opendocument.text-web", "", ""),
|
||||
// testFile("application/x-vnd.oasis.opendocument.image", "", ""),
|
||||
testFile(MIMETYPE_OPENDOCUMENT_GRAPHICS_TEMPLATE, "otg", "quick.otg"),
|
||||
//testFile(MIMETYPE_OPENDOCUMENT_TEXT_WEB, "oth", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.spreadsheet-template", "", ""),
|
||||
// testFile(MIMETYPE_OPENDOCUMENT_TEXT_WEB, "oth", ""),
|
||||
// testFile("application/x-vnd.oasis.opendocument.spreadsheet-template", "", ""),
|
||||
testFile(MIMETYPE_OPENDOCUMENT_SPREADSHEET_TEMPLATE, "ots", "quick.ots"),
|
||||
testFile(MIMETYPE_OPENOFFICE1_WRITER, "sxw", "quick.sxw"),
|
||||
//testFile("application/x-vnd.oasis.opendocument.graphics-template", "", ""),
|
||||
// testFile("application/x-vnd.oasis.opendocument.graphics-template", "", ""),
|
||||
testFile(MIMETYPE_OPENDOCUMENT_GRAPHICS, "odg", "quick.odg"),
|
||||
testFile(MIMETYPE_OPENDOCUMENT_SPREADSHEET, "ods", "quick.ods"),
|
||||
//testFile("application/x-vnd.oasis.opendocument.chart", "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.spreadsheet", "", ""),
|
||||
//testFile(MIMETYPE_OPENDOCUMENT_IMAGE, "odi", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.text", "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.text-template", "", ""),
|
||||
//testFile("application/vnd.oasis.opendocument.formula-template", "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.formula", "", ""),
|
||||
//testFile("application/vnd.oasis.opendocument.image-template", "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.image-template", "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.presentation-template", "", ""),
|
||||
// testFile("application/x-vnd.oasis.opendocument.chart", "", ""),
|
||||
// testFile("application/x-vnd.oasis.opendocument.spreadsheet", "", ""),
|
||||
// testFile(MIMETYPE_OPENDOCUMENT_IMAGE, "odi", ""),
|
||||
// testFile("application/x-vnd.oasis.opendocument.text", "", ""),
|
||||
// testFile("application/x-vnd.oasis.opendocument.text-template", "", ""),
|
||||
// testFile("application/vnd.oasis.opendocument.formula-template", "", ""),
|
||||
// testFile("application/x-vnd.oasis.opendocument.formula", "", ""),
|
||||
// testFile("application/vnd.oasis.opendocument.image-template", "", ""),
|
||||
// testFile("application/x-vnd.oasis.opendocument.image-template", "", ""),
|
||||
// testFile("application/x-vnd.oasis.opendocument.presentation-template", "", ""),
|
||||
testFile(MIMETYPE_OPENDOCUMENT_PRESENTATION_TEMPLATE, "otp", "quick.otp"),
|
||||
testFile(MIMETYPE_OPENDOCUMENT_TEXT, "odt", "quick.odt"),
|
||||
//testFile(MIMETYPE_OPENDOCUMENT_FORMULA_TEMPLATE, "", ""),
|
||||
// testFile(MIMETYPE_OPENDOCUMENT_FORMULA_TEMPLATE, "", ""),
|
||||
testFile(MIMETYPE_OPENDOCUMENT_TEXT_TEMPLATE, "ott", "quick.ott"),
|
||||
//testFile("application/vnd.oasis.opendocument.chart-template", "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.chart-template", "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.formula-template", "", ""),
|
||||
//testFile(MIMETYPE_OPENDOCUMENT_DATABASE, "odb", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.text-master", "", ""),
|
||||
// testFile("application/vnd.oasis.opendocument.chart-template", "", ""),
|
||||
// testFile("application/x-vnd.oasis.opendocument.chart-template", "", ""),
|
||||
// testFile("application/x-vnd.oasis.opendocument.formula-template", "", ""),
|
||||
// testFile(MIMETYPE_OPENDOCUMENT_DATABASE, "odb", ""),
|
||||
// testFile("application/x-vnd.oasis.opendocument.text-master", "", ""),
|
||||
testFile(MIMETYPE_OPENDOCUMENT_PRESENTATION, "odp", "quick.odp"),
|
||||
//testFile(MIMETYPE_OPENDOCUMENT_CHART_TEMPLATE, "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.graphics", "", ""),
|
||||
// testFile(MIMETYPE_OPENDOCUMENT_CHART_TEMPLATE, "", ""),
|
||||
// testFile("application/x-vnd.oasis.opendocument.graphics", "", ""),
|
||||
testFile(MIMETYPE_OPENDOCUMENT_FORMULA, "odf", "quick.odf"),
|
||||
//testFile(MIMETYPE_OPENDOCUMENT_TEXT_MASTER, "odm", ""),
|
||||
// testFile(MIMETYPE_OPENDOCUMENT_TEXT_MASTER, "odm", ""),
|
||||
|
||||
// PdfBoxMetadataExtractor
|
||||
testFile(MIMETYPE_PDF, "pdf", "quick.pdf"),
|
||||
//testFile(MIMETYPE_APPLICATION_ILLUSTRATOR, "ai", ""),
|
||||
// testFile(MIMETYPE_APPLICATION_ILLUSTRATOR, "ai", ""),
|
||||
|
||||
// PoiMetadataExtractor
|
||||
//testFile(MIMETYPE_OPENXML_PRESENTATION_TEMPLATE_MACRO, "potm", ""),
|
||||
//testFile(MIMETYPE_OPENXML_SPREADSHEET_ADDIN_MACRO, "xlam", ""),
|
||||
//testFile(MIMETYPE_OPENXML_WORD_TEMPLATE, "dotx", ""),
|
||||
//testFile(MIMETYPE_OPENXML_SPREADSHEET_BINARY_MACRO, "xlsb", ""),
|
||||
// testFile(MIMETYPE_OPENXML_PRESENTATION_TEMPLATE_MACRO, "potm", ""),
|
||||
// testFile(MIMETYPE_OPENXML_SPREADSHEET_ADDIN_MACRO, "xlam", ""),
|
||||
// testFile(MIMETYPE_OPENXML_WORD_TEMPLATE, "dotx", ""),
|
||||
// testFile(MIMETYPE_OPENXML_SPREADSHEET_BINARY_MACRO, "xlsb", ""),
|
||||
testFile(MIMETYPE_OPENXML_WORDPROCESSING, "docx", "quick.docx"),
|
||||
//testFile(MIMETYPE_OPENXML_PRESENTATION_SLIDE_MACRO, "sldm", ""),
|
||||
//testFile("application/vnd.ms-visio.drawing", "", ""),
|
||||
//testFile(MIMETYPE_OPENXML_PRESENTATION_SLIDESHOW_MACRO, "ppsm", ""),
|
||||
//testFile(MIMETYPE_OPENXML_PRESENTATION_MACRO, "pptm", ""),
|
||||
//testFile(MIMETYPE_OPENXML_PRESENTATION_SLIDE, "sldx", ""),
|
||||
//testFile(MIMETYPE_OPENXML_SPREADSHEET_MACRO, "xlsm", ""),
|
||||
//testFile(MIMETYPE_OPENXML_WORD_TEMPLATE_MACRO, "dotm", ""),
|
||||
//testFile(MIMETYPE_OPENXML_WORDPROCESSING_MACRO, "docm", ""),
|
||||
//testFile(MIMETYPE_OPENXML_PRESENTATION_ADDIN, "ppam", ""),
|
||||
//testFile(MIMETYPE_OPENXML_SPREADSHEET_TEMPLATE, "xltx", ""),
|
||||
//testFile("application/vnd.ms-xpsdocument", "", ""),
|
||||
//testFile("application/vnd.ms-visio.drawing.macroenabled.12", "", ""),
|
||||
//testFile("application/vnd.ms-visio.template.macroenabled.12", "", ""),
|
||||
//testFile("model/vnd.dwfx+xps", "", ""),
|
||||
//testFile(MIMETYPE_OPENXML_PRESENTATION_TEMPLATE, "potx", ""),
|
||||
// testFile(MIMETYPE_OPENXML_PRESENTATION_SLIDE_MACRO, "sldm", ""),
|
||||
// testFile("application/vnd.ms-visio.drawing", "", ""),
|
||||
// testFile(MIMETYPE_OPENXML_PRESENTATION_SLIDESHOW_MACRO, "ppsm", ""),
|
||||
// testFile(MIMETYPE_OPENXML_PRESENTATION_MACRO, "pptm", ""),
|
||||
// testFile(MIMETYPE_OPENXML_PRESENTATION_SLIDE, "sldx", ""),
|
||||
// testFile(MIMETYPE_OPENXML_SPREADSHEET_MACRO, "xlsm", ""),
|
||||
// testFile(MIMETYPE_OPENXML_WORD_TEMPLATE_MACRO, "dotm", ""),
|
||||
// testFile(MIMETYPE_OPENXML_WORDPROCESSING_MACRO, "docm", ""),
|
||||
// testFile(MIMETYPE_OPENXML_PRESENTATION_ADDIN, "ppam", ""),
|
||||
// testFile(MIMETYPE_OPENXML_SPREADSHEET_TEMPLATE, "xltx", ""),
|
||||
// testFile("application/vnd.ms-xpsdocument", "", ""),
|
||||
// testFile("application/vnd.ms-visio.drawing.macroenabled.12", "", ""),
|
||||
// testFile("application/vnd.ms-visio.template.macroenabled.12", "", ""),
|
||||
// testFile("model/vnd.dwfx+xps", "", ""),
|
||||
// testFile(MIMETYPE_OPENXML_PRESENTATION_TEMPLATE, "potx", ""),
|
||||
testFile(MIMETYPE_OPENXML_PRESENTATION, "pptx", "quick.pptx"),
|
||||
testFile(MIMETYPE_OPENXML_SPREADSHEET, "xlsx", "quick.xlsx"),
|
||||
//testFile("application/vnd.ms-visio.stencil", "", ""),
|
||||
//testFile("application/vnd.ms-visio.template", "", ""),
|
||||
//testFile(MIMETYPE_OPENXML_PRESENTATION_SLIDESHOW, "ppsx", ""),
|
||||
//testFile("application/vnd.ms-visio.stencil.macroenabled.12", "", ""),
|
||||
//testFile(MIMETYPE_OPENXML_SPREADSHEET_TEMPLATE_MACRO, "xltm", ""),
|
||||
// testFile("application/vnd.ms-visio.stencil", "", ""),
|
||||
// testFile("application/vnd.ms-visio.template", "", ""),
|
||||
// testFile(MIMETYPE_OPENXML_PRESENTATION_SLIDESHOW, "ppsx", ""),
|
||||
// testFile("application/vnd.ms-visio.stencil.macroenabled.12", "", ""),
|
||||
// testFile(MIMETYPE_OPENXML_SPREADSHEET_TEMPLATE_MACRO, "xltm", ""),
|
||||
|
||||
// TikaAudioMetadataExtractor
|
||||
testFile("video/x-m4v", "m4v", "quick.m4v"),
|
||||
//testFile("audio/x-oggflac", "", ""),
|
||||
//testFile("application/mp4", "", ""),
|
||||
// testFile("audio/x-oggflac", "", ""),
|
||||
// testFile("application/mp4", "", ""),
|
||||
testFile(MIMETYPE_VORBIS, "ogg", "quick.ogg"),
|
||||
testFile(MIMETYPE_VIDEO_3GP, "3gp", "quick.3gp"),
|
||||
//testFile(MIMETYPE_FLAC, "flac", ""),
|
||||
// testFile(MIMETYPE_FLAC, "flac", ""),
|
||||
testFile(MIMETYPE_VIDEO_3GP2, "3g2", "quick.3g2"),
|
||||
testFile(MIMETYPE_VIDEO_QUICKTIME, "mov", "quick.mov"),
|
||||
testFile(MIMETYPE_AUDIO_MP4, "m4a", "quick.m4a"),
|
||||
@@ -238,304 +239,304 @@ public class TikaMetadataExtractsIT extends AbstractMetadataExtractsIT
|
||||
// contain one or more values, but also include nulls. Again this may be correct, a bug or just the
|
||||
// example quick file rather than a problem with the extractor.
|
||||
|
||||
//testFile("application/vnd.ms-htmlhelp", "", ""),
|
||||
//testFile(MIMETYPE_ATOM, "", ""),
|
||||
//testFile("audio/midi", "", ""),
|
||||
//testFile("application/aaigrid", "", ""),
|
||||
//testFile("application/x-bag", "", ""),
|
||||
// testFile("application/vnd.ms-htmlhelp", "", ""),
|
||||
// testFile(MIMETYPE_ATOM, "", ""),
|
||||
// testFile("audio/midi", "", ""),
|
||||
// testFile("application/aaigrid", "", ""),
|
||||
// testFile("application/x-bag", "", ""),
|
||||
testFile(MIMETYPE_IWORK_KEYNOTE, "key", "quick.key"),
|
||||
//testFile("application/x-quattro-pro; version=9", "", ""),
|
||||
//testFile("application/x-ibooks+zip", "", ""),
|
||||
//testFile("audio/wave", "", ""),
|
||||
//testFile("application/x-midi", "", ""),
|
||||
// testFile("application/x-quattro-pro; version=9", "", ""),
|
||||
// testFile("application/x-ibooks+zip", "", ""),
|
||||
// testFile("audio/wave", "", ""),
|
||||
// testFile("application/x-midi", "", ""),
|
||||
testFile(MIMETYPE_XML, "xml", "quick.xml"),
|
||||
//testFile(MIMETYPE_RSS, "rss", ""),
|
||||
//testFile("application/x-netcdf", "cdf", ""),
|
||||
//testFile("video/x-daala", "", ""),
|
||||
//testFile("application/matlab-mat", "", ""),
|
||||
//testFile("audio/aiff", "", ""),
|
||||
//testFile("application/jaxa-pal-sar", "", ""),
|
||||
//testFile("image/x-pcraster", "", ""),
|
||||
//testFile("image/arg", "", ""),
|
||||
//testFile("application/x-kro", "", ""),
|
||||
//testFile("image/x-hdf5-image", "", ""),
|
||||
//testFile("audio/speex", "", ""),
|
||||
//testFile("image/big-gif", "", ""),
|
||||
//testFile("application/zlib", "", ""),
|
||||
//testFile("application/x-cosar", "", ""),
|
||||
//testFile("application/x-ntv2", "", ""),
|
||||
//testFile("application/x-archive", "", ""),
|
||||
//testFile("application/java-archive", "jar", ""),
|
||||
//testFile("application/x-vnd.sun.xml.writer", "", ""),
|
||||
//testFile("application/x-gmt", "", ""),
|
||||
//testFile("application/x-xml", "", ""),
|
||||
//testFile("application/gzip-compressed", "", ""),
|
||||
//testFile("image/ida", "", ""),
|
||||
//testFile("text/x-groovy", "", ""),
|
||||
//testFile("image/x-emf", "", ""),
|
||||
//testFile("application/x-rar", "", ""),
|
||||
//testFile("image/sar-ceos", "", ""),
|
||||
//testFile("application/acad", "", ""),
|
||||
// testFile(MIMETYPE_RSS, "rss", ""),
|
||||
// testFile("application/x-netcdf", "cdf", ""),
|
||||
// testFile("video/x-daala", "", ""),
|
||||
// testFile("application/matlab-mat", "", ""),
|
||||
// testFile("audio/aiff", "", ""),
|
||||
// testFile("application/jaxa-pal-sar", "", ""),
|
||||
// testFile("image/x-pcraster", "", ""),
|
||||
// testFile("image/arg", "", ""),
|
||||
// testFile("application/x-kro", "", ""),
|
||||
// testFile("image/x-hdf5-image", "", ""),
|
||||
// testFile("audio/speex", "", ""),
|
||||
// testFile("image/big-gif", "", ""),
|
||||
// testFile("application/zlib", "", ""),
|
||||
// testFile("application/x-cosar", "", ""),
|
||||
// testFile("application/x-ntv2", "", ""),
|
||||
// testFile("application/x-archive", "", ""),
|
||||
// testFile("application/java-archive", "jar", ""),
|
||||
// testFile("application/x-vnd.sun.xml.writer", "", ""),
|
||||
// testFile("application/x-gmt", "", ""),
|
||||
// testFile("application/x-xml", "", ""),
|
||||
// testFile("application/gzip-compressed", "", ""),
|
||||
// testFile("image/ida", "", ""),
|
||||
// testFile("text/x-groovy", "", ""),
|
||||
// testFile("image/x-emf", "", ""),
|
||||
// testFile("application/x-rar", "", ""),
|
||||
// testFile("image/sar-ceos", "", ""),
|
||||
// testFile("application/acad", "", ""),
|
||||
testFile(MIMETYPE_ZIP, "zip", "quick.zip"),
|
||||
//testFile(MIMETYPE_IMAGE_PSD, "psd", ""),
|
||||
//testFile("application/x-sharedlib", "", ""),
|
||||
//testFile("audio/x-m4a", "", ""),
|
||||
//testFile("image/webp", "", ""),
|
||||
//testFile("application/vnd.wap.xhtml+xml", "", ""),
|
||||
//testFile("audio/x-aiff", "aiff", ""),
|
||||
//testFile("application/vnd.ms-spreadsheetml", "", ""),
|
||||
//testFile("image/x-airsar", "", ""),
|
||||
//testFile("application/x-pcidsk", "", ""),
|
||||
//testFile("application/x-java-pack200", "", ""),
|
||||
//testFile("image/x-fujibas", "", ""),
|
||||
//testFile("application/x-zmap", "", ""),
|
||||
//testFile("image/x-bmp", "", ""),
|
||||
//testFile("image/bpg", "", ""),
|
||||
//testFile(MIMETYPE_RTF, "rtf", ""),
|
||||
//testFile("application/x-xz", "", ""),
|
||||
//testFile("application/x-speex", "", ""),
|
||||
//testFile("audio/ogg; codecs=speex", "", ""),
|
||||
//testFile("application/x-l1b", "", ""),
|
||||
//testFile("application/x-gsbg", "", ""),
|
||||
//testFile("application/x-sdat", "", ""),
|
||||
//testFile("application/vnd.ms-visio", "", ""),
|
||||
//testFile("application/x-coredump", "", ""),
|
||||
//testFile("application/x-msaccess", "", ""),
|
||||
//testFile("application/x-dods", "", ""),
|
||||
// testFile(MIMETYPE_IMAGE_PSD, "psd", ""),
|
||||
// testFile("application/x-sharedlib", "", ""),
|
||||
// testFile("audio/x-m4a", "", ""),
|
||||
// testFile("image/webp", "", ""),
|
||||
// testFile("application/vnd.wap.xhtml+xml", "", ""),
|
||||
// testFile("audio/x-aiff", "aiff", ""),
|
||||
// testFile("application/vnd.ms-spreadsheetml", "", ""),
|
||||
// testFile("image/x-airsar", "", ""),
|
||||
// testFile("application/x-pcidsk", "", ""),
|
||||
// testFile("application/x-java-pack200", "", ""),
|
||||
// testFile("image/x-fujibas", "", ""),
|
||||
// testFile("application/x-zmap", "", ""),
|
||||
// testFile("image/x-bmp", "", ""),
|
||||
// testFile("image/bpg", "", ""),
|
||||
// testFile(MIMETYPE_RTF, "rtf", ""),
|
||||
// testFile("application/x-xz", "", ""),
|
||||
// testFile("application/x-speex", "", ""),
|
||||
// testFile("audio/ogg; codecs=speex", "", ""),
|
||||
// testFile("application/x-l1b", "", ""),
|
||||
// testFile("application/x-gsbg", "", ""),
|
||||
// testFile("application/x-sdat", "", ""),
|
||||
// testFile("application/vnd.ms-visio", "", ""),
|
||||
// testFile("application/x-coredump", "", ""),
|
||||
// testFile("application/x-msaccess", "", ""),
|
||||
// testFile("application/x-dods", "", ""),
|
||||
testFile(MIMETYPE_IMAGE_PNG, "png", "quick.png"),
|
||||
//testFile("application/vnd.ms-outlook-pst", "", ""),
|
||||
//testFile("image/bsb", "", ""),
|
||||
//testFile("application/x-cpio", "cpio", ""),
|
||||
//testFile("audio/ogg", "oga", ""),
|
||||
// testFile("application/vnd.ms-outlook-pst", "", ""),
|
||||
// testFile("image/bsb", "", ""),
|
||||
// testFile("application/x-cpio", "cpio", ""),
|
||||
// testFile("audio/ogg", "oga", ""),
|
||||
testFile("application/x-tar", "tar", "quick.tar"),
|
||||
//testFile("application/x-dbf", "", ""),
|
||||
//testFile("video/x-ogm", "", ""),
|
||||
//testFile("application/x-los-las", "", ""),
|
||||
//testFile("application/autocad_dwg", "", ""),
|
||||
//testFile("application/vnd.ms-excel.workspace.3", "", ""),
|
||||
//testFile("application/vnd.ms-excel.workspace.4", "", ""),
|
||||
//testFile("image/x-bpg", "", ""),
|
||||
//testFile("gzip/document", "", ""),
|
||||
//testFile("text/x-java", "", ""),
|
||||
//testFile("application/x-brotli", "", ""),
|
||||
//testFile("application/elas", "", ""),
|
||||
//testFile("image/x-jb2", "", ""),
|
||||
//testFile("application/x-cappi", "", ""),
|
||||
//testFile("application/epub+zip", "", ""),
|
||||
//testFile("application/x-ace2", "", ""),
|
||||
//testFile("application/x-sas-data", "", ""),
|
||||
//testFile("application/x-hdf", "hdf", ""),
|
||||
//testFile("image/x-mff", "", ""),
|
||||
//testFile("image/x-srp", "", ""),
|
||||
// testFile("application/x-dbf", "", ""),
|
||||
// testFile("video/x-ogm", "", ""),
|
||||
// testFile("application/x-los-las", "", ""),
|
||||
// testFile("application/autocad_dwg", "", ""),
|
||||
// testFile("application/vnd.ms-excel.workspace.3", "", ""),
|
||||
// testFile("application/vnd.ms-excel.workspace.4", "", ""),
|
||||
// testFile("image/x-bpg", "", ""),
|
||||
// testFile("gzip/document", "", ""),
|
||||
// testFile("text/x-java", "", ""),
|
||||
// testFile("application/x-brotli", "", ""),
|
||||
// testFile("application/elas", "", ""),
|
||||
// testFile("image/x-jb2", "", ""),
|
||||
// testFile("application/x-cappi", "", ""),
|
||||
// testFile("application/epub+zip", "", ""),
|
||||
// testFile("application/x-ace2", "", ""),
|
||||
// testFile("application/x-sas-data", "", ""),
|
||||
// testFile("application/x-hdf", "hdf", ""),
|
||||
// testFile("image/x-mff", "", ""),
|
||||
// testFile("image/x-srp", "", ""),
|
||||
testFile(MIMETYPE_IMAGE_BMP, "bmp", "quick.bmp"),
|
||||
//testFile("video/x-ogguvs", "", ""),
|
||||
//testFile("drawing/dwg", "", ""),
|
||||
//testFile("application/x-doq2", "", ""),
|
||||
//testFile("application/x-acad", "", ""),
|
||||
//testFile("application/x-kml", "", ""),
|
||||
//testFile("application/x-autocad", "", ""),
|
||||
//testFile("image/x-mff2", "", ""),
|
||||
//testFile("application/x-snodas", "", ""),
|
||||
//testFile("application/terragen", "", ""),
|
||||
//testFile("application/x-wcs", "", ""),
|
||||
//testFile("text/x-c++src", "", ""),
|
||||
//testFile("application/timestamped-data", "", ""),
|
||||
// testFile("video/x-ogguvs", "", ""),
|
||||
// testFile("drawing/dwg", "", ""),
|
||||
// testFile("application/x-doq2", "", ""),
|
||||
// testFile("application/x-acad", "", ""),
|
||||
// testFile("application/x-kml", "", ""),
|
||||
// testFile("application/x-autocad", "", ""),
|
||||
// testFile("image/x-mff2", "", ""),
|
||||
// testFile("application/x-snodas", "", ""),
|
||||
// testFile("application/terragen", "", ""),
|
||||
// testFile("application/x-wcs", "", ""),
|
||||
// testFile("text/x-c++src", "", ""),
|
||||
// testFile("application/timestamped-data", "", ""),
|
||||
testFile(MIMETYPE_IMAGE_TIFF, "tiff", "quick.tiff"),
|
||||
//testFile("application/msexcel", "", ""),
|
||||
//testFile("application/x-asp", "", ""),
|
||||
//testFile("application/x-rar-compressed", "rar", ""),
|
||||
//testFile("application/x-envi-hdr", "", ""),
|
||||
//testFile("text/iso19139+xml", "", ""),
|
||||
//testFile("application/vnd.ms-tnef", "", ""),
|
||||
//testFile("application/x-ecrg-toc", "", ""),
|
||||
//testFile("application/aig", "", ""),
|
||||
//testFile("audio/x-wav", "wav", ""),
|
||||
//testFile("image/emf", "", ""),
|
||||
//testFile("application/x-bzip", "", ""),
|
||||
//testFile("application/jdem", "", ""),
|
||||
//testFile("application/x-webp", "", ""),
|
||||
//testFile("application/x-arj", "", ""),
|
||||
//testFile("application/x-lzma", "", ""),
|
||||
//testFile("application/x-java-vm", "", ""),
|
||||
//testFile("image/envisat", "", ""),
|
||||
//testFile("application/x-doq1", "", ""),
|
||||
//testFile("audio/vnd.wave", "", ""),
|
||||
//testFile("application/x-ppi", "", ""),
|
||||
//testFile("image/ilwis", "", ""),
|
||||
//testFile("application/x-gunzip", "", ""),
|
||||
//testFile("image/x-icon", "", ""),
|
||||
//testFile("application/ogg", "ogx", ""),
|
||||
//testFile(MIMETYPE_IMAGE_SVG, "svg", ""),
|
||||
//testFile("application/x-ms-owner", "", ""),
|
||||
//testFile("application/x-grib", "", ""),
|
||||
//testFile("application/ms-tnef", "", ""),
|
||||
//testFile("image/fits", "", ""),
|
||||
//testFile("audio/x-mpeg", "", ""),
|
||||
//testFile("application/x-bzip2", "", ""),
|
||||
//testFile("text/tsv", "", ""),
|
||||
//testFile("application/x-fictionbook+xml", "", ""),
|
||||
//testFile("application/x-p-aux", "", ""),
|
||||
//testFile("application/x-font-ttf", "", ""),
|
||||
//testFile("image/x-xcf", "", ""),
|
||||
//testFile("image/x-ms-bmp", "", ""),
|
||||
//testFile("image/wmf", "", ""),
|
||||
//testFile("image/eir", "", ""),
|
||||
//testFile("application/x-matlab-data", "", ""),
|
||||
//testFile("application/deflate64", "", ""),
|
||||
//testFile("audio/wav", "", ""),
|
||||
//testFile("application/x-rs2", "", ""),
|
||||
//testFile("application/vnd.ms-word", "", ""),
|
||||
//testFile("application/x-tsx", "", ""),
|
||||
//testFile("application/x-lcp", "", ""),
|
||||
//testFile("application/x-mbtiles", "", ""),
|
||||
//testFile("audio/x-oggpcm", "", ""),
|
||||
//testFile("application/x-epsilon", "", ""),
|
||||
//testFile("application/x-msgn", "", ""),
|
||||
//testFile(MIMETYPE_TEXT_CSV, "csv", ""),
|
||||
//testFile("image/x-dimap", "", ""),
|
||||
//testFile("image/vnd.microsoft.icon", "", ""),
|
||||
//testFile("application/x-envi", "", ""),
|
||||
//testFile("application/x-dwg", "", ""),
|
||||
// testFile("application/msexcel", "", ""),
|
||||
// testFile("application/x-asp", "", ""),
|
||||
// testFile("application/x-rar-compressed", "rar", ""),
|
||||
// testFile("application/x-envi-hdr", "", ""),
|
||||
// testFile("text/iso19139+xml", "", ""),
|
||||
// testFile("application/vnd.ms-tnef", "", ""),
|
||||
// testFile("application/x-ecrg-toc", "", ""),
|
||||
// testFile("application/aig", "", ""),
|
||||
// testFile("audio/x-wav", "wav", ""),
|
||||
// testFile("image/emf", "", ""),
|
||||
// testFile("application/x-bzip", "", ""),
|
||||
// testFile("application/jdem", "", ""),
|
||||
// testFile("application/x-webp", "", ""),
|
||||
// testFile("application/x-arj", "", ""),
|
||||
// testFile("application/x-lzma", "", ""),
|
||||
// testFile("application/x-java-vm", "", ""),
|
||||
// testFile("image/envisat", "", ""),
|
||||
// testFile("application/x-doq1", "", ""),
|
||||
// testFile("audio/vnd.wave", "", ""),
|
||||
// testFile("application/x-ppi", "", ""),
|
||||
// testFile("image/ilwis", "", ""),
|
||||
// testFile("application/x-gunzip", "", ""),
|
||||
// testFile("image/x-icon", "", ""),
|
||||
// testFile("application/ogg", "ogx", ""),
|
||||
// testFile(MIMETYPE_IMAGE_SVG, "svg", ""),
|
||||
// testFile("application/x-ms-owner", "", ""),
|
||||
// testFile("application/x-grib", "", ""),
|
||||
// testFile("application/ms-tnef", "", ""),
|
||||
// testFile("image/fits", "", ""),
|
||||
// testFile("audio/x-mpeg", "", ""),
|
||||
// testFile("application/x-bzip2", "", ""),
|
||||
// testFile("text/tsv", "", ""),
|
||||
// testFile("application/x-fictionbook+xml", "", ""),
|
||||
// testFile("application/x-p-aux", "", ""),
|
||||
// testFile("application/x-font-ttf", "", ""),
|
||||
// testFile("image/x-xcf", "", ""),
|
||||
// testFile("image/x-ms-bmp", "", ""),
|
||||
// testFile("image/wmf", "", ""),
|
||||
// testFile("image/eir", "", ""),
|
||||
// testFile("application/x-matlab-data", "", ""),
|
||||
// testFile("application/deflate64", "", ""),
|
||||
// testFile("audio/wav", "", ""),
|
||||
// testFile("application/x-rs2", "", ""),
|
||||
// testFile("application/vnd.ms-word", "", ""),
|
||||
// testFile("application/x-tsx", "", ""),
|
||||
// testFile("application/x-lcp", "", ""),
|
||||
// testFile("application/x-mbtiles", "", ""),
|
||||
// testFile("audio/x-oggpcm", "", ""),
|
||||
// testFile("application/x-epsilon", "", ""),
|
||||
// testFile("application/x-msgn", "", ""),
|
||||
// testFile(MIMETYPE_TEXT_CSV, "csv", ""),
|
||||
// testFile("image/x-dimap", "", ""),
|
||||
// testFile("image/vnd.microsoft.icon", "", ""),
|
||||
// testFile("application/x-envi", "", ""),
|
||||
// testFile("application/x-dwg", "", ""),
|
||||
testFile(MIMETYPE_IWORK_NUMBERS, "numbers", "quick.numbers"),
|
||||
//testFile("application/vnd.ms-word2006ml", "", ""),
|
||||
//testFile("application/x-bt", "", ""),
|
||||
//testFile("application/x-font-adobe-metric", "", ""),
|
||||
//testFile("application/x-rst", "", ""),
|
||||
//testFile("application/vrt", "", ""),
|
||||
//testFile("application/x-ctg", "", ""),
|
||||
//testFile("application/x-e00-grid", "", ""),
|
||||
//testFile("audio/x-ogg-flac", "", ""),
|
||||
//testFile("application/x-compress", "z", ""),
|
||||
//testFile("image/x-psd", "", ""),
|
||||
//testFile("text/rss", "", ""),
|
||||
//testFile("application/sdts-raster", "", ""),
|
||||
//testFile("application/oxps", "", ""),
|
||||
//testFile("application/leveller", "", ""),
|
||||
//testFile("application/x-ingr", "", ""),
|
||||
//testFile("image/sgi", "", ""),
|
||||
//testFile("application/x-pnm", "", ""),
|
||||
//testFile("image/raster", "", ""),
|
||||
//testFile("audio/x-ogg-pcm", "", ""),
|
||||
//testFile("audio/ogg; codecs=opus", "", ""),
|
||||
//testFile("application/fits", "", ""),
|
||||
//testFile("application/x-r", "", ""),
|
||||
// testFile("application/vnd.ms-word2006ml", "", ""),
|
||||
// testFile("application/x-bt", "", ""),
|
||||
// testFile("application/x-font-adobe-metric", "", ""),
|
||||
// testFile("application/x-rst", "", ""),
|
||||
// testFile("application/vrt", "", ""),
|
||||
// testFile("application/x-ctg", "", ""),
|
||||
// testFile("application/x-e00-grid", "", ""),
|
||||
// testFile("audio/x-ogg-flac", "", ""),
|
||||
// testFile("application/x-compress", "z", ""),
|
||||
// testFile("image/x-psd", "", ""),
|
||||
// testFile("text/rss", "", ""),
|
||||
// testFile("application/sdts-raster", "", ""),
|
||||
// testFile("application/oxps", "", ""),
|
||||
// testFile("application/leveller", "", ""),
|
||||
// testFile("application/x-ingr", "", ""),
|
||||
// testFile("image/sgi", "", ""),
|
||||
// testFile("application/x-pnm", "", ""),
|
||||
// testFile("image/raster", "", ""),
|
||||
// testFile("audio/x-ogg-pcm", "", ""),
|
||||
// testFile("audio/ogg; codecs=opus", "", ""),
|
||||
// testFile("application/fits", "", ""),
|
||||
// testFile("application/x-r", "", ""),
|
||||
testFile(MIMETYPE_IMAGE_GIF, "gif", "quick.gif"),
|
||||
//testFile("application/java-vm", "", ""),
|
||||
//testFile("application/mspowerpoint", "", ""),
|
||||
//testFile("application/x-http", "", ""),
|
||||
//testFile("application/x-rmf", "", ""),
|
||||
//testFile("application/x-ogg", "", ""),
|
||||
//testFile("video/ogg", "ogv", "quick.ogv"),
|
||||
//testFile(MIMETYPE_APPLEFILE, "", ""),
|
||||
//testFile("text/rtf", "", ""),
|
||||
//testFile("image/adrg", "", ""),
|
||||
//testFile("video/x-ogg-rgb", "", ""),
|
||||
//testFile("application/x-ngs-geoid", "", ""),
|
||||
//testFile("application/x-map", "", ""),
|
||||
//testFile("image/ceos", "", ""),
|
||||
//testFile("application/xpm", "", ""),
|
||||
//testFile("application/x-ers", "", ""),
|
||||
//testFile("video/x-ogg-yuv", "", ""),
|
||||
//testFile("application/x-isis2", "", ""),
|
||||
//testFile("application/x-nwt-grd", "", ""),
|
||||
//testFile("application/x-isis3", "", ""),
|
||||
//testFile("application/x-nwt-grc", "", ""),
|
||||
//testFile("video/daala", "", ""),
|
||||
//testFile("application/x-blx", "", ""),
|
||||
//testFile("application/x-tnef", "", ""),
|
||||
//testFile("video/x-dirac", "", ""),
|
||||
//testFile("application/x-ndf", "", ""),
|
||||
//testFile("image/vnd.wap.wbmp", "", ""),
|
||||
//testFile("video/theora", "", ""),
|
||||
//testFile("application/kate", "", ""),
|
||||
//testFile("application/pkcs7-mime", "", ""),
|
||||
//testFile("image/fit", "", ""),
|
||||
//testFile("application/x-ctable2", "", ""),
|
||||
//testFile("application/x-executable", "", ""),
|
||||
//testFile("application/x-isatab", "", ""),
|
||||
//testFile("application/grass-ascii-grid", "", ""),
|
||||
// testFile("application/java-vm", "", ""),
|
||||
// testFile("application/mspowerpoint", "", ""),
|
||||
// testFile("application/x-http", "", ""),
|
||||
// testFile("application/x-rmf", "", ""),
|
||||
// testFile("application/x-ogg", "", ""),
|
||||
// testFile("video/ogg", "ogv", "quick.ogv"),
|
||||
// testFile(MIMETYPE_APPLEFILE, "", ""),
|
||||
// testFile("text/rtf", "", ""),
|
||||
// testFile("image/adrg", "", ""),
|
||||
// testFile("video/x-ogg-rgb", "", ""),
|
||||
// testFile("application/x-ngs-geoid", "", ""),
|
||||
// testFile("application/x-map", "", ""),
|
||||
// testFile("image/ceos", "", ""),
|
||||
// testFile("application/xpm", "", ""),
|
||||
// testFile("application/x-ers", "", ""),
|
||||
// testFile("video/x-ogg-yuv", "", ""),
|
||||
// testFile("application/x-isis2", "", ""),
|
||||
// testFile("application/x-nwt-grd", "", ""),
|
||||
// testFile("application/x-isis3", "", ""),
|
||||
// testFile("application/x-nwt-grc", "", ""),
|
||||
// testFile("video/daala", "", ""),
|
||||
// testFile("application/x-blx", "", ""),
|
||||
// testFile("application/x-tnef", "", ""),
|
||||
// testFile("video/x-dirac", "", ""),
|
||||
// testFile("application/x-ndf", "", ""),
|
||||
// testFile("image/vnd.wap.wbmp", "", ""),
|
||||
// testFile("video/theora", "", ""),
|
||||
// testFile("application/kate", "", ""),
|
||||
// testFile("application/pkcs7-mime", "", ""),
|
||||
// testFile("image/fit", "", ""),
|
||||
// testFile("application/x-ctable2", "", ""),
|
||||
// testFile("application/x-executable", "", ""),
|
||||
// testFile("application/x-isatab", "", ""),
|
||||
// testFile("application/grass-ascii-grid", "", ""),
|
||||
testFile(MIMETYPE_TEXT_PLAIN, "txt", "quick.txt"),
|
||||
//testFile("application/gzipped", "", ""),
|
||||
//testFile("application/x-gxf", "", ""),
|
||||
//testFile("application/x-cpg", "", ""),
|
||||
//testFile("application/x-lan", "", ""),
|
||||
//testFile("application/x-xyz", "", ""),
|
||||
// testFile("application/gzipped", "", ""),
|
||||
// testFile("application/x-gxf", "", ""),
|
||||
// testFile("application/x-cpg", "", ""),
|
||||
// testFile("application/x-lan", "", ""),
|
||||
// testFile("application/x-xyz", "", ""),
|
||||
testFile(MIMETYPE_IWORK_PAGES, "pages", "quick.pages"),
|
||||
//testFile("image/x-jbig2", "", ""),
|
||||
//testFile("image/nitf", "", ""),
|
||||
//testFile("application/mbox", "", ""),
|
||||
//testFile("application/chm", "", ""),
|
||||
//testFile("application/x-fast", "", ""),
|
||||
//testFile("application/x-gsc", "", ""),
|
||||
//testFile("application/x-deflate", "", ""),
|
||||
//testFile("application/x-grib2", "", ""),
|
||||
//testFile("image/x-ozi", "", ""),
|
||||
//testFile("application/x-pds", "", ""),
|
||||
//testFile("application/vnd.apple.iwork", "", ""),
|
||||
//testFile("application/x-usgs-dem", "", ""),
|
||||
//testFile("application/vnd.ms-excel.sheet.2", "", ""),
|
||||
//testFile("application/vnd.ms-excel.sheet.3", "", ""),
|
||||
//testFile("application/dif+xml", "", ""),
|
||||
//testFile("application/vnd.ms-excel.sheet.4", "", ""),
|
||||
//testFile("application/x-java", "", ""),
|
||||
//testFile("image/geotiff", "", ""),
|
||||
//testFile("application/x-gsag", "", ""),
|
||||
//testFile("application/x-snappy", "", ""),
|
||||
//testFile("video/x-theora", "", ""),
|
||||
//testFile("image/ntf", "", ""),
|
||||
//testFile("application/x-pdf", "", ""),
|
||||
//testFile("application/xml", "", ""),
|
||||
//testFile("application/vnd.wordperfect; version=6.x", "", ""),
|
||||
//testFile("application/pkcs7-signature", "", ""),
|
||||
//testFile("application/vnd.wordperfect; version=5.1", "", ""),
|
||||
//testFile("application/vnd.wordperfect; version=5.0", "", ""),
|
||||
//testFile("application/x-arj-compressed", "", ""),
|
||||
//testFile("application/geotopic", "", ""),
|
||||
//testFile("text/x-java-source", "java", ""),
|
||||
//testFile("audio/basic", "au", ""),
|
||||
//testFile("application/pcisdk", "", ""),
|
||||
//testFile("application/x-rik", "", ""),
|
||||
//testFile("audio/opus", "", ""),
|
||||
//testFile(MIMETYPE_IMAGE_JP2, "jp2", ""),
|
||||
//testFile("application/x-gtx", "", ""),
|
||||
//testFile("application/x-object", "", ""),
|
||||
//testFile("application/vnd.ms-wordml", "", ""),
|
||||
//testFile("image/x-wmf", "", ""),
|
||||
//testFile("application/x-rpf-toc", "", ""),
|
||||
//testFile("application/x-srtmhgt", "", ""),
|
||||
//testFile("application/x-generic-bin", "", ""),
|
||||
//testFile("text/vnd.iptc.anpa", "", ""),
|
||||
//testFile("application/x-msmetafile", "", ""),
|
||||
//testFile("application/x-wms", "", ""),
|
||||
//testFile("video/x-oggrgb", "", ""),
|
||||
//testFile("image/xcf", "", ""),
|
||||
//testFile("application/photoshop", "", ""),
|
||||
//testFile("application/x-lz4", "", ""),
|
||||
//testFile("application/x-7z-compressed", "", ""),
|
||||
//testFile("application/gff", "", ""),
|
||||
//testFile("video/x-oggyuv", "", ""),
|
||||
//testFile("application/x-msdownload", "", ""),
|
||||
//testFile("image/icns", "", ""),
|
||||
//testFile("application/x-emf", "", ""),
|
||||
//testFile("application/x-geo-pdf", "", ""),
|
||||
//testFile("video/x-ogg-uvs", "", ""),
|
||||
// testFile("image/x-jbig2", "", ""),
|
||||
// testFile("image/nitf", "", ""),
|
||||
// testFile("application/mbox", "", ""),
|
||||
// testFile("application/chm", "", ""),
|
||||
// testFile("application/x-fast", "", ""),
|
||||
// testFile("application/x-gsc", "", ""),
|
||||
// testFile("application/x-deflate", "", ""),
|
||||
// testFile("application/x-grib2", "", ""),
|
||||
// testFile("image/x-ozi", "", ""),
|
||||
// testFile("application/x-pds", "", ""),
|
||||
// testFile("application/vnd.apple.iwork", "", ""),
|
||||
// testFile("application/x-usgs-dem", "", ""),
|
||||
// testFile("application/vnd.ms-excel.sheet.2", "", ""),
|
||||
// testFile("application/vnd.ms-excel.sheet.3", "", ""),
|
||||
// testFile("application/dif+xml", "", ""),
|
||||
// testFile("application/vnd.ms-excel.sheet.4", "", ""),
|
||||
// testFile("application/x-java", "", ""),
|
||||
// testFile("image/geotiff", "", ""),
|
||||
// testFile("application/x-gsag", "", ""),
|
||||
// testFile("application/x-snappy", "", ""),
|
||||
// testFile("video/x-theora", "", ""),
|
||||
// testFile("image/ntf", "", ""),
|
||||
// testFile("application/x-pdf", "", ""),
|
||||
// testFile("application/xml", "", ""),
|
||||
// testFile("application/vnd.wordperfect; version=6.x", "", ""),
|
||||
// testFile("application/pkcs7-signature", "", ""),
|
||||
// testFile("application/vnd.wordperfect; version=5.1", "", ""),
|
||||
// testFile("application/vnd.wordperfect; version=5.0", "", ""),
|
||||
// testFile("application/x-arj-compressed", "", ""),
|
||||
// testFile("application/geotopic", "", ""),
|
||||
// testFile("text/x-java-source", "java", ""),
|
||||
// testFile("audio/basic", "au", ""),
|
||||
// testFile("application/pcisdk", "", ""),
|
||||
// testFile("application/x-rik", "", ""),
|
||||
// testFile("audio/opus", "", ""),
|
||||
// testFile(MIMETYPE_IMAGE_JP2, "jp2", ""),
|
||||
// testFile("application/x-gtx", "", ""),
|
||||
// testFile("application/x-object", "", ""),
|
||||
// testFile("application/vnd.ms-wordml", "", ""),
|
||||
// testFile("image/x-wmf", "", ""),
|
||||
// testFile("application/x-rpf-toc", "", ""),
|
||||
// testFile("application/x-srtmhgt", "", ""),
|
||||
// testFile("application/x-generic-bin", "", ""),
|
||||
// testFile("text/vnd.iptc.anpa", "", ""),
|
||||
// testFile("application/x-msmetafile", "", ""),
|
||||
// testFile("application/x-wms", "", ""),
|
||||
// testFile("video/x-oggrgb", "", ""),
|
||||
// testFile("image/xcf", "", ""),
|
||||
// testFile("application/photoshop", "", ""),
|
||||
// testFile("application/x-lz4", "", ""),
|
||||
// testFile("application/x-7z-compressed", "", ""),
|
||||
// testFile("application/gff", "", ""),
|
||||
// testFile("video/x-oggyuv", "", ""),
|
||||
// testFile("application/x-msdownload", "", ""),
|
||||
// testFile("image/icns", "", ""),
|
||||
// testFile("application/x-emf", "", ""),
|
||||
// testFile("application/x-geo-pdf", "", ""),
|
||||
// testFile("video/x-ogg-uvs", "", ""),
|
||||
testFile(MIMETYPE_VIDEO_FLV, "flv", "quick.flv"),
|
||||
//testFile("application/x-zip-compressed", "", ""),
|
||||
//testFile("application/gzip", "", ""),
|
||||
//testFile("application/x-tika-unix-dump", "", ""),
|
||||
//testFile("application/x-coasp", "", ""),
|
||||
//testFile("application/x-dipex", "", ""),
|
||||
//testFile("application/x-til", "", ""),
|
||||
//testFile("application/x-gzip", "gzip", ""),
|
||||
//testFile("application/x-gs7bg", "", ""),
|
||||
//testFile("application/x-unix-archive", "", ""),
|
||||
//testFile("application/x-elf", "", ""),
|
||||
//testFile("application/dted", "", ""),
|
||||
//testFile("application/x-rasterlite", "", ""),
|
||||
//testFile("audio/x-mp4a", "", ""),
|
||||
//testFile("application/x-gzip-compressed", "", ""),
|
||||
//testFile("application/x-chm", "", ""),
|
||||
//testFile("image/hfa", "", ""),
|
||||
// testFile("application/x-zip-compressed", "", ""),
|
||||
// testFile("application/gzip", "", ""),
|
||||
// testFile("application/x-tika-unix-dump", "", ""),
|
||||
// testFile("application/x-coasp", "", ""),
|
||||
// testFile("application/x-dipex", "", ""),
|
||||
// testFile("application/x-til", "", ""),
|
||||
// testFile("application/x-gzip", "gzip", ""),
|
||||
// testFile("application/x-gs7bg", "", ""),
|
||||
// testFile("application/x-unix-archive", "", ""),
|
||||
// testFile("application/x-elf", "", ""),
|
||||
// testFile("application/dted", "", ""),
|
||||
// testFile("application/x-rasterlite", "", ""),
|
||||
// testFile("audio/x-mp4a", "", ""),
|
||||
// testFile("application/x-gzip-compressed", "", ""),
|
||||
// testFile("application/x-chm", "", ""),
|
||||
// testFile("image/hfa", "", ""),
|
||||
|
||||
// Special test cases from the repo tests
|
||||
// ======================================
|
||||
@@ -571,7 +572,6 @@ public class TikaMetadataExtractsIT extends AbstractMetadataExtractsIT
|
||||
testFile(MIMETYPE_OPENDOCUMENT_TEXT, "odt", "quick.odt"),
|
||||
testFile(MIMETYPE_OPENDOCUMENT_TEXT_TEMPLATE, "ott", "quick.ott"),
|
||||
testFile(MIMETYPE_OPENDOCUMENT_FORMULA, "odf", "quick.odf"),
|
||||
testFile(MIMETYPE_PDF, "pdf", "quick.pdf")
|
||||
);
|
||||
testFile(MIMETYPE_PDF, "pdf", "quick.pdf"));
|
||||
}
|
||||
}
|
||||
|
@@ -31,12 +31,11 @@ import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
import org.alfresco.transform.client.model.TransformRequest;
|
||||
import org.alfresco.transform.base.messaging.AbstractQueueIT;
|
||||
import org.alfresco.transform.client.model.TransformRequest;
|
||||
|
||||
/**
|
||||
* @author Lucian Tuca
|
||||
* created on 15/01/2019
|
||||
* @author Lucian Tuca created on 15/01/2019
|
||||
*/
|
||||
public class TikaQueueIT extends AbstractQueueIT
|
||||
{
|
||||
@@ -44,16 +43,16 @@ public class TikaQueueIT extends AbstractQueueIT
|
||||
protected TransformRequest buildRequest()
|
||||
{
|
||||
return TransformRequest
|
||||
.builder()
|
||||
.withRequestId(UUID.randomUUID().toString())
|
||||
.withSourceMediaType(MIMETYPE_OPENXML_WORDPROCESSING)
|
||||
.withTargetMediaType(MIMETYPE_TEXT_PLAIN)
|
||||
.withTargetExtension("txt")
|
||||
.withSchema(1)
|
||||
.withClientData("ACS")
|
||||
.withSourceReference(UUID.randomUUID().toString())
|
||||
.withSourceSize(32L)
|
||||
.withInternalContextForTransformEngineTests()
|
||||
.build();
|
||||
.builder()
|
||||
.withRequestId(UUID.randomUUID().toString())
|
||||
.withSourceMediaType(MIMETYPE_OPENXML_WORDPROCESSING)
|
||||
.withTargetMediaType(MIMETYPE_TEXT_PLAIN)
|
||||
.withTargetExtension("txt")
|
||||
.withSchema(1)
|
||||
.withClientData("ACS")
|
||||
.withSourceReference(UUID.randomUUID().toString())
|
||||
.withSourceSize(32L)
|
||||
.withInternalContextForTransformEngineTests()
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
@@ -26,22 +26,24 @@
|
||||
*/
|
||||
package org.alfresco.transform.tika;
|
||||
|
||||
import static java.text.MessageFormat.format;
|
||||
import static java.util.function.Function.identity;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.fail;
|
||||
import static org.springframework.http.HttpStatus.OK;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import org.alfresco.transform.base.clients.HttpClient;
|
||||
import org.apache.commons.lang3.tuple.Triple;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
import org.springframework.core.io.Resource;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static java.text.MessageFormat.format;
|
||||
import static java.util.function.Function.identity;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.fail;
|
||||
import static org.springframework.http.HttpStatus.OK;
|
||||
import org.alfresco.transform.base.clients.HttpClient;
|
||||
|
||||
/**
|
||||
* @author Cezar Leahu
|
||||
@@ -50,10 +52,10 @@ public class TikaTransformationIT
|
||||
{
|
||||
private static final String ENGINE_URL = "http://localhost:8090";
|
||||
private static final Map<String, String> extensionMimetype = ImmutableMap.of(
|
||||
"html", "text/html",
|
||||
"txt", "text/plain",
|
||||
"xhtml", "application/xhtml+xml",
|
||||
"xml", "text/xml");
|
||||
"html", "text/html",
|
||||
"txt", "text/plain",
|
||||
"xhtml", "application/xhtml+xml",
|
||||
"xml", "text/xml");
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("engineTransformations")
|
||||
@@ -63,7 +65,7 @@ public class TikaTransformationIT
|
||||
final String sourceMimetype = entry.getRight();
|
||||
final String targetExtension = entry.getMiddle();
|
||||
String targetMimetype;
|
||||
//Single test to cover pdf-->csv
|
||||
// Single test to cover pdf-->csv
|
||||
if (sourceFile.contains("pdf") && targetExtension.contains("csv"))
|
||||
{
|
||||
targetMimetype = "text/csv";
|
||||
@@ -74,13 +76,13 @@ public class TikaTransformationIT
|
||||
}
|
||||
|
||||
final String descriptor = format("Transform ({0}, {1} -> {2}, {3})",
|
||||
sourceFile, sourceMimetype, targetMimetype, targetExtension);
|
||||
sourceFile, sourceMimetype, targetMimetype, targetExtension);
|
||||
try
|
||||
{
|
||||
final ResponseEntity<Resource> response = HttpClient.sendTRequest(ENGINE_URL, sourceFile, null,
|
||||
targetMimetype, targetExtension, ImmutableMap.of(
|
||||
"targetEncoding", "UTF-8",
|
||||
"sourceMimetype", sourceMimetype));
|
||||
targetMimetype, targetExtension, ImmutableMap.of(
|
||||
"targetEncoding", "UTF-8",
|
||||
"sourceMimetype", sourceMimetype));
|
||||
assertEquals(OK, response.getStatusCode(), descriptor);
|
||||
}
|
||||
catch (Exception e)
|
||||
@@ -90,80 +92,76 @@ public class TikaTransformationIT
|
||||
}
|
||||
|
||||
private static Stream<Triple<String, String, String>> allTargets(final String sourceFile,
|
||||
final String sourceMimetype)
|
||||
final String sourceMimetype)
|
||||
{
|
||||
return extensionMimetype
|
||||
.keySet()
|
||||
.stream()
|
||||
.map(k -> Triple.of(sourceFile, k, sourceMimetype));
|
||||
.keySet()
|
||||
.stream()
|
||||
.map(k -> Triple.of(sourceFile, k, sourceMimetype));
|
||||
}
|
||||
|
||||
// TODO unit tests for the following file types (for which is difficult to find file samples):
|
||||
// *.ogx (application/ogg)
|
||||
// *.cpio (application/x-cpio)
|
||||
// *.cdf (application/x-netcdf)
|
||||
// *.hdf (application/x-hdf)
|
||||
// *.ogx (application/ogg)
|
||||
// *.cpio (application/x-cpio)
|
||||
// *.cdf (application/x-netcdf)
|
||||
// *.hdf (application/x-hdf)
|
||||
public static Stream<Triple<String, String, String>> engineTransformations()
|
||||
{
|
||||
return Stream
|
||||
.of(
|
||||
allTargets("quick.doc", "application/msword"),
|
||||
allTargets("quick.docx",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
|
||||
allTargets("quick.html", "text/html"),
|
||||
allTargets("quick.jar", "application/java-archive"),
|
||||
allTargets("quick.java", "text/x-java-source"),
|
||||
Stream.of(
|
||||
Triple.of("quick.key", "html", "application/vnd.apple.keynote"),
|
||||
// Does not work, alfresco-docker-sourceMimetype-misc can handle this target mimetype, removed from engine_config.json
|
||||
Triple.of("quick.key", "txt", "application/vnd.apple.keynote"),
|
||||
Triple.of("quick.key", "xhtml", "application/vnd.apple.keynote"),
|
||||
Triple.of("quick.key", "xml", "application/vnd.apple.keynote")
|
||||
),
|
||||
allTargets("quick.msg", "application/vnd.ms-outlook"),
|
||||
Stream.of(
|
||||
Triple.of("quick.numbers", "html", "application/vnd.apple.numbers"),
|
||||
// Does not work, alfresco-docker-sourceMimetype-misc can handle this target mimetype, removed from engine_config.json
|
||||
// Triple.of("quick.numbers", "txt", "TikaAuto"),
|
||||
Triple.of("quick.numbers", "xhtml", "application/vnd.apple.numbers"),
|
||||
Triple.of("quick.numbers", "xml", "application/vnd.apple.numbers")
|
||||
),
|
||||
Stream.of(
|
||||
Triple.of("quick.pdf", "csv", "application/pdf")
|
||||
),
|
||||
allTargets("quick.odp", "application/vnd.oasis.opendocument.presentation"),
|
||||
allTargets("quick.ods", "application/vnd.oasis.opendocument.spreadsheet"),
|
||||
allTargets("quick.odt", "application/vnd.oasis.opendocument.text"),
|
||||
allTargets("quick.otp", "application/vnd.oasis.opendocument.presentation-template"),
|
||||
allTargets("quick.ots", "application/vnd.oasis.opendocument.spreadsheet-template"),
|
||||
allTargets("quick.ott", "application/vnd.oasis.opendocument.text-template"),
|
||||
Stream.of(
|
||||
Triple.of("quick.pages", "html", "application/vnd.apple.pages"),
|
||||
// Does not work, alfresco-docker-sourceMimetype-misc can handle this target mimetype, removed from engine_config.json
|
||||
// Triple.of("quick.pages", "txt", "TikaAuto"),
|
||||
Triple.of("quick.pages", "xhtml", "application/vnd.apple.pages"),
|
||||
Triple.of("quick.pages", "xml", "application/vnd.apple.pages")
|
||||
),
|
||||
allTargets("quick.pdf", "application/pdf"),
|
||||
allTargets("quick.ppt", "application/vnd.ms-powerpoint"),
|
||||
allTargets("quick.pptx",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation"),
|
||||
allTargets("quick.sxw", "application/vnd.sun.xml.writer"),
|
||||
allTargets("quick.txt", "text/plain"),
|
||||
allTargets("quick.vsd", "application/vnd.visio"),
|
||||
allTargets("quick.xls", "application/vnd.ms-excel"),
|
||||
allTargets("quick.xlsx",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
|
||||
allTargets("quick.zip", "application/zip"),
|
||||
allTargets("quick.tar", "application/x-tar"),
|
||||
allTargets("sample.rtf", "application/rtf"),
|
||||
allTargets("quick.xml", "text/xml"),
|
||||
allTargets("sample.xhtml.txt", "application/xhtml+xml"),
|
||||
allTargets("sample.rss", "application/rss+xml"),
|
||||
//allTargets("quick.rar", "application/x-rar-compressed"),
|
||||
allTargets("quick.z", "application/x-compress"),
|
||||
allTargets("quick.csv", "text/csv"),
|
||||
allTargets("quick.tar.gz", "application/x-gzip"))
|
||||
.flatMap(identity());
|
||||
.of(
|
||||
allTargets("quick.doc", "application/msword"),
|
||||
allTargets("quick.docx",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
|
||||
allTargets("quick.html", "text/html"),
|
||||
allTargets("quick.jar", "application/java-archive"),
|
||||
allTargets("quick.java", "text/x-java-source"),
|
||||
Stream.of(
|
||||
Triple.of("quick.key", "html", "application/vnd.apple.keynote"),
|
||||
// Does not work, alfresco-docker-sourceMimetype-misc can handle this target mimetype, removed from engine_config.json
|
||||
Triple.of("quick.key", "txt", "application/vnd.apple.keynote"),
|
||||
Triple.of("quick.key", "xhtml", "application/vnd.apple.keynote"),
|
||||
Triple.of("quick.key", "xml", "application/vnd.apple.keynote")),
|
||||
allTargets("quick.msg", "application/vnd.ms-outlook"),
|
||||
Stream.of(
|
||||
Triple.of("quick.numbers", "html", "application/vnd.apple.numbers"),
|
||||
// Does not work, alfresco-docker-sourceMimetype-misc can handle this target mimetype, removed from engine_config.json
|
||||
// Triple.of("quick.numbers", "txt", "TikaAuto"),
|
||||
Triple.of("quick.numbers", "xhtml", "application/vnd.apple.numbers"),
|
||||
Triple.of("quick.numbers", "xml", "application/vnd.apple.numbers")),
|
||||
Stream.of(
|
||||
Triple.of("quick.pdf", "csv", "application/pdf")),
|
||||
allTargets("quick.odp", "application/vnd.oasis.opendocument.presentation"),
|
||||
allTargets("quick.ods", "application/vnd.oasis.opendocument.spreadsheet"),
|
||||
allTargets("quick.odt", "application/vnd.oasis.opendocument.text"),
|
||||
allTargets("quick.otp", "application/vnd.oasis.opendocument.presentation-template"),
|
||||
allTargets("quick.ots", "application/vnd.oasis.opendocument.spreadsheet-template"),
|
||||
allTargets("quick.ott", "application/vnd.oasis.opendocument.text-template"),
|
||||
Stream.of(
|
||||
Triple.of("quick.pages", "html", "application/vnd.apple.pages"),
|
||||
// Does not work, alfresco-docker-sourceMimetype-misc can handle this target mimetype, removed from engine_config.json
|
||||
// Triple.of("quick.pages", "txt", "TikaAuto"),
|
||||
Triple.of("quick.pages", "xhtml", "application/vnd.apple.pages"),
|
||||
Triple.of("quick.pages", "xml", "application/vnd.apple.pages")),
|
||||
allTargets("quick.pdf", "application/pdf"),
|
||||
allTargets("quick.ppt", "application/vnd.ms-powerpoint"),
|
||||
allTargets("quick.pptx",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation"),
|
||||
allTargets("quick.sxw", "application/vnd.sun.xml.writer"),
|
||||
allTargets("quick.txt", "text/plain"),
|
||||
allTargets("quick.vsd", "application/vnd.visio"),
|
||||
allTargets("quick.xls", "application/vnd.ms-excel"),
|
||||
allTargets("quick.xlsx",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
|
||||
allTargets("quick.zip", "application/zip"),
|
||||
allTargets("quick.tar", "application/x-tar"),
|
||||
allTargets("sample.rtf", "application/rtf"),
|
||||
allTargets("quick.xml", "text/xml"),
|
||||
allTargets("sample.xhtml.txt", "application/xhtml+xml"),
|
||||
allTargets("sample.rss", "application/rss+xml"),
|
||||
// allTargets("quick.rar", "application/x-rar-compressed"),
|
||||
allTargets("quick.z", "application/x-compress"),
|
||||
allTargets("quick.csv", "text/csv"),
|
||||
allTargets("quick.tar.gz", "application/x-gzip"))
|
||||
.flatMap(identity());
|
||||
}
|
||||
}
|
||||
|
@@ -26,7 +26,15 @@
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadata.embedders;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EMBEDDER;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
import org.apache.poi.ooxml.POIXMLProperties;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.apache.tika.embedder.Embedder;
|
||||
@@ -39,18 +47,10 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
import static org.alfresco.transform.base.metadata.AbstractMetadataExtractorEmbedder.Type.EMBEDDER;
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
|
||||
/**
|
||||
* Sample POI metadata embedder to demonstrate it is possible to add custom T-Engines that will add
|
||||
* metadata. This is not production code, so no supported mimetypes exist in the {@code tika_engine_config.json}.
|
||||
* Sample POI metadata embedder to demonstrate it is possible to add custom T-Engines that will add metadata. This is not production code, so no supported mimetypes exist in the {@code tika_engine_config.json}.
|
||||
*/
|
||||
@Component
|
||||
public class PoiMetadataEmbedder extends AbstractTikaMetadataExtractorEmbeddor
|
||||
@@ -76,8 +76,7 @@ public class PoiMetadataEmbedder extends AbstractTikaMetadataExtractorEmbeddor
|
||||
|
||||
private static class SamplePoiEmbedder implements Embedder
|
||||
{
|
||||
private static final Set<MediaType> SUPPORTED_EMBED_TYPES =
|
||||
Collections.singleton(MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
|
||||
private static final Set<MediaType> SUPPORTED_EMBED_TYPES = Collections.singleton(MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
|
||||
|
||||
@Override
|
||||
public Set<MediaType> getSupportedEmbedTypes(ParseContext parseContext)
|
||||
@@ -115,19 +114,19 @@ public class PoiMetadataEmbedder extends AbstractTikaMetadataExtractorEmbeddor
|
||||
}
|
||||
switch (name)
|
||||
{
|
||||
case "author":
|
||||
coreProp.setCreator(value);
|
||||
break;
|
||||
case "title":
|
||||
coreProp.setTitle(value);
|
||||
break;
|
||||
case "description":
|
||||
coreProp.setDescription(value);
|
||||
break;
|
||||
// There are other core values but this is sample code, so we will assume it is a custom value.
|
||||
default:
|
||||
custProp.addProperty(name, value);
|
||||
break;
|
||||
case "author":
|
||||
coreProp.setCreator(value);
|
||||
break;
|
||||
case "title":
|
||||
coreProp.setTitle(value);
|
||||
break;
|
||||
case "description":
|
||||
coreProp.setDescription(value);
|
||||
break;
|
||||
// There are other core values but this is sample code, so we will assume it is a custom value.
|
||||
default:
|
||||
custProp.addProperty(name, value);
|
||||
break;
|
||||
}
|
||||
}
|
||||
workbook.write(outputStream);
|
||||
|
@@ -26,20 +26,21 @@
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class IPTCMetadataExtractorTest
|
||||
{
|
||||
IPTCMetadataExtractor extractor = new IPTCMetadataExtractor(null);
|
||||
|
||||
@Test
|
||||
public void testIptcToIso8601DateStrings() {
|
||||
String[] testStrings = { "1890:01:01", "1901:02:01 00:00:00.000Z", "1901-02-01 00:00:00.000Z",
|
||||
"1901-02-01T00:00:00.000Z", "1901:02:01T00:00+00:00", "1901:02:01 00:00+00:00" };
|
||||
String[] expected = { "1890-01-01", "1901-02-01T00:00:00.000Z", "1901-02-01T00:00:00.000Z",
|
||||
"1901-02-01T00:00:00.000Z", "1901-02-01T00:00+00:00", "1901-02-01T00:00+00:00" };
|
||||
public void testIptcToIso8601DateStrings()
|
||||
{
|
||||
String[] testStrings = {"1890:01:01", "1901:02:01 00:00:00.000Z", "1901-02-01 00:00:00.000Z",
|
||||
"1901-02-01T00:00:00.000Z", "1901:02:01T00:00+00:00", "1901:02:01 00:00+00:00"};
|
||||
String[] expected = {"1890-01-01", "1901-02-01T00:00:00.000Z", "1901-02-01T00:00:00.000Z",
|
||||
"1901-02-01T00:00:00.000Z", "1901-02-01T00:00+00:00", "1901-02-01T00:00+00:00"};
|
||||
|
||||
assertArrayEquals(expected, extractor.iptcToIso8601DateStrings(testStrings));
|
||||
}
|
||||
|
@@ -1,59 +1,61 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.parsers;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class ExifToolParserTest {
|
||||
|
||||
ExifToolParser exifToolParser = new ExifToolParser();
|
||||
|
||||
@Test
|
||||
public void testFindSeparator() {
|
||||
|
||||
String testCommand = "env FOO=${OUTPUT} exiftool -args -G1 " + ExifToolParser.SEPARATOR_SETTING
|
||||
+ " \"|||\" ${INPUT}";
|
||||
String expected = "|||";
|
||||
String actual = exifToolParser.findSeparator(testCommand);
|
||||
assertEquals(expected, actual);
|
||||
|
||||
expected = "TESTWITHOUTQUOTES";
|
||||
testCommand = "nothing matters until the " + ExifToolParser.SEPARATOR_SETTING + " " + expected
|
||||
+ " now all this extra should be ignored";
|
||||
actual = exifToolParser.findSeparator(testCommand);
|
||||
assertEquals(expected, actual);
|
||||
|
||||
expected = "Test something bonkers 112!£$%^£$^";
|
||||
testCommand = ExifToolParser.SEPARATOR_SETTING + " \""+expected+"\"";
|
||||
actual = exifToolParser.findSeparator(testCommand);
|
||||
assertEquals(expected, actual);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.parsers;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
 * Unit test for {@link ExifToolParser#findSeparator(String)}.
 *
 * Each case builds a command string containing {@link ExifToolParser#SEPARATOR_SETTING} followed by a separator value and checks that exactly that value is returned.
 */
public class ExifToolParserTest
|
||||
{
|
||||
|
||||
ExifToolParser exifToolParser = new ExifToolParser();
|
||||
|
||||
/**
 * Exercises findSeparator with a quoted separator in the middle of a command, an unquoted separator followed by extra text, and a quoted separator containing spaces and punctuation at the end of the command.
 */
@Test
|
||||
public void testFindSeparator()
|
||||
{
|
||||
|
||||
// Case 1: the separator is wrapped in double quotes and followed by more arguments; the quotes are not part of the expected value.
String testCommand = "env FOO=${OUTPUT} exiftool -args -G1 " + ExifToolParser.SEPARATOR_SETTING
|
||||
+ " \"|||\" ${INPUT}";
|
||||
String expected = "|||";
|
||||
String actual = exifToolParser.findSeparator(testCommand);
|
||||
assertEquals(expected, actual);
|
||||
|
||||
// Case 2: the separator is not quoted; only the single token after the setting is expected, the trailing text is not.
expected = "TESTWITHOUTQUOTES";
|
||||
testCommand = "nothing matters until the " + ExifToolParser.SEPARATOR_SETTING + " " + expected
|
||||
+ " now all this extra should be ignored";
|
||||
actual = exifToolParser.findSeparator(testCommand);
|
||||
assertEquals(expected, actual);
|
||||
|
||||
// Case 3: a quoted separator containing spaces and punctuation, with nothing before the setting and nothing after the closing quote.
expected = "Test something bonkers 112!£$%^£$^";
|
||||
testCommand = ExifToolParser.SEPARATOR_SETTING + " \"" + expected + "\"";
|
||||
actual = exifToolParser.findSeparator(testCommand);
|
||||
assertEquals(expected, actual);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -1,138 +1,140 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import static org.alfresco.transform.tika.transformers.Tika.NOT_EXTRACT_BOOKMARKS_TEXT;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.TARGET_ENCODING;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.TARGET_MIMETYPE;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.Mockito.clearInvocations;
|
||||
import static org.mockito.Mockito.lenient;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.spy;
|
||||
import static org.mockito.Mockito.times;
|
||||
import static org.mockito.Mockito.verify;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.alfresco.transform.base.TransformManager;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
/**
 * Checks which arguments {@link AbstractTikaTransformer}'s transform method passes on to its call method, depending on the transformer's notExtractBookmarksTextDefault value and on the "notExtractBookmarksText" and "targetEncoding" transform options.
 */
@ExtendWith(MockitoExtension.class)
|
||||
public class TikaExtractBookmarksTest
|
||||
{
|
||||
/**
 * Minimal concrete transformer used only to set notExtractBookmarksTextDefault.
 * getParser() returns null; presumably that is the null third argument seen in the verifications below (confirm against AbstractTikaTransformer).
 */
private static class TikaTestTransformer extends AbstractTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
TikaTestTransformer(boolean notExtractBookmarksTextDefault)
|
||||
{
|
||||
this.notExtractBookmarksTextDefault = notExtractBookmarksTextDefault;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Runs transform on two spied transformers (default true and default false) with four successive option sets: no options, notExtractBookmarksText=true, notExtractBookmarksText=false, and notExtractBookmarksText=false plus a targetEncoding. After each run it verifies the exact arguments received by call.
 */
@Test
|
||||
public void testNotExtractBookmarkTextDefault() throws Exception
|
||||
{
|
||||
AbstractTikaTransformer executorSpyDefaultTrue = spy(new TikaTestTransformer(true));
|
||||
AbstractTikaTransformer executorSpyDefaultFalse = spy(new TikaTestTransformer(false));
|
||||
|
||||
InputStream mockInputStream = mock(InputStream.class);
|
||||
OutputStream mockOutputStream = mock(OutputStream.class);
|
||||
TransformManager mockTransformManager = mock(TransformManager.class);
|
||||
String sourceMimetype = "sourceMimetype";
|
||||
String targetMimetype = "targetMimetype";
|
||||
String defaultEncoding = "UTF-8";
|
||||
|
||||
// stub call(...) to do nothing: there is no need to continue execution past this point, as the parameters passed to call(...) are verified later.
lenient().doNothing().when(executorSpyDefaultTrue).call(any(), any(), any(), any(), any(), any());
|
||||
lenient().doNothing().when(executorSpyDefaultFalse).call(any(), any(), any(), any(), any(), any());
|
||||
|
||||
Map<String, String> transformOptions = new HashMap<>();
|
||||
|
||||
// use empty transformOptions to test defaults
executorSpyDefaultTrue.transform(sourceMimetype, mockInputStream, targetMimetype, mockOutputStream, transformOptions, mockTransformManager);
|
||||
executorSpyDefaultFalse.transform(sourceMimetype, mockInputStream, targetMimetype, mockOutputStream, transformOptions, mockTransformManager);
|
||||
|
||||
// when default set to true, with no options passed we should get a call method with NOT_EXTRACT_BOOKMARKS_TEXT
verify(executorSpyDefaultTrue, times(1)).call(mockInputStream, mockOutputStream, null,
|
||||
NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
// when default set to false, with no options passed we should get a call method without NOT_EXTRACT_BOOKMARKS_TEXT (null in its place)
verify(executorSpyDefaultFalse, times(1)).call(mockInputStream, mockOutputStream, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
// use transforms with notExtractBookmarksText set to true
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
|
||||
transformOptions.put("notExtractBookmarksText", "true");
|
||||
executorSpyDefaultTrue.transform(sourceMimetype, mockInputStream, targetMimetype, mockOutputStream, transformOptions, mockTransformManager);
|
||||
executorSpyDefaultFalse.transform(sourceMimetype, mockInputStream, targetMimetype, mockOutputStream, transformOptions, mockTransformManager);
|
||||
|
||||
// both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT, whatever the default is
verify(executorSpyDefaultTrue, times(1)).call(mockInputStream, mockOutputStream, null,
|
||||
NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
verify(executorSpyDefaultFalse, times(1)).call(mockInputStream, mockOutputStream, null,
|
||||
NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
// use transforms with notExtractBookmarksText set to false
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
|
||||
transformOptions.replace("notExtractBookmarksText", "true", "false");
|
||||
executorSpyDefaultTrue.transform(sourceMimetype, mockInputStream, targetMimetype, mockOutputStream, transformOptions, mockTransformManager);
|
||||
executorSpyDefaultFalse.transform(sourceMimetype, mockInputStream, targetMimetype, mockOutputStream, transformOptions, mockTransformManager);
|
||||
|
||||
// neither call method should have NOT_EXTRACT_BOOKMARKS_TEXT (null is expected in its place), whatever the default is
verify(executorSpyDefaultTrue, times(1)).call(mockInputStream, mockOutputStream, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
verify(executorSpyDefaultFalse, times(1)).call(mockInputStream, mockOutputStream, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
// additionally set targetEncoding (notExtractBookmarksText is still "false") to check that it is passed through
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
|
||||
transformOptions.put("targetEncoding", "anyEncoding");
|
||||
executorSpyDefaultTrue.transform(sourceMimetype, mockInputStream, targetMimetype, mockOutputStream, transformOptions, mockTransformManager);
|
||||
executorSpyDefaultFalse.transform(sourceMimetype, mockInputStream, targetMimetype, mockOutputStream, transformOptions, mockTransformManager);
|
||||
|
||||
// neither call method should have NOT_EXTRACT_BOOKMARKS_TEXT, but the encoding should now be the requested one instead of the default
verify(executorSpyDefaultTrue, times(1)).call(mockInputStream, mockOutputStream, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + "anyEncoding");
|
||||
|
||||
verify(executorSpyDefaultFalse, times(1)).call(mockInputStream, mockOutputStream, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + "anyEncoding");
|
||||
}
|
||||
}
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.Mockito.clearInvocations;
|
||||
import static org.mockito.Mockito.lenient;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.spy;
|
||||
import static org.mockito.Mockito.times;
|
||||
import static org.mockito.Mockito.verify;
|
||||
|
||||
import static org.alfresco.transform.tika.transformers.Tika.NOT_EXTRACT_BOOKMARKS_TEXT;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.TARGET_ENCODING;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.TARGET_MIMETYPE;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
import org.alfresco.transform.base.TransformManager;
|
||||
|
||||
/**
 * Checks which arguments {@link AbstractTikaTransformer}'s transform method passes on to its call method, depending on the transformer's notExtractBookmarksTextDefault value and on the "notExtractBookmarksText" and "targetEncoding" transform options.
 */
@ExtendWith(MockitoExtension.class)
|
||||
public class TikaExtractBookmarksTest
|
||||
{
|
||||
/**
 * Minimal concrete transformer used only to set notExtractBookmarksTextDefault.
 * getParser() returns null; presumably that is the null third argument seen in the verifications below (confirm against AbstractTikaTransformer).
 */
private static class TikaTestTransformer extends AbstractTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
TikaTestTransformer(boolean notExtractBookmarksTextDefault)
|
||||
{
|
||||
this.notExtractBookmarksTextDefault = notExtractBookmarksTextDefault;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Runs transform on two spied transformers (default true and default false) with four successive option sets: no options, notExtractBookmarksText=true, notExtractBookmarksText=false, and notExtractBookmarksText=false plus a targetEncoding. After each run it verifies the exact arguments received by call.
 */
@Test
|
||||
public void testNotExtractBookmarkTextDefault() throws Exception
|
||||
{
|
||||
AbstractTikaTransformer executorSpyDefaultTrue = spy(new TikaTestTransformer(true));
|
||||
AbstractTikaTransformer executorSpyDefaultFalse = spy(new TikaTestTransformer(false));
|
||||
|
||||
InputStream mockInputStream = mock(InputStream.class);
|
||||
OutputStream mockOutputStream = mock(OutputStream.class);
|
||||
TransformManager mockTransformManager = mock(TransformManager.class);
|
||||
String sourceMimetype = "sourceMimetype";
|
||||
String targetMimetype = "targetMimetype";
|
||||
String defaultEncoding = "UTF-8";
|
||||
|
||||
// stub call(...) to do nothing: there is no need to continue execution past this point, as the parameters passed to call(...) are verified later.
lenient().doNothing().when(executorSpyDefaultTrue).call(any(), any(), any(), any(), any(), any());
|
||||
lenient().doNothing().when(executorSpyDefaultFalse).call(any(), any(), any(), any(), any(), any());
|
||||
|
||||
Map<String, String> transformOptions = new HashMap<>();
|
||||
|
||||
// use empty transformOptions to test defaults
executorSpyDefaultTrue.transform(sourceMimetype, mockInputStream, targetMimetype, mockOutputStream, transformOptions, mockTransformManager);
|
||||
executorSpyDefaultFalse.transform(sourceMimetype, mockInputStream, targetMimetype, mockOutputStream, transformOptions, mockTransformManager);
|
||||
|
||||
// when default set to true, with no options passed we should get a call method with NOT_EXTRACT_BOOKMARKS_TEXT
verify(executorSpyDefaultTrue, times(1)).call(mockInputStream, mockOutputStream, null,
|
||||
NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
// when default set to false, with no options passed we should get a call method without NOT_EXTRACT_BOOKMARKS_TEXT (null in its place)
verify(executorSpyDefaultFalse, times(1)).call(mockInputStream, mockOutputStream, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
// use transforms with notExtractBookmarksText set to true
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
|
||||
transformOptions.put("notExtractBookmarksText", "true");
|
||||
executorSpyDefaultTrue.transform(sourceMimetype, mockInputStream, targetMimetype, mockOutputStream, transformOptions, mockTransformManager);
|
||||
executorSpyDefaultFalse.transform(sourceMimetype, mockInputStream, targetMimetype, mockOutputStream, transformOptions, mockTransformManager);
|
||||
|
||||
// both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT, whatever the default is
verify(executorSpyDefaultTrue, times(1)).call(mockInputStream, mockOutputStream, null,
|
||||
NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
verify(executorSpyDefaultFalse, times(1)).call(mockInputStream, mockOutputStream, null,
|
||||
NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
// use transforms with notExtractBookmarksText set to false
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
|
||||
transformOptions.replace("notExtractBookmarksText", "true", "false");
|
||||
executorSpyDefaultTrue.transform(sourceMimetype, mockInputStream, targetMimetype, mockOutputStream, transformOptions, mockTransformManager);
|
||||
executorSpyDefaultFalse.transform(sourceMimetype, mockInputStream, targetMimetype, mockOutputStream, transformOptions, mockTransformManager);
|
||||
|
||||
// neither call method should have NOT_EXTRACT_BOOKMARKS_TEXT (null is expected in its place), whatever the default is
verify(executorSpyDefaultTrue, times(1)).call(mockInputStream, mockOutputStream, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
verify(executorSpyDefaultFalse, times(1)).call(mockInputStream, mockOutputStream, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
// additionally set targetEncoding (notExtractBookmarksText is still "false") to check that it is passed through
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
|
||||
transformOptions.put("targetEncoding", "anyEncoding");
|
||||
executorSpyDefaultTrue.transform(sourceMimetype, mockInputStream, targetMimetype, mockOutputStream, transformOptions, mockTransformManager);
|
||||
executorSpyDefaultFalse.transform(sourceMimetype, mockInputStream, targetMimetype, mockOutputStream, transformOptions, mockTransformManager);
|
||||
|
||||
// neither call method should have NOT_EXTRACT_BOOKMARKS_TEXT, but the encoding should now be the requested one instead of the default
verify(executorSpyDefaultTrue, times(1)).call(mockInputStream, mockOutputStream, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + "anyEncoding");
|
||||
|
||||
verify(executorSpyDefaultFalse, times(1)).call(mockInputStream, mockOutputStream, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + "anyEncoding");
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user