mirror of
https://github.com/Alfresco/alfresco-transform-core.git
synced 2025-08-21 18:08:37 +00:00
Save point: Simpler project structure for core t-engines
This commit is contained in:
@@ -0,0 +1,75 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika;
|
||||
|
||||
import org.alfresco.transform.base.TransformEngine;
|
||||
import org.alfresco.transform.base.probes.ProbeTestTransform;
|
||||
import org.alfresco.transform.common.TransformConfigResourceReader;
|
||||
import org.alfresco.transform.config.TransformConfig;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.Collections;
|
||||
|
||||
import static org.alfresco.transform.base.logging.StandardMessages.COMMUNITY_LICENCE;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_PDF;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
|
||||
@Component
|
||||
public class TikaTransformEngine implements TransformEngine
|
||||
{
|
||||
@Autowired
|
||||
private TransformConfigResourceReader transformConfigResourceReader;
|
||||
|
||||
@Override
|
||||
public String getTransformEngineName()
|
||||
{
|
||||
return "0010-Tika";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getStartupMessage()
|
||||
{
|
||||
return COMMUNITY_LICENCE +
|
||||
"This transformer uses Tika from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0. or in /Apache\\ 2.0.txt\n" +
|
||||
"This transformer uses ExifTool by Phil Harvey. See license at https://exiftool.org/#license. or in /Perl-Artistic-License.txt";
|
||||
}
|
||||
|
||||
@Override
|
||||
public TransformConfig getTransformConfig()
|
||||
{
|
||||
return transformConfigResourceReader.read("classpath:tika_engine_config.json");
|
||||
}
|
||||
|
||||
@Override
|
||||
public ProbeTestTransform getLivenessAndReadinessProbeTestTransform()
|
||||
{
|
||||
return new ProbeTestTransform("quick.pdf", "quick.txt",
|
||||
MIMETYPE_PDF, MIMETYPE_TEXT_PLAIN, Collections.emptyMap(),
|
||||
60, 16, 400, 10240, 60 * 30 + 1, 60 * 15 + 20);
|
||||
}
|
||||
}
|
@@ -0,0 +1,525 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.alfresco.transform.base.CustomTransformer;
|
||||
import org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor;
|
||||
import org.alfresco.transform.common.TransformException;
|
||||
import org.apache.tika.embedder.Embedder;
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.metadata.DublinCore;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.Property;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.sax.ContentHandlerDecorator;
|
||||
import org.apache.tika.sax.XHTMLContentHandler;
|
||||
import org.apache.tika.sax.xpath.Matcher;
|
||||
import org.apache.tika.sax.xpath.MatchingContentHandler;
|
||||
import org.apache.tika.sax.xpath.XPathParser;
|
||||
import org.joda.time.DateTimeZone;
|
||||
import org.joda.time.format.DateTimeFormat;
|
||||
import org.joda.time.format.DateTimeFormatter;
|
||||
import org.joda.time.format.DateTimeFormatterBuilder;
|
||||
import org.joda.time.format.DateTimeParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.Locator;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
/**
|
||||
* The parent of all Metadata Extractors which use Apache Tika under the hood. This handles all the
|
||||
* common parts of processing the files, and the common mappings.
|
||||
|
||||
* <pre>
|
||||
* <b>author:</b> -- cm:author
|
||||
* <b>title:</b> -- cm:title
|
||||
* <b>subject:</b> -- cm:description
|
||||
* <b>created:</b> -- cm:created
|
||||
* <b>comments:</b>
|
||||
* </pre>
|
||||
*
|
||||
* @author Nick Burch
|
||||
* @author adavis
|
||||
*/
|
||||
public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtractor implements CustomTransformer
|
||||
{
|
||||
protected static final String KEY_AUTHOR = "author";
|
||||
protected static final String KEY_TITLE = "title";
|
||||
protected static final String KEY_SUBJECT = "subject";
|
||||
protected static final String KEY_CREATED = "created";
|
||||
protected static final String KEY_DESCRIPTION = "description";
|
||||
protected static final String KEY_COMMENTS = "comments";
|
||||
protected static final String KEY_TAGS = DublinCore.SUBJECT.getName();
|
||||
|
||||
private static final String METADATA_SEPARATOR = ",";
|
||||
|
||||
private final DateTimeFormatter tikaUTCDateFormater;
|
||||
private final DateTimeFormatter tikaDateFormater;
|
||||
|
||||
public AbstractTikaMetadataExtractor(Type type, Logger logger)
|
||||
{
|
||||
super(type, logger);
|
||||
|
||||
// TODO Once TIKA-451 is fixed this list will get nicer
|
||||
DateTimeParser[] parsersUTC = {
|
||||
DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss'Z'").getParser(),
|
||||
DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ssZ").getParser()
|
||||
};
|
||||
DateTimeParser[] parsers = {
|
||||
DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss").getParser(),
|
||||
DateTimeFormat.forPattern("yyyy-MM-dd").getParser(),
|
||||
DateTimeFormat.forPattern("yyyy/MM/dd HH:mm:ss").getParser(),
|
||||
DateTimeFormat.forPattern("yyyy/MM/dd").getParser(),
|
||||
DateTimeFormat.forPattern("EEE MMM dd hh:mm:ss zzz yyyy").getParser()
|
||||
};
|
||||
|
||||
tikaUTCDateFormater = new DateTimeFormatterBuilder().append(null, parsersUTC).toFormatter().withZone(DateTimeZone.UTC);
|
||||
tikaDateFormater = new DateTimeFormatterBuilder().append(null, parsers).toFormatter();
|
||||
}
|
||||
|
||||
/**
|
||||
* Version which also tries the ISO-8601 formats (in order..),
|
||||
* and similar formats, which Tika makes use of
|
||||
*/
|
||||
protected Serializable makeDate(String dateStr)
|
||||
{
|
||||
// Try our formats first, in order
|
||||
try
|
||||
{
|
||||
return this.tikaUTCDateFormater.parseDateTime(dateStr).toDate();
|
||||
}
|
||||
catch (IllegalArgumentException ignore) {}
|
||||
|
||||
try
|
||||
{
|
||||
return this.tikaUTCDateFormater.withLocale(Locale.US).parseDateTime(dateStr).toDate();
|
||||
}
|
||||
catch (IllegalArgumentException ignore) {}
|
||||
|
||||
try
|
||||
{
|
||||
return this.tikaDateFormater.parseDateTime(dateStr).toDate();
|
||||
}
|
||||
catch (IllegalArgumentException ignore) {}
|
||||
|
||||
try
|
||||
{
|
||||
return this.tikaDateFormater.withLocale(Locale.US).parseDateTime(dateStr).toDate();
|
||||
}
|
||||
catch (IllegalArgumentException ignore) {}
|
||||
|
||||
// Fall back to the normal ones: We just return the String as AbstractMappingMetadataExtracter
|
||||
// convertSystemPropertyValues in the repo will do the conversion that was previously done here.
|
||||
return dateStr;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the correct Tika Parser to process the document.
|
||||
* If you don't know which you want, use {@link TikaAutoMetadataExtractor}
|
||||
* which makes use of the Tika auto-detection.
|
||||
*/
|
||||
protected abstract Parser getParser();
|
||||
|
||||
/**
|
||||
* Returns the Tika Embedder to modify
|
||||
* the document.
|
||||
*
|
||||
* @return the Tika embedder
|
||||
*/
|
||||
protected Embedder getEmbedder()
|
||||
{
|
||||
// TODO make this an abstract method once more extracters support embedding
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Do we care about the contents of the
|
||||
* extracted header, or nothing at all?
|
||||
*/
|
||||
protected boolean needHeaderContents()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Allows implementation specific mappings to be done.
|
||||
*/
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties, Map<String,String> headers)
|
||||
{
|
||||
return properties;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the document selector, used for determining whether to parse embedded resources,
|
||||
* null by default so parse all.
|
||||
*/
|
||||
protected DocumentSelector getDocumentSelector(Metadata metadata, String targetMimeType)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* By default returns a new ParseContent
|
||||
*/
|
||||
private ParseContext buildParseContext(Metadata metadata, String sourceMimeType)
|
||||
{
|
||||
ParseContext context = new ParseContext();
|
||||
DocumentSelector selector = getDocumentSelector(metadata, sourceMimeType);
|
||||
if (selector != null)
|
||||
{
|
||||
context.set(DocumentSelector.class, selector);
|
||||
}
|
||||
return context;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Serializable> extractMetadata(String sourceMimetype, Map<String, String> transformOptions,
|
||||
File sourceFile) throws Exception
|
||||
{
|
||||
Map<String, Serializable> rawProperties = new HashMap<>();
|
||||
|
||||
try (InputStream is = new FileInputStream(sourceFile))
|
||||
{
|
||||
Parser parser = getParser();
|
||||
|
||||
Metadata metadata = new Metadata();
|
||||
metadata.add(Metadata.CONTENT_TYPE, sourceMimetype);
|
||||
|
||||
ParseContext context = buildParseContext(metadata, sourceMimetype);
|
||||
|
||||
ContentHandler handler;
|
||||
Map<String,String> headers = null;
|
||||
if (needHeaderContents())
|
||||
{
|
||||
MapCaptureContentHandler headerCapture =
|
||||
new MapCaptureContentHandler();
|
||||
headers = headerCapture.tags;
|
||||
handler = new HeadContentHandler(headerCapture);
|
||||
}
|
||||
else
|
||||
{
|
||||
handler = new NullContentHandler();
|
||||
}
|
||||
|
||||
parser.parse(is, handler, metadata, context);
|
||||
|
||||
// First up, copy all the Tika metadata over
|
||||
// This allows people to map any of the Tika
|
||||
// keys onto their own content model
|
||||
for (String tikaKey : metadata.names())
|
||||
{
|
||||
// TODO review this change (part of MNT-15267) - should we really force string concatenation here !?
|
||||
putRawValue(tikaKey, getMetadataValue(metadata, Property.internalText(tikaKey)), rawProperties);
|
||||
}
|
||||
|
||||
// Now, map the common Tika metadata keys onto
|
||||
// the common Alfresco metadata keys. This allows
|
||||
// existing mapping properties files to continue
|
||||
// to work without needing any changes
|
||||
|
||||
// The simple ones
|
||||
putRawValue(KEY_AUTHOR, getMetadataValue(metadata, TikaCoreProperties.CREATOR), rawProperties);
|
||||
putRawValue(KEY_TITLE, getMetadataValue(metadata, TikaCoreProperties.TITLE), rawProperties);
|
||||
putRawValue(KEY_COMMENTS, getMetadataValue(metadata, TikaCoreProperties.COMMENTS), rawProperties);
|
||||
|
||||
// Tags
|
||||
putRawValue(KEY_TAGS, getMetadataValues(metadata, KEY_TAGS), rawProperties);
|
||||
|
||||
// Get the subject and description, despite things not
|
||||
// being nearly as consistent as one might hope
|
||||
String subject = getMetadataValue(metadata, TikaCoreProperties.SUBJECT);
|
||||
String description = getMetadataValue(metadata, TikaCoreProperties.DESCRIPTION);
|
||||
if (subject != null && description != null)
|
||||
{
|
||||
putRawValue(KEY_DESCRIPTION, description, rawProperties);
|
||||
putRawValue(KEY_SUBJECT, subject, rawProperties);
|
||||
}
|
||||
else if (subject != null)
|
||||
{
|
||||
putRawValue(KEY_DESCRIPTION, subject, rawProperties);
|
||||
putRawValue(KEY_SUBJECT, subject, rawProperties);
|
||||
}
|
||||
else if (description != null)
|
||||
{
|
||||
putRawValue(KEY_DESCRIPTION, description, rawProperties);
|
||||
putRawValue(KEY_SUBJECT, description, rawProperties);
|
||||
}
|
||||
|
||||
// Try for the dates two different ways too
|
||||
if (metadata.get(TikaCoreProperties.CREATED) != null)
|
||||
{
|
||||
putRawValue(KEY_CREATED, metadata.get(TikaCoreProperties.CREATED), rawProperties);
|
||||
}
|
||||
else if (metadata.get(TikaCoreProperties.MODIFIED) != null)
|
||||
{
|
||||
putRawValue(KEY_CREATED, metadata.get(TikaCoreProperties.MODIFIED), rawProperties);
|
||||
}
|
||||
|
||||
// If people created a specific instance
|
||||
// (eg OfficeMetadataExtractor), then allow that
|
||||
// instance to map the Tika keys onto its
|
||||
// existing namespace so that older properties
|
||||
// files continue to map correctly
|
||||
rawProperties = extractSpecific(metadata, rawProperties, headers);
|
||||
}
|
||||
|
||||
return rawProperties;
|
||||
}
|
||||
|
||||
public void embedMetadata(String sourceMimetype, Map<String, String> transformOptions,
|
||||
String sourceEncoding, InputStream inputStream,
|
||||
String targetEncoding, OutputStream outputStream) throws Exception
|
||||
{
|
||||
// TODO
|
||||
throw new TransformException(500, "TODO embedMetadata");
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated The content repository's TikaPoweredMetadataExtracter provides no non test implementations.
|
||||
* This code exists in case there are custom implementations, that need to be converted to T-Engines.
|
||||
* It is simply a copy and paste from the content repository and has received limited testing.
|
||||
*/
|
||||
@Override
|
||||
public void embedMetadata(String sourceMimetype, String targetMimetype, Map<String, String> transformOptions,
|
||||
File sourceFile, File targetFile) throws Exception
|
||||
{
|
||||
Embedder embedder = getEmbedder();
|
||||
if (embedder == null)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
Metadata metadataToEmbed = getTikaMetadata(transformOptions);
|
||||
|
||||
try (InputStream inputStream = new FileInputStream(sourceFile);
|
||||
OutputStream outputStream = new FileOutputStream(targetFile))
|
||||
{
|
||||
embedder.embed(metadataToEmbed, inputStream, outputStream, null);
|
||||
}
|
||||
}
|
||||
|
||||
private Metadata getTikaMetadata(Map<String, String> transformOptions)
|
||||
{
|
||||
Metadata metadataToEmbed = new Metadata();
|
||||
Map<String, Serializable> properties = getMetadata(transformOptions);
|
||||
for (String metadataKey : properties.keySet())
|
||||
{
|
||||
Serializable value = properties.get(metadataKey);
|
||||
if (value == null)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
if (value instanceof Collection<?>)
|
||||
{
|
||||
for (Object singleValue : (Collection<?>) value)
|
||||
{
|
||||
try
|
||||
{
|
||||
metadataToEmbed.add(metadataKey, (String)singleValue);
|
||||
}
|
||||
catch (ClassCastException e)
|
||||
{
|
||||
logger.info("Could not convert " + metadataKey + ": " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
try
|
||||
{
|
||||
metadataToEmbed.add(metadataKey, (String)value);
|
||||
}
|
||||
catch (ClassCastException e)
|
||||
{
|
||||
logger.info("Could not convert " + metadataKey + ": " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
return metadataToEmbed;
|
||||
}
|
||||
|
||||
private Serializable getMetadataValues(Metadata metadata, String key)
|
||||
{
|
||||
// Use Set to prevent duplicates.
|
||||
Set<String> valuesSet = new LinkedHashSet<String>();
|
||||
String[] values = metadata.getValues(key);
|
||||
|
||||
for (int i = 0; i < values.length; i++)
|
||||
{
|
||||
String[] parts = values[i].split(METADATA_SEPARATOR);
|
||||
|
||||
for (String subPart : parts)
|
||||
{
|
||||
valuesSet.add(subPart.trim());
|
||||
}
|
||||
}
|
||||
|
||||
Object[] objArrayValues = valuesSet.toArray();
|
||||
values = Arrays.copyOf(objArrayValues, objArrayValues.length, String[].class);
|
||||
|
||||
return values.length == 0 ? null : (values.length == 1 ? values[0] : values);
|
||||
}
|
||||
|
||||
private String getMetadataValue(Metadata metadata, Property key)
|
||||
{
|
||||
if (metadata.isMultiValued(key))
|
||||
{
|
||||
return distinct(metadata.getValues(key)).collect(Collectors.joining(", "));
|
||||
}
|
||||
else
|
||||
{
|
||||
return metadata.get(key);
|
||||
}
|
||||
}
|
||||
|
||||
protected static Stream<String> distinct(final String[] strings)
|
||||
{
|
||||
return Stream.of(strings)
|
||||
.filter(Objects::nonNull)
|
||||
.map(String::strip)
|
||||
.filter(s -> !s.isEmpty())
|
||||
.distinct();
|
||||
}
|
||||
|
||||
/**
|
||||
* This content handler will capture entries from within
|
||||
* the header of the Tika content XHTML, but ignore the
|
||||
* rest.
|
||||
*/
|
||||
protected static class HeadContentHandler extends ContentHandlerDecorator
|
||||
{
|
||||
/**
|
||||
* XHTML XPath parser.
|
||||
*/
|
||||
private static final XPathParser PARSER =
|
||||
new XPathParser("xhtml", XHTMLContentHandler.XHTML);
|
||||
|
||||
/**
|
||||
* The XPath matcher used to select the XHTML body contents.
|
||||
*/
|
||||
private static final Matcher MATCHER =
|
||||
PARSER.parse("/xhtml:html/xhtml:head/descendant:node()");
|
||||
|
||||
/**
|
||||
* Creates a content handler that passes all XHTML body events to the
|
||||
* given underlying content handler.
|
||||
*
|
||||
* @param handler content handler
|
||||
*/
|
||||
protected HeadContentHandler(ContentHandler handler)
|
||||
{
|
||||
super(new MatchingContentHandler(handler, MATCHER));
|
||||
}
|
||||
}
|
||||
/**
|
||||
* This content handler will grab all tags and attributes,
|
||||
* and record the textual content of the last seen one
|
||||
* of them.
|
||||
* Normally only used with {@link HeadContentHandler}
|
||||
*/
|
||||
protected static class MapCaptureContentHandler implements ContentHandler
|
||||
{
|
||||
protected Map<String, String> tags = new HashMap<>();
|
||||
private StringBuffer text;
|
||||
|
||||
public void characters(char[] ch, int start, int len)
|
||||
{
|
||||
if (text != null)
|
||||
{
|
||||
text.append(ch, start, len);
|
||||
}
|
||||
}
|
||||
|
||||
public void endElement(String namespace, String localname, String qname)
|
||||
{
|
||||
if (text != null && text.length() > 0)
|
||||
{
|
||||
tags.put(qname, text.toString());
|
||||
}
|
||||
text = null;
|
||||
}
|
||||
|
||||
public void startElement(String namespace, String localname, String qname, Attributes attrs)
|
||||
{
|
||||
for(int i=0; i<attrs.getLength(); i++)
|
||||
{
|
||||
tags.put(attrs.getQName(i), attrs.getValue(i));
|
||||
}
|
||||
text = new StringBuffer();
|
||||
}
|
||||
|
||||
public void endDocument() {}
|
||||
public void endPrefixMapping(String paramString) {}
|
||||
public void ignorableWhitespace(char[] paramArrayOfChar, int paramInt1, int paramInt2) {}
|
||||
public void processingInstruction(String paramString1, String paramString2) {}
|
||||
public void setDocumentLocator(Locator paramLocator) {}
|
||||
public void skippedEntity(String paramString) {}
|
||||
public void startDocument() {}
|
||||
public void startPrefixMapping(String paramString1, String paramString2) {}
|
||||
}
|
||||
|
||||
/**
|
||||
* A content handler that ignores all the content it finds.
|
||||
* Normally used when we only want the metadata, and don't
|
||||
* care about the file contents.
|
||||
*/
|
||||
protected static class NullContentHandler implements ContentHandler
|
||||
{
|
||||
public void characters(char[] paramArrayOfChar, int paramInt1, int paramInt2) {}
|
||||
public void endDocument() {}
|
||||
public void endElement(String paramString1, String paramString2, String paramString3) {}
|
||||
public void endPrefixMapping(String paramString) {}
|
||||
public void ignorableWhitespace(char[] paramArrayOfChar, int paramInt1, int paramInt2) {}
|
||||
public void processingInstruction(String paramString1, String paramString2) {}
|
||||
public void setDocumentLocator(Locator paramLocator) {}
|
||||
public void skippedEntity(String paramString) {}
|
||||
public void startDocument() {}
|
||||
public void startElement(String paramString1, String paramString2,
|
||||
String paramString3, Attributes paramAttributes) {}
|
||||
public void startPrefixMapping(String paramString1, String paramString2) {}
|
||||
}
|
||||
}
|
@@ -0,0 +1,86 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.dwg.DWGParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
/**
|
||||
* {@code "application/dwg"} and {@code "image/vnd.dwg"} metadata extractor.
|
||||
*
|
||||
* Configuration: (see DWGMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>title:</b> -- cm:title
|
||||
* <b>description:</b> -- cm:description
|
||||
* <b>author:</b> -- cm:author
|
||||
* <b>keywords:</b>
|
||||
* <b>comments:</b>
|
||||
* <b>lastauthor:</b>
|
||||
* </pre>
|
||||
*
|
||||
* @author Nick Burch
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class DWGMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(DWGMetadataExtractor.class);
|
||||
|
||||
private static final String KEY_KEYWORD = "keyword";
|
||||
private static final String KEY_LAST_AUTHOR = "lastAuthor";
|
||||
|
||||
public DWGMetadataExtractor()
|
||||
{
|
||||
super(EXTRACTOR, logger);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties, Map<String,String> headers)
|
||||
{
|
||||
putRawValue(KEY_KEYWORD, metadata.get(TikaCoreProperties.SUBJECT), properties);
|
||||
putRawValue(KEY_LAST_AUTHOR, metadata.get(TikaCoreProperties.MODIFIED), properties);
|
||||
return properties;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return new DWGParser();
|
||||
}
|
||||
}
|
@@ -0,0 +1,162 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.alfresco.transform.tika.parsers.ExifToolParser;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
@Component
|
||||
public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(IPTCMetadataExtractor.class);
|
||||
|
||||
private static Set<String> IPTC_DATE_KEYS = Set.of("XMP-photoshop:DateCreated", "XMP-iptcExt:ArtworkDateCreated");
|
||||
|
||||
private static final Pattern YEAR_IPTC = Pattern.compile("(\\d{4}[:|-]\\d{2}[:|-]\\d{2})");
|
||||
|
||||
private ExifToolParser parser;
|
||||
|
||||
public IPTCMetadataExtractor()
|
||||
{
|
||||
super(EXTRACTOR, logger);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
if (this.parser == null) {
|
||||
this.parser = new ExifToolParser();
|
||||
}
|
||||
return this.parser;
|
||||
}
|
||||
|
||||
/**
|
||||
* Because some of the mimetypes that IPTCMetadataExtractor now parse, were previously handled
|
||||
* by TikaAutoMetadataExtractor we call the TikaAutoMetadataExtractor.extractSpecific method to
|
||||
* ensure that the returned properties contains the expected entries.
|
||||
*/
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata, Map<String, Serializable> properties,
|
||||
Map<String, String> headers)
|
||||
{
|
||||
properties = new TikaAutoMetadataExtractor().extractSpecific(metadata, properties, headers);
|
||||
ExifToolParser etParser = (ExifToolParser)this.getParser();
|
||||
if (etParser.getSeparator()!=null)
|
||||
{
|
||||
for (String key : properties.keySet())
|
||||
{
|
||||
if (properties.get(key) instanceof String)
|
||||
{
|
||||
String value = (String) properties.get(key);
|
||||
String separator = etParser.getSeparator();
|
||||
if (value.contains(separator))
|
||||
{
|
||||
if (value.contains(String.format("\"%s\"",separator)))
|
||||
{
|
||||
separator = String.format("\"%s\"",separator);
|
||||
}
|
||||
String [] values = StringUtils.splitByWholeSeparator(value, separator);
|
||||
// Change dateTime format. MM converted ':' to '-'
|
||||
if (IPTC_DATE_KEYS.contains(key)){
|
||||
values = iptcToIso8601DateStrings(values);
|
||||
}
|
||||
putRawValue(key, (Serializable) Arrays.asList(values), properties);
|
||||
}
|
||||
else if (IPTC_DATE_KEYS.contains(key)) {
|
||||
// Handle property with a single date string
|
||||
putRawValue(key, (Serializable) iptcToIso8601DateString(value), properties);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return properties;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a date or date time strings into Iso8601 format <p>
|
||||
*
|
||||
* @param dateStrings
|
||||
* @return dateStrings in Iso8601 format
|
||||
* @see #iptcToIso8601DateString
|
||||
*/
|
||||
protected String[] iptcToIso8601DateStrings(String[] dateStrings)
|
||||
{
|
||||
for (int i = 0; i < dateStrings.length; i++)
|
||||
{
|
||||
dateStrings[i] = iptcToIso8601DateString(dateStrings[i]);
|
||||
}
|
||||
return dateStrings;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a date or date time string into Iso8601 format <p>
|
||||
* Converts any ':' in the year portion of a date string characters to '-'. <p>
|
||||
* Expects the year in the format YYYY:MM:DD or YYYY-MM-DD <p>
|
||||
* Will add the correct delimiter, 'T', to any dateTime strings, where | can be any char other than ,'T':
|
||||
* YYYY:MM:DD|HH:mm:ss.... or YYYY-MM-DD|HH:mm:ss....
|
||||
* <p>
|
||||
* Examples: <p><ul>
|
||||
* <li>"1919:10:16" will convert to "1919-10-16"</li>
|
||||
* <li>"1901:02:01 00:00:00.000Z" will convert to "1901-02-01T00:00:00.000Z"</li>
|
||||
* <li>"2001:02:01 16:15+00:00" will convert to "2001-02-01T16:15+00:00"</li>
|
||||
* <li>"2021-06-11 05:36-01:00" will convert to "2021-06-11T05:36-01:00"</li>
|
||||
* </ul>
|
||||
* @param dateStr
|
||||
* @return dateStr in Iso8601 format
|
||||
*/
|
||||
protected String iptcToIso8601DateString(String dateStr)
|
||||
{
|
||||
char timeSeparator = 'T';
|
||||
Matcher yearMatcher = YEAR_IPTC.matcher(dateStr);
|
||||
if (yearMatcher.find())
|
||||
{
|
||||
String year = yearMatcher.group(1);
|
||||
dateStr = yearMatcher.replaceFirst(year.replaceAll(":", "-"));
|
||||
if (dateStr.length()>year.length() && dateStr.charAt(year.length())!=timeSeparator)
|
||||
{
|
||||
dateStr = dateStr.replace(dateStr.charAt(year.length()), timeSeparator);
|
||||
}
|
||||
}
|
||||
return dateStr;
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,114 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
import org.apache.tika.metadata.XMPDM;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.mp3.Mp3Parser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* MP3 file metadata extractor.
|
||||
*
|
||||
* Configuration: (see MP3MetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>songTitle:</b> -- cm:title
|
||||
* <b>albumTitle:</b> -- audio:album
|
||||
* <b>artist:</b> -- audio:artist, cm:author
|
||||
* <b>description:</b> -- cm:description
|
||||
* <b>comment:</b> --
|
||||
* <b>yearReleased:</b> -- audio:releaseDate
|
||||
* <b>trackNumber:</b> -- audio:trackNumber
|
||||
* <b>genre:</b> -- audio:genre
|
||||
* <b>composer:</b> -- audio:composer
|
||||
* <b>lyrics:</b> --
|
||||
* </pre>
|
||||
*
|
||||
* Note - XMPDM metadata keys are also emitted, in common with
|
||||
* the other Tika powered extractors
|
||||
*
|
||||
* Uses Apache Tika
|
||||
*
|
||||
* @author Nick Burch
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class MP3MetadataExtractor extends TikaAudioMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(MP3MetadataExtractor.class);
|
||||
|
||||
private static final String KEY_SONG_TITLE = "songTitle";
|
||||
private static final String KEY_ALBUM_TITLE = "albumTitle";
|
||||
private static final String KEY_ARTIST = "artist";
|
||||
private static final String KEY_COMMENT = "comment";
|
||||
private static final String KEY_YEAR_RELEASED = "yearReleased";
|
||||
private static final String KEY_TRACK_NUMBER = "trackNumber";
|
||||
private static final String KEY_GENRE = "genre";
|
||||
private static final String KEY_COMPOSER = "composer";
|
||||
|
||||
public MP3MetadataExtractor()
|
||||
{
|
||||
super(logger);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return new Mp3Parser();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties, Map<String,String> headers)
|
||||
{
|
||||
// Do the normal Audio mappings
|
||||
super.extractSpecific(metadata, properties, headers);
|
||||
|
||||
// Now do the compatibility ones
|
||||
// We only need these for people who had pre-existing mapping
|
||||
// properties from before the proper audio model was added
|
||||
putRawValue(KEY_ALBUM_TITLE, metadata.get(XMPDM.ALBUM), properties);
|
||||
putRawValue(KEY_SONG_TITLE, metadata.get(TikaCoreProperties.TITLE), properties);
|
||||
putRawValue(KEY_ARTIST, metadata.get(XMPDM.ARTIST), properties);
|
||||
putRawValue(KEY_COMMENT, metadata.get(XMPDM.LOG_COMMENT), properties);
|
||||
putRawValue(KEY_TRACK_NUMBER, metadata.get(XMPDM.TRACK_NUMBER), properties);
|
||||
putRawValue(KEY_GENRE, metadata.get(XMPDM.GENRE), properties);
|
||||
putRawValue(KEY_YEAR_RELEASED, metadata.get(XMPDM.RELEASE_DATE), properties);
|
||||
putRawValue(KEY_COMPOSER, metadata.get(XMPDM.COMPOSER), properties);
|
||||
|
||||
// All done
|
||||
return properties;
|
||||
}
|
||||
}
|
@@ -0,0 +1,113 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.apache.tika.metadata.Message;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
/**
|
||||
* Outlook MAPI format email metadata extractor.
|
||||
*
|
||||
* Configuration: (see MailMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>sentDate:</b> -- cm:sentdate
|
||||
* <b>originator:</b> -- cm:originator, cm:author
|
||||
* <b>addressee:</b> -- cm:addressee
|
||||
* <b>addressees:</b> -- cm:addressees
|
||||
* <b>subjectLine:</b> -- cm:subjectline, cm:description
|
||||
* <b>toNames:</b> --
|
||||
* <b>ccNames:</b> --
|
||||
* <b>bccNames:</b> --
|
||||
* </pre>
|
||||
*
|
||||
* TIKA note - to/cc/bcc go into the html part, not the metadata.
|
||||
* Also, email addresses not included as yet.
|
||||
*
|
||||
* @author Kevin Roast
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class MailMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(MailMetadataExtractor.class);
|
||||
|
||||
private static final String KEY_SENT_DATE = "sentDate";
|
||||
private static final String KEY_ORIGINATOR = "originator";
|
||||
private static final String KEY_ADDRESSEE = "addressee";
|
||||
private static final String KEY_ADDRESSEES = "addressees";
|
||||
private static final String KEY_SUBJECT = "subjectLine";
|
||||
private static final String KEY_TO_NAMES = "toNames";
|
||||
private static final String KEY_CC_NAMES = "ccNames";
|
||||
private static final String KEY_BCC_NAMES = "bccNames";
|
||||
|
||||
public MailMetadataExtractor()
|
||||
{
|
||||
super(EXTRACTOR, logger);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
// The office parser does Outlook as well as Word, Excel etc
|
||||
return new OfficeParser();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties, Map<String,String> headers)
|
||||
{
|
||||
putRawValue(KEY_ORIGINATOR, metadata.get(TikaCoreProperties.CREATOR), properties);
|
||||
putRawValue(KEY_SUBJECT, metadata.get(TikaCoreProperties.TITLE), properties);
|
||||
putRawValue(KEY_DESCRIPTION, metadata.get(TikaCoreProperties.SUBJECT), properties);
|
||||
putRawValue(KEY_SENT_DATE, metadata.get(TikaCoreProperties.MODIFIED), properties);
|
||||
|
||||
// Store the TO, but not cc/bcc in the addressee field
|
||||
putRawValue(KEY_ADDRESSEE, metadata.get(Message.MESSAGE_TO), properties);
|
||||
|
||||
// Store each of To, CC and BCC in their own fields
|
||||
putRawValue(KEY_TO_NAMES, metadata.getValues(Message.MESSAGE_TO), properties);
|
||||
putRawValue(KEY_CC_NAMES, metadata.getValues(Message.MESSAGE_CC), properties);
|
||||
putRawValue(KEY_BCC_NAMES, metadata.getValues(Message.MESSAGE_BCC), properties);
|
||||
|
||||
// But store all email addresses (to/cc/bcc) in the addresses field
|
||||
putRawValue(KEY_ADDRESSEES, metadata.getValues(Message.MESSAGE_RECIPIENT_ADDRESS), properties);
|
||||
|
||||
return properties;
|
||||
}
|
||||
}
|
@@ -0,0 +1,116 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.Office;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
/**
|
||||
* Office file format metadata extractor.
|
||||
*
|
||||
* Configuration: (see OfficeMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* This extractor uses the POI library to extract the following:
|
||||
* <pre>
|
||||
* <b>author:</b> -- cm:author
|
||||
* <b>title:</b> -- cm:title
|
||||
* <b>subject:</b> -- cm:description
|
||||
* <b>createDateTime:</b> -- cm:created
|
||||
* <b>lastSaveDateTime:</b> -- cm:modified
|
||||
* <b>comments:</b>
|
||||
* <b>editTime:</b>
|
||||
* <b>format:</b>
|
||||
* <b>keywords:</b>
|
||||
* <b>lastAuthor:</b>
|
||||
* <b>lastPrinted:</b>
|
||||
* <b>osVersion:</b>
|
||||
* <b>thumbnail:</b>
|
||||
* <b>pageCount:</b>
|
||||
* <b>wordCount:</b>
|
||||
* </pre>
|
||||
*
|
||||
* Uses Apache Tika
|
||||
*
|
||||
* @author Derek Hulley
|
||||
* @author Nick Burch
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class OfficeMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(OfficeMetadataExtractor.class);
|
||||
|
||||
public static final String KEY_CREATE_DATETIME = "createDateTime";
|
||||
public static final String KEY_LAST_SAVE_DATETIME = "lastSaveDateTime";
|
||||
public static final String KEY_EDIT_TIME = "editTime";
|
||||
public static final String KEY_FORMAT = "format";
|
||||
public static final String KEY_KEYWORDS = "keywords";
|
||||
public static final String KEY_LAST_AUTHOR = "lastAuthor";
|
||||
public static final String KEY_LAST_PRINTED = "lastPrinted";
|
||||
public static final String KEY_PAGE_COUNT = "pageCount";
|
||||
public static final String KEY_PARAGRAPH_COUNT = "paragraphCount";
|
||||
public static final String KEY_WORD_COUNT = "wordCount";
|
||||
|
||||
public OfficeMetadataExtractor()
|
||||
{
|
||||
super(EXTRACTOR, logger);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return new OfficeParser();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties, Map<String,String> headers)
|
||||
{
|
||||
putRawValue(KEY_CREATE_DATETIME, metadata.get(TikaCoreProperties.CREATED), properties);
|
||||
putRawValue(KEY_LAST_SAVE_DATETIME, metadata.get(TikaCoreProperties.MODIFIED), properties);
|
||||
putRawValue(KEY_EDIT_TIME, metadata.get(TikaCoreProperties.MODIFIED), properties);
|
||||
putRawValue(KEY_FORMAT, metadata.get(TikaCoreProperties.FORMAT), properties);
|
||||
putRawValue(KEY_KEYWORDS, metadata.get(TikaCoreProperties.SUBJECT), properties);
|
||||
putRawValue(KEY_LAST_AUTHOR, metadata.get(TikaCoreProperties.MODIFIER), properties);
|
||||
putRawValue(KEY_LAST_PRINTED, metadata.get(TikaCoreProperties.PRINT_DATE), properties);
|
||||
putRawValue(KEY_PAGE_COUNT, metadata.get(Office.PAGE_COUNT), properties);
|
||||
putRawValue(KEY_PARAGRAPH_COUNT, metadata.get(Office.PARAGRAPH_COUNT), properties);
|
||||
putRawValue(KEY_WORD_COUNT, metadata.get(Office.WORD_COUNT), properties);
|
||||
return properties;
|
||||
}
|
||||
}
|
@@ -0,0 +1,174 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.odf.OpenDocumentMetaParser;
|
||||
import org.apache.tika.parser.odf.OpenDocumentParser;
|
||||
import org.apache.tika.parser.xml.ElementMetadataHandler;
|
||||
import org.apache.tika.sax.TeeContentHandler;
|
||||
import org.joda.time.format.DateTimeFormat;
|
||||
import org.joda.time.format.DateTimeFormatter;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.xml.sax.ContentHandler;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
|
||||
import static org.apache.tika.metadata.DublinCore.NAMESPACE_URI_DC;
|
||||
|
||||
/**
|
||||
* {@code "application/vnd.oasis.opendocument..."} and {@code "applicationvnd.oasis.opendocument..."} metadata extractor.
|
||||
*
|
||||
* Configuration: (see OpenDocumentMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>creationDate:</b> -- cm:created
|
||||
* <b>creator:</b> -- cm:author
|
||||
* <b>date:</b>
|
||||
* <b>description:</b> -- cm:description
|
||||
* <b>generator:</b>
|
||||
* <b>initialCreator:</b>
|
||||
* <b>keyword:</b>
|
||||
* <b>language:</b>
|
||||
* <b>printDate:</b>
|
||||
* <b>printedBy:</b>
|
||||
* <b>subject:</b>
|
||||
* <b>title:</b> -- cm:title
|
||||
* <b>All user properties</b>
|
||||
* </pre>
|
||||
*
|
||||
* Uses Apache Tika
|
||||
*
|
||||
* @author Antti Jokipii
|
||||
* @author Derek Hulley
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(OpenDocumentMetadataExtractor.class);
|
||||
|
||||
private static final String KEY_CREATION_DATE = "creationDate";
|
||||
private static final String KEY_CREATOR = "creator";
|
||||
private static final String KEY_DATE = "date";
|
||||
private static final String KEY_GENERATOR = "generator";
|
||||
private static final String KEY_INITIAL_CREATOR = "initialCreator";
|
||||
private static final String KEY_KEYWORD = "keyword";
|
||||
private static final String KEY_LANGUAGE = "language";
|
||||
private static final String KEY_ALFRESCO_CREATOR = "_alfresco:creator";
|
||||
|
||||
private static final String CUSTOM_PREFIX = "custom:";
|
||||
|
||||
private static final DateTimeFormatter dateFormatter = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss");
|
||||
|
||||
public OpenDocumentMetadataExtractor()
|
||||
{
|
||||
super(EXTRACTOR, logger);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
OpenDocumentParser parser = new OpenDocumentParser();
|
||||
parser.setMetaParser(new OpenDocumentMetaParser() {
|
||||
@Override
|
||||
protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context)
|
||||
{
|
||||
final ContentHandler superHandler = super.getContentHandler(ch, md, context);
|
||||
final ContentHandler creatorHandler = new ElementMetadataHandler(NAMESPACE_URI_DC, KEY_CREATOR, md, KEY_ALFRESCO_CREATOR);
|
||||
return new TeeContentHandler(superHandler, creatorHandler);
|
||||
}
|
||||
});
|
||||
return parser;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties, Map<String, String> headers)
|
||||
{
|
||||
putRawValue(KEY_CREATION_DATE, getDateOrNull(metadata.get(TikaCoreProperties.CREATED)), properties);
|
||||
final String creator = getCreator(metadata);
|
||||
putRawValue(KEY_CREATOR, creator, properties);
|
||||
putRawValue(KEY_AUTHOR, creator, properties);
|
||||
putRawValue(KEY_DATE, getDateOrNull(metadata.get(TikaCoreProperties.MODIFIED)), properties);
|
||||
putRawValue(KEY_DESCRIPTION, metadata.get(TikaCoreProperties.DESCRIPTION), properties);
|
||||
putRawValue(KEY_GENERATOR, metadata.get("generator"), properties);
|
||||
putRawValue(KEY_INITIAL_CREATOR, metadata.get("initial-creator"), properties);
|
||||
putRawValue(KEY_KEYWORD, metadata.get(TikaCoreProperties.SUBJECT), properties);
|
||||
putRawValue(KEY_LANGUAGE, metadata.get(TikaCoreProperties.LANGUAGE), properties);
|
||||
|
||||
// Handle user-defined properties dynamically
|
||||
Map<String, Set<String>> mapping = super.getExtractMapping();
|
||||
for (String key : mapping.keySet())
|
||||
{
|
||||
if (metadata.get(CUSTOM_PREFIX + key) != null)
|
||||
{
|
||||
putRawValue(key, metadata.get(CUSTOM_PREFIX + key), properties);
|
||||
}
|
||||
}
|
||||
|
||||
return properties;
|
||||
}
|
||||
|
||||
private String getCreator(Metadata metadata)
|
||||
{
|
||||
final List<String> creators = distinct(metadata.getValues(TikaCoreProperties.CREATOR))
|
||||
.collect(Collectors.toUnmodifiableList());
|
||||
if (creators.size() == 1)
|
||||
{
|
||||
return creators.get(0);
|
||||
}
|
||||
|
||||
return metadata.get(KEY_ALFRESCO_CREATOR);
|
||||
}
|
||||
|
||||
private Date getDateOrNull(String dateString)
|
||||
{
|
||||
if (dateString != null && dateString.length() != 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
return dateFormatter.parseDateTime(dateString).toDate();
|
||||
}
|
||||
catch (IllegalArgumentException ignore)
|
||||
{
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.alfresco.transform.tika.transformers.Tika;
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.pdf.PDFParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
/**
|
||||
* Metadata extractor for the PDF documents.
|
||||
*
|
||||
* Configuration: (see PdfBoxMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>author:</b> -- cm:author
|
||||
* <b>title:</b> -- cm:title
|
||||
* <b>subject:</b> -- cm:description
|
||||
* <b>created:</b> -- cm:created
|
||||
* </pre>
|
||||
*
|
||||
* Uses Apache Tika
|
||||
*
|
||||
* @author Jesper Steen Møller
|
||||
* @author Derek Hulley
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class PdfBoxMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(PdfBoxMetadataExtractor.class);
|
||||
|
||||
public PdfBoxMetadataExtractor()
|
||||
{
|
||||
super(EXTRACTOR, logger);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected DocumentSelector getDocumentSelector(Metadata metadata, String targetMimeType)
|
||||
{
|
||||
return Tika.pdfBoxEmbededDocumentSelector;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return new PDFParser();
|
||||
}
|
||||
}
|
@@ -0,0 +1,179 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.apache.poi.ooxml.POIXMLProperties;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.apache.tika.embedder.Embedder;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
/**
|
||||
* POI-based metadata extractor for Office 07 documents. See http://poi.apache.org/ for information on POI.
|
||||
*
|
||||
* Configuration: (see PoiMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>author:</b> -- cm:author
|
||||
* <b>title:</b> -- cm:title
|
||||
* <b>subject:</b> -- cm:description
|
||||
* <b>created:</b> -- cm:created
|
||||
* <b>Any custom property:</b> -- [not mapped]
|
||||
* </pre>
|
||||
*
|
||||
* Uses Apache Tika
|
||||
*
|
||||
* Also includes a sample POI metadata embedder to demonstrate it is possible to add custom T-Engines that will add
|
||||
* metadata. This is not production code so no supported mimetypes exist in the {@code tika_engine_config.json}.
|
||||
* Adding the following would make it available:
|
||||
*
|
||||
* <pre>
|
||||
* {
|
||||
* "transformOptions": {
|
||||
* ...
|
||||
* "metadataEmbedOptions": [
|
||||
* {"value": {"name": "metadata", "required": true}}
|
||||
* ]
|
||||
* },
|
||||
* "transformers": [
|
||||
* ...
|
||||
* {
|
||||
* "transformerName": "SamplePoiMetadataEmbedder",
|
||||
* "supportedSourceAndTargetList": [
|
||||
* ...
|
||||
* {"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"}
|
||||
* ],
|
||||
* "transformOptions": [
|
||||
* "metadataEmbedOptions"
|
||||
* ]
|
||||
* }
|
||||
* ]
|
||||
* }
|
||||
* </pre>
|
||||
|
||||
* @author Nick Burch
|
||||
* @author Neil McErlean
|
||||
* @author Dmitry Velichkevich
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class PoiMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(PoiMetadataExtractor.class);
|
||||
|
||||
public PoiMetadataExtractor()
|
||||
{
|
||||
super(EXTRACTOR, logger);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return new OOXMLParser();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Embedder getEmbedder()
|
||||
{
|
||||
return new SamplePoiEmbedder();
|
||||
}
|
||||
|
||||
private static class SamplePoiEmbedder implements Embedder
|
||||
{
|
||||
private static final Set<MediaType> SUPPORTED_EMBED_TYPES =
|
||||
Collections.singleton(MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
|
||||
|
||||
@Override
|
||||
public Set<MediaType> getSupportedEmbedTypes(ParseContext parseContext)
|
||||
{
|
||||
return SUPPORTED_EMBED_TYPES;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void embed(Metadata metadata, InputStream inputStream, OutputStream outputStream, ParseContext parseContext)
|
||||
throws IOException
|
||||
{
|
||||
XSSFWorkbook workbook = new XSSFWorkbook(inputStream);
|
||||
POIXMLProperties props = workbook.getProperties();
|
||||
|
||||
POIXMLProperties.CoreProperties coreProp = props.getCoreProperties();
|
||||
POIXMLProperties.CustomProperties custProp = props.getCustomProperties();
|
||||
|
||||
for (String name : metadata.names())
|
||||
{
|
||||
metadata.isMultiValued("description");
|
||||
String value = null;
|
||||
if (metadata.isMultiValued(name))
|
||||
{
|
||||
String[] values = metadata.getValues(name);
|
||||
StringJoiner sj = new StringJoiner(", ");
|
||||
for (String s : values)
|
||||
{
|
||||
sj.add(s);
|
||||
}
|
||||
value = sj.toString();
|
||||
}
|
||||
else
|
||||
{
|
||||
value = metadata.get(name);
|
||||
}
|
||||
switch (name)
|
||||
{
|
||||
case "author":
|
||||
coreProp.setCreator(value);
|
||||
break;
|
||||
case "title":
|
||||
coreProp.setTitle(value);
|
||||
break;
|
||||
case "description":
|
||||
coreProp.setDescription(value);
|
||||
break;
|
||||
// There are other core values but this is sample code, so we will assume it is a custom value.
|
||||
default:
|
||||
custProp.addProperty(name, value);
|
||||
break;
|
||||
}
|
||||
}
|
||||
workbook.write(outputStream);
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,178 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
import org.apache.tika.metadata.XMPDM;
|
||||
import org.apache.tika.parser.CompositeParser;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.mp4.MP4Parser;
|
||||
import org.gagravarr.tika.FlacParser;
|
||||
import org.gagravarr.tika.VorbisParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Calendar;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.readTikaConfig;
|
||||
|
||||
/**
|
||||
* A Metadata Extractor which makes use of the Apache Tika Audio Parsers to extract metadata from media files.
|
||||
* For backwards compatibility reasons, this doesn't handle the MP3 format, which has its own dedicated extractor
|
||||
* in {@link MP3MetadataExtractor}
|
||||
*
|
||||
* Configuration: (see TikaAudioMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>author:</b> -- cm:author
|
||||
* <b>title:</b> -- cm:title
|
||||
* <b>created:</b> -- cm:created
|
||||
* <b>xmpDM:artist</b> -- audio:artist
|
||||
* <b>xmpDM:composer</b> -- audio:composer
|
||||
* <b>xmpDM:engineer</b> -- audio:engineer
|
||||
* <b>xmpDM:genre</b> -- audio:genre
|
||||
* <b>xmpDM:trackNumber</b> -- audio:trackNumber
|
||||
* <b>xmpDM:releaseDate</b> -- audio:releaseDate
|
||||
* </pre>
|
||||
*
|
||||
* @author Nick Burch
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(TikaAudioMetadataExtractor.class);
|
||||
|
||||
// The Audio related parsers we use
|
||||
private static final Parser[] parsers = new Parser[] {
|
||||
new VorbisParser(),
|
||||
new FlacParser(),
|
||||
new MP4Parser()
|
||||
};
|
||||
|
||||
protected final TikaConfig tikaConfig;
|
||||
|
||||
public TikaAudioMetadataExtractor()
|
||||
{
|
||||
this(logger);
|
||||
}
|
||||
|
||||
public TikaAudioMetadataExtractor(Logger logger)
|
||||
{
|
||||
super(EXTRACTOR, logger);
|
||||
tikaConfig = readTikaConfig(logger);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return new CompositeParser(tikaConfig.getMediaTypeRegistry(), parsers);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties, Map<String,String> headers)
|
||||
{
|
||||
// Most things can go with the default Tika -> Alfresco Mapping
|
||||
// Handle the few special cases here
|
||||
|
||||
// The description is special
|
||||
putRawValue(KEY_DESCRIPTION, generateDescription(metadata), properties);
|
||||
|
||||
// The release date can be fiddly
|
||||
Serializable releaseDate = generateReleaseDate(metadata);
|
||||
putRawValue(KEY_CREATED, releaseDate, properties);
|
||||
putRawValue(XMPDM.RELEASE_DATE.getName(), releaseDate, properties);
|
||||
|
||||
return properties;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates the release date
|
||||
*/
|
||||
private Serializable generateReleaseDate(Metadata metadata)
|
||||
{
|
||||
String date = metadata.get(XMPDM.RELEASE_DATE);
|
||||
if(date == null || date.length() == 0)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
// Is it just a year?
|
||||
if(date.matches("\\d\\d\\d\\d"))
|
||||
{
|
||||
// Just a year, we need a full date
|
||||
// Go for the 1st of the 1st
|
||||
Calendar c = Calendar.getInstance();
|
||||
c.set(
|
||||
Integer.parseInt(date), Calendar.JANUARY, 1,
|
||||
0, 0, 0
|
||||
);
|
||||
c.set(Calendar.MILLISECOND, 0);
|
||||
return c.getTime();
|
||||
}
|
||||
|
||||
// Treat as a normal date
|
||||
return makeDate(date);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate the description
|
||||
*
|
||||
* @param metadata the metadata extracted from the file
|
||||
* @return the description
|
||||
*/
|
||||
private String generateDescription(Metadata metadata)
|
||||
{
|
||||
StringBuilder result = new StringBuilder();
|
||||
if (metadata.get(TikaCoreProperties.TITLE) != null)
|
||||
{
|
||||
result.append(metadata.get(TikaCoreProperties.TITLE));
|
||||
if (metadata.get(XMPDM.ALBUM) != null)
|
||||
{
|
||||
result
|
||||
.append(" - ")
|
||||
.append(metadata.get(XMPDM.ALBUM));
|
||||
}
|
||||
if (metadata.get(XMPDM.ARTIST) != null)
|
||||
{
|
||||
result
|
||||
.append(" (")
|
||||
.append(metadata.get(XMPDM.ARTIST))
|
||||
.append(")");
|
||||
}
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
}
|
@@ -0,0 +1,147 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TIFF;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.readTikaConfig;
|
||||
|
||||
/**
|
||||
* A Metadata Extractor which makes use of the Apache Tika auto-detection to select the best parser to extract the
|
||||
* metadata from a document. This will be used for all files which Tika can handle, but where no other more explicit
|
||||
* extractor is defined.
|
||||
*
|
||||
* Configuration: (see TikaAutoMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>author:</b> -- cm:author
|
||||
* <b>title:</b> -- cm:title
|
||||
* <b>subject:</b> -- cm:description
|
||||
* <b>created:</b> -- cm:created
|
||||
* <b>comments:</b>
|
||||
* <b>geo:lat:</b> -- cm:latitude
|
||||
* <b>geo:long:</b> -- cm:longitude
|
||||
* </pre>
|
||||
*
|
||||
* @author Nick Burch
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class TikaAutoMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(TikaAutoMetadataExtractor.class);
|
||||
|
||||
private static final String EXIF_IMAGE_HEIGHT_TAG = "Exif Image Height";
|
||||
private static final String EXIF_IMAGE_WIDTH_TAG = "Exif Image Width";
|
||||
private static final String JPEG_IMAGE_HEIGHT_TAG = "Image Height";
|
||||
private static final String JPEG_IMAGE_WIDTH_TAG = "Image Width";
|
||||
private static final String COMPRESSION_TAG = "Compression";
|
||||
|
||||
protected final TikaConfig tikaConfig;
|
||||
|
||||
public TikaAutoMetadataExtractor()
|
||||
{
|
||||
super(EXTRACTOR, logger);
|
||||
tikaConfig = readTikaConfig(logger);
|
||||
}
|
||||
|
||||
/**
|
||||
* Does auto-detection to select the best Tika Parser.
|
||||
*/
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return new AutoDetectParser(tikaConfig);
|
||||
}
|
||||
|
||||
/**
|
||||
* Because some editors use JPEG_IMAGE_HEIGHT_TAG when
|
||||
* saving JPEG images , a more reliable source for
|
||||
* image size are the values provided by Tika
|
||||
* and not the exif/tiff metadata read from the file
|
||||
* This will override the tiff:Image size
|
||||
* which gets embedded into the alfresco node properties
|
||||
* for jpeg files that contain such exif information
|
||||
*/
|
||||
@Override
|
||||
protected Map<String, Serializable> extractSpecific(Metadata metadata,
|
||||
Map<String, Serializable> properties, Map<String,String> headers)
|
||||
{
|
||||
if (MIMETYPE_IMAGE_JPEG.equals(metadata.get(Metadata.CONTENT_TYPE)))
|
||||
{
|
||||
//check if the image has exif information
|
||||
if (metadata.get(EXIF_IMAGE_WIDTH_TAG) != null
|
||||
&& metadata.get(EXIF_IMAGE_HEIGHT_TAG) != null
|
||||
&& metadata.get(COMPRESSION_TAG) != null)
|
||||
{
|
||||
//replace the exif size properties that will be embedded in the node with
|
||||
//the guessed dimensions from Tika
|
||||
putRawValue(TIFF.IMAGE_LENGTH.getName(), extractSize(metadata.get(EXIF_IMAGE_HEIGHT_TAG)), properties);
|
||||
putRawValue(TIFF.IMAGE_WIDTH.getName(), extractSize(metadata.get(EXIF_IMAGE_WIDTH_TAG)), properties);
|
||||
putRawValue(JPEG_IMAGE_HEIGHT_TAG, metadata.get(EXIF_IMAGE_HEIGHT_TAG), properties);
|
||||
putRawValue(JPEG_IMAGE_WIDTH_TAG, metadata.get(EXIF_IMAGE_WIDTH_TAG), properties);
|
||||
}
|
||||
}
|
||||
return properties;
|
||||
}
|
||||
|
||||
/**
|
||||
* Exif metadata for size also returns the string "pixels"
|
||||
* after the number value , this function will
|
||||
* stop at the first non digit character found in the text
|
||||
* @param sizeText string text
|
||||
* @return the size value
|
||||
*/
|
||||
private String extractSize(String sizeText)
|
||||
{
|
||||
StringBuilder sizeValue = new StringBuilder();
|
||||
for(char c : sizeText.toCharArray())
|
||||
{
|
||||
if(Character.isDigit(c))
|
||||
{
|
||||
sizeValue.append(c);
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
return sizeValue.toString();
|
||||
}
|
||||
}
|
@@ -0,0 +1,372 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.parsers;
|
||||
|
||||
import static java.nio.charset.StandardCharsets.UTF_8;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_TIFF;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStream;
|
||||
import java.io.Reader;
|
||||
import java.net.URL;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.io.output.NullOutputStream;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.io.TemporaryResources;
|
||||
import org.apache.tika.io.TikaInputStream;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.external.ExternalParser;
|
||||
import org.apache.tika.parser.external.ExternalParsersFactory;
|
||||
import org.apache.tika.parser.image.ImageParser;
|
||||
import org.apache.tika.parser.image.TiffParser;
|
||||
import org.apache.tika.parser.image.JpegParser;
|
||||
import org.apache.tika.sax.XHTMLContentHandler;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
public class ExifToolParser extends ExternalParser {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ExifToolParser.class);
|
||||
|
||||
private static final String EXIFTOOL_PARSER_CONFIG = "parsers/external/config/exiftool-parser.xml";
|
||||
|
||||
protected static final String DEFAULT_SEPARATOR = ", ";
|
||||
protected static final String SEPARATOR_SETTING = "-sep";
|
||||
|
||||
private String separator;
|
||||
|
||||
/**
 * Configures this parser from the first external parser described by the ExifTool parser config.
 * If the config yields no parsers, or cannot be read, an error is logged and this parser is left
 * unconfigured.
 */
public ExifToolParser() {
    super();
    try {
        List<ExternalParser> configured = ExternalParsersFactory.create(getExternalParserConfigURL());
        // if ExifTool is not installed then no parsers are returned
        if (configured.isEmpty()) {
            logger.error(
                    "Error creating ExifToolParser from config, ExifToolExtractions not enabled. Please check ExifTool is installed correctly.");
            return;
        }

        // Copy the settings of the first configured parser onto this instance
        ExternalParser template = configured.get(0);
        this.setCommand(template.getCommand());
        this.setIgnoredLineConsumer(template.getIgnoredLineConsumer());
        this.setMetadataExtractionPatterns(template.getMetadataExtractionPatterns());
        this.setSupportedTypes(template.getSupportedTypes());
    } catch (IOException | TikaException e) {
        logger.error("Error creating ExifToolParser from config, ExifToolExtractions not enabled: ", e);
    }
}
|
||||
|
||||
private URL getExternalParserConfigURL(){
|
||||
ClassLoader classLoader = ExifToolParser.class.getClassLoader();
|
||||
return classLoader.getResource(EXIFTOOL_PARSER_CONFIG);
|
||||
}
|
||||
|
||||
public void setSeparator(String sep) {
|
||||
this.separator = sep;
|
||||
}
|
||||
|
||||
public String getSeparator() {
|
||||
return this.separator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setCommand(String... command){
|
||||
super.setCommand(command);
|
||||
if (command.length==1) {
|
||||
setSeparator(findSeparator(command[0]));
|
||||
}
|
||||
else {
|
||||
setSeparator(DEFAULT_SEPARATOR);
|
||||
}
|
||||
}
|
||||
|
||||
protected String findSeparator(String command) {
|
||||
if (command.contains(SEPARATOR_SETTING)) {
|
||||
int start = command.indexOf(SEPARATOR_SETTING)+SEPARATOR_SETTING.length()+1;
|
||||
String separator = DEFAULT_SEPARATOR;
|
||||
if (command.charAt(start)=='\"') {
|
||||
//get all chars up to the next \"
|
||||
int end = command.indexOf("\"", start+1);
|
||||
separator = command.substring(start+1, end);
|
||||
}
|
||||
else {
|
||||
int end = command.indexOf(" ", start);
|
||||
separator = command.substring(start, end);
|
||||
}
|
||||
return separator;
|
||||
}
|
||||
return DEFAULT_SEPARATOR;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adapted from {@link org.apache.tika.parser.external.ExternalParser}
|
||||
* due to errors attempting to {@link #extractMetadata} from the errorStream in original implementation. <p>
|
||||
* Executes the configured external command and passes the given document
|
||||
* stream as a simple XHTML document to the given SAX content handler.
|
||||
* Metadata is only extracted if {@link #setMetadataExtractionPatterns(Map)}
|
||||
* has been called to set patterns.
|
||||
*/
|
||||
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
|
||||
throws IOException, SAXException, TikaException {
|
||||
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
|
||||
|
||||
MediaType mediaType = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
|
||||
TemporaryResources tmp = new TemporaryResources();
|
||||
try {
|
||||
TikaInputStream tis = TikaInputStream.get(stream, tmp);
|
||||
if (this.getSupportedTypes().contains(mediaType)) {
|
||||
parse(tis, xhtml, metadata, tmp);
|
||||
}
|
||||
switch (mediaType.getType()+"/"+mediaType.getSubtype()) {
|
||||
case MIMETYPE_IMAGE_JPEG:
|
||||
parseAdditional(new JpegParser(), tis, handler, metadata, context, mediaType);
|
||||
break;
|
||||
case MIMETYPE_IMAGE_TIFF:
|
||||
parseAdditional(new TiffParser(), tis, handler, metadata, context, mediaType);
|
||||
break;
|
||||
default:
|
||||
parseAdditional(new ImageParser(), tis, handler, metadata, context, mediaType);
|
||||
}
|
||||
} finally {
|
||||
tmp.dispose();
|
||||
}
|
||||
}
|
||||
|
||||
private void parseAdditional(Parser parser, TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext context,
|
||||
MediaType mediaType) throws IOException, SAXException, TikaException {
|
||||
if (parser.getSupportedTypes(context).contains(mediaType)) {
|
||||
parser.parse(tis, handler, metadata, context);
|
||||
}
|
||||
}
|
||||
|
||||
private void parse(TikaInputStream stream, XHTMLContentHandler xhtml, Metadata metadata, TemporaryResources tmp)
|
||||
throws IOException, SAXException, TikaException {
|
||||
boolean inputToStdIn = true;
|
||||
boolean outputFromStdOut = true;
|
||||
boolean hasPatterns = (getMetadataExtractionPatterns() != null && !getMetadataExtractionPatterns().isEmpty());
|
||||
|
||||
File output = null;
|
||||
|
||||
// Build our getCommand()
|
||||
String[] cmd;
|
||||
if (getCommand().length == 1) {
|
||||
cmd = getCommand()[0].split(" ");
|
||||
} else {
|
||||
cmd = new String[getCommand().length];
|
||||
System.arraycopy(getCommand(), 0, cmd, 0, getCommand().length);
|
||||
}
|
||||
for (int i = 0; i < cmd.length; i++) {
|
||||
if (cmd[i].indexOf(INPUT_FILE_TOKEN) != -1) {
|
||||
cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN, stream.getFile().getPath());
|
||||
inputToStdIn = false;
|
||||
}
|
||||
if (cmd[i].indexOf(OUTPUT_FILE_TOKEN) != -1) {
|
||||
output = tmp.createTemporaryFile();
|
||||
outputFromStdOut = false;
|
||||
cmd[i] = cmd[i].replace(OUTPUT_FILE_TOKEN, output.getPath());
|
||||
}
|
||||
}
|
||||
|
||||
// Execute
|
||||
Process process = null;
|
||||
try {
|
||||
if (cmd.length == 1) {
|
||||
process = Runtime.getRuntime().exec(cmd[0]);
|
||||
} else {
|
||||
process = Runtime.getRuntime().exec(cmd);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
try {
|
||||
if (inputToStdIn) {
|
||||
sendInput(process, stream);
|
||||
} else {
|
||||
process.getOutputStream().close();
|
||||
}
|
||||
|
||||
InputStream out = process.getInputStream();
|
||||
InputStream err = process.getErrorStream();
|
||||
|
||||
if (hasPatterns) {
|
||||
|
||||
if (outputFromStdOut) {
|
||||
extractOutput(out, xhtml);
|
||||
} else {
|
||||
extractMetadata(out, metadata);
|
||||
}
|
||||
} else {
|
||||
ignoreStream(err);
|
||||
|
||||
if (outputFromStdOut) {
|
||||
extractOutput(out, xhtml);
|
||||
} else {
|
||||
ignoreStream(out);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
try {
|
||||
process.waitFor();
|
||||
} catch (InterruptedException ignore) {
|
||||
}
|
||||
}
|
||||
|
||||
// Grab the output if we haven't already
|
||||
if (!outputFromStdOut) {
|
||||
extractOutput(new FileInputStream(output), xhtml);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adapted from {@link org.apache.tika.parser.external.ExternalParser}<p>
|
||||
* Starts a thread that extracts the contents of the standard output
|
||||
* stream of the given process to the given XHTML content handler.
|
||||
* The standard output stream is closed once fully processed.
|
||||
*
|
||||
* @param stream stream
|
||||
* @param xhtml XHTML content handler
|
||||
* @throws SAXException if the XHTML SAX events could not be handled
|
||||
* @throws IOException if an input error occurred
|
||||
*/
|
||||
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
|
||||
try (Reader reader = new InputStreamReader(stream, UTF_8)) {
|
||||
xhtml.startDocument();
|
||||
xhtml.startElement("p");
|
||||
char[] buffer = new char[1024];
|
||||
for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
|
||||
xhtml.characters(buffer, 0, n);
|
||||
}
|
||||
xhtml.endElement("p");
|
||||
xhtml.endDocument();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adapted from {@link org.apache.tika.parser.external.ExternalParser}<p>
|
||||
* Starts a thread that sends the contents of the given input stream
|
||||
* to the standard input stream of the given process. Potential
|
||||
* exceptions are ignored, and the standard input stream is closed
|
||||
* once fully processed. Note that the given input stream is <em>not</em>
|
||||
* closed by this method.
|
||||
*
|
||||
* @param process process
|
||||
* @param stream input stream
|
||||
*/
|
||||
private void sendInput(final Process process, final InputStream stream) {
|
||||
Thread t = new Thread() {
|
||||
public void run() {
|
||||
OutputStream stdin = process.getOutputStream();
|
||||
try {
|
||||
IOUtils.copy(stream, stdin);
|
||||
} catch (IOException e) {
|
||||
}
|
||||
}
|
||||
};
|
||||
t.start();
|
||||
try {
|
||||
t.join();
|
||||
} catch (InterruptedException ignore) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adapted from {@link org.apache.tika.parser.external.ExternalParser}<p>
|
||||
* Starts a thread that reads and discards the contents of the
|
||||
* standard stream of the given process. Potential exceptions
|
||||
* are ignored, and the stream is closed once fully processed.
|
||||
*
|
||||
* @param stream stream
|
||||
*/
|
||||
private void ignoreStream(final InputStream stream) {
|
||||
Thread t = new Thread() {
|
||||
public void run() {
|
||||
try {
|
||||
IOUtils.copy(stream, NullOutputStream.NULL_OUTPUT_STREAM);
|
||||
} catch (IOException e) {
|
||||
} finally {
|
||||
IOUtils.closeQuietly(stream);
|
||||
}
|
||||
}
|
||||
};
|
||||
t.start();
|
||||
try {
|
||||
t.join();
|
||||
} catch (InterruptedException ignore) {
|
||||
}
|
||||
}
|
||||
|
||||
private void extractMetadata(final InputStream stream, final Metadata metadata) {
|
||||
Thread t = new Thread() {
|
||||
public void run() {
|
||||
BufferedReader reader;
|
||||
reader = new BufferedReader(new InputStreamReader(stream, UTF_8));
|
||||
try {
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
for (Pattern p : getMetadataExtractionPatterns().keySet()) {
|
||||
Matcher m = p.matcher(line);
|
||||
if (m.find()) {
|
||||
if (getMetadataExtractionPatterns().get(p) != null
|
||||
&& !getMetadataExtractionPatterns().get(p).equals("")) {
|
||||
metadata.add(getMetadataExtractionPatterns().get(p), m.group(1));
|
||||
} else {
|
||||
metadata.add(m.group(1), m.group(2));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// Ignore
|
||||
} finally {
|
||||
IOUtils.closeQuietly(reader);
|
||||
IOUtils.closeQuietly(stream);
|
||||
}
|
||||
}
|
||||
};
|
||||
t.start();
|
||||
try {
|
||||
t.join();
|
||||
} catch (InterruptedException ignore) {
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,120 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.parsers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.PushbackInputStream;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.poi.poifs.common.POIFSConstants;
|
||||
import org.apache.poi.util.IOUtils;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
///////// THIS FILE WAS A COPY OF THE CODE IN alfresco-repository /////////////
|
||||
|
||||
/**
|
||||
* <a href="http://tika.apache.org/Apache Tika">Apache Tika</a> assumes that
|
||||
* you either know exactly what your content is, or that
|
||||
* you'll leave it to auto-detection.
|
||||
* Within Alfresco, we usually do know. However, from time
|
||||
* to time, we don't know if we have one of the old or one
|
||||
* of the new office files (eg .xls and .xlsx).
|
||||
* This class automatically selects the appropriate
|
||||
* old (OLE2) or new (OOXML) Tika parser as required.
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public class TikaOfficeDetectParser implements Parser
|
||||
{
|
||||
private final Parser ole2Parser = new OfficeParser();
|
||||
private final Parser ooxmlParser = new OOXMLParser();
|
||||
|
||||
public Set<MediaType> getSupportedTypes(ParseContext parseContext)
|
||||
{
|
||||
Set<MediaType> types = new HashSet<>();
|
||||
types.addAll(ole2Parser.getSupportedTypes(parseContext));
|
||||
types.addAll(ooxmlParser.getSupportedTypes(parseContext));
|
||||
return types;
|
||||
}
|
||||
|
||||
public void parse(InputStream stream,
|
||||
ContentHandler handler, Metadata metadata,
|
||||
ParseContext parseContext) throws IOException, SAXException,
|
||||
TikaException
|
||||
{
|
||||
byte[] initial4 = new byte[4];
|
||||
InputStream wrapped;
|
||||
// Preserve TikaInputStreams as TikaInputStreams as they require less memory to process
|
||||
if (stream.markSupported())
|
||||
{
|
||||
stream.mark(initial4.length);
|
||||
IOUtils.readFully(stream, initial4);
|
||||
stream.reset();
|
||||
wrapped = stream;
|
||||
}
|
||||
else
|
||||
{
|
||||
PushbackInputStream inp = new PushbackInputStream(stream, 4);
|
||||
IOUtils.readFully(inp, initial4);
|
||||
inp.unread(initial4);
|
||||
wrapped = inp;
|
||||
}
|
||||
|
||||
// Which is it?
|
||||
if (initial4[0] == POIFSConstants.OOXML_FILE_HEADER[0] &&
|
||||
initial4[1] == POIFSConstants.OOXML_FILE_HEADER[1] &&
|
||||
initial4[2] == POIFSConstants.OOXML_FILE_HEADER[2] &&
|
||||
initial4[3] == POIFSConstants.OOXML_FILE_HEADER[3])
|
||||
{
|
||||
ooxmlParser.parse(wrapped, handler, metadata, parseContext);
|
||||
}
|
||||
else
|
||||
{
|
||||
ole2Parser.parse(wrapped, handler, metadata, parseContext);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated This method will be removed in Apache Tika 1.0.
|
||||
*/
|
||||
public void parse(InputStream stream,
|
||||
ContentHandler handler, Metadata metadata)
|
||||
throws IOException, SAXException, TikaException
|
||||
{
|
||||
parse(stream, handler, metadata, new ParseContext());
|
||||
}
|
||||
}
|
@@ -0,0 +1,42 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import javax.annotation.PostConstruct;
|
||||
|
||||
@Component
|
||||
public class ArchiveTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return tika.packageParser;
|
||||
}
|
||||
}
|
@@ -0,0 +1,146 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.alfresco.transform.base.CustomTransformer;
|
||||
import org.alfresco.transform.base.logging.LogEntry;
|
||||
import org.alfresco.transform.common.RequestParamMap;
|
||||
import org.alfresco.transform.common.TransformException;
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Map;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
import static java.lang.Boolean.parseBoolean;
|
||||
|
||||
public abstract class GenericTikaTransformer implements CustomTransformer
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(GenericTikaTransformer.class);
|
||||
|
||||
@Value("${transform.core.tika.pdfBox.notExtractBookmarksTextDefault:false}")
|
||||
boolean notExtractBookmarksTextDefault;
|
||||
|
||||
@Autowired
|
||||
protected Tika tika;
|
||||
|
||||
protected abstract Parser getParser();
|
||||
|
||||
protected DocumentSelector getDocumentSelector()
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getTransformerName()
|
||||
{
|
||||
String simpleClassName = getClass().getSimpleName();
|
||||
return simpleClassName.substring(0, simpleClassName.length()-"Transformer".length());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transform(String sourceMimetype, String sourceEncoding, InputStream inputStream,
|
||||
String targetMimetype, String targetEncoding, OutputStream outputStream,
|
||||
Map<String, String> transformOptions) throws Exception
|
||||
{
|
||||
// TODO
|
||||
throw new TransformException(500, "TODO GenericTikaTransformer transform with InputStreams");
|
||||
}
|
||||
|
||||
public void transform(String transformName, String sourceMimetype, String targetMimetype,
|
||||
Map<String, String> transformOptions, File sourceFile, File targetFile)
|
||||
throws Exception
|
||||
{
|
||||
final boolean includeContents = parseBoolean(
|
||||
transformOptions.getOrDefault(RequestParamMap.INCLUDE_CONTENTS, "false"));
|
||||
final boolean notExtractBookmarksText = parseBoolean(
|
||||
transformOptions.getOrDefault(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT, String.valueOf(notExtractBookmarksTextDefault)));
|
||||
final String targetEncoding = transformOptions.getOrDefault("targetEncoding", "UTF-8");
|
||||
if (transformOptions.get(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT) == null && notExtractBookmarksTextDefault)
|
||||
{
|
||||
logger.trace("notExtractBookmarksText default value has been overridden to {}", notExtractBookmarksTextDefault);
|
||||
}
|
||||
call(sourceFile, targetFile, transformName,
|
||||
includeContents ? Tika.INCLUDE_CONTENTS : null,
|
||||
notExtractBookmarksText ? Tika.NOT_EXTRACT_BOOKMARKS_TEXT : null,
|
||||
Tika.TARGET_MIMETYPE + targetMimetype, Tika.TARGET_ENCODING + targetEncoding);
|
||||
}
|
||||
|
||||
void call(File sourceFile, File targetFile, String... args)
|
||||
{
|
||||
Parser parser = getParser();
|
||||
DocumentSelector documentSelector = getDocumentSelector();
|
||||
args = buildArgs(sourceFile, targetFile, args);
|
||||
tika.transform(parser, documentSelector, args);
|
||||
}
|
||||
|
||||
private static String[] buildArgs(File sourceFile, File targetFile, String[] args)
|
||||
{
|
||||
ArrayList<String> methodArgs = new ArrayList<>(args.length + 2);
|
||||
StringJoiner sj = new StringJoiner(" ");
|
||||
for (String arg : args)
|
||||
{
|
||||
addArg(methodArgs, sj, arg);
|
||||
}
|
||||
|
||||
addFileArg(methodArgs, sj, sourceFile);
|
||||
addFileArg(methodArgs, sj, targetFile);
|
||||
|
||||
LogEntry.setOptions(sj.toString());
|
||||
|
||||
return methodArgs.toArray(new String[0]);
|
||||
}
|
||||
|
||||
private static void addArg(ArrayList<String> methodArgs, StringJoiner sj, String arg)
|
||||
{
|
||||
if (arg != null)
|
||||
{
|
||||
sj.add(arg);
|
||||
methodArgs.add(arg);
|
||||
}
|
||||
}
|
||||
|
||||
private static void addFileArg(ArrayList<String> methodArgs, StringJoiner sj, File arg)
|
||||
{
|
||||
if (arg != null)
|
||||
{
|
||||
String path = arg.getAbsolutePath();
|
||||
int i = path.lastIndexOf('.');
|
||||
String ext = i == -1 ? "???" : path.substring(i + 1);
|
||||
sj.add(ext);
|
||||
methodArgs.add(path);
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class OOXMLTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return tika.ooXmlParser;
|
||||
}
|
||||
}
|
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class OfficeTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return tika.officeParser;
|
||||
}
|
||||
}
|
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class OutlookMsgTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return tika.officeParser;
|
||||
}
|
||||
}
|
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class PdfBoxTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return tika.pdfParser;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected DocumentSelector getDocumentSelector()
|
||||
{
|
||||
return tika.pdfBoxEmbededDocumentSelector;
|
||||
}
|
||||
}
|
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class PoiTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return tika.tikaOfficeDetectParser;
|
||||
}
|
||||
}
|
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class TextMiningTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return tika.officeParser;
|
||||
}
|
||||
}
|
@@ -0,0 +1,446 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import org.alfresco.transform.tika.parsers.TikaOfficeDetectParser;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.EmptyParser;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||
import org.apache.tika.parser.pdf.PDFParser;
|
||||
import org.apache.tika.parser.pdf.PDFParserConfig;
|
||||
import org.apache.tika.parser.pkg.PackageParser;
|
||||
import org.apache.tika.sax.BodyContentHandler;
|
||||
import org.apache.tika.sax.ExpandedTitleContentHandler;
|
||||
import org.slf4j.Logger;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.TransformerConfigurationException;
|
||||
import javax.xml.transform.sax.SAXTransformerFactory;
|
||||
import javax.xml.transform.sax.TransformerHandler;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Writer;
|
||||
import java.net.URL;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_PNG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_TIFF;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_CSV;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XHTML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XML;
|
||||
|
||||
@Component
|
||||
public class Tika
|
||||
{
|
||||
public static final String ARCHIVE = "Archive";
|
||||
public static final String OUTLOOK_MSG = "OutlookMsg";
|
||||
public static final String PDF_BOX = "PdfBox";
|
||||
public static final String OFFICE = "Office";
|
||||
public static final String POI = "Poi";
|
||||
public static final String OOXML = "OOXML";
|
||||
public static final String TIKA_AUTO = "TikaAuto";
|
||||
public static final String TEXT_MINING = "TextMining";
|
||||
|
||||
public static final String TARGET_MIMETYPE = "--targetMimetype=";
|
||||
public static final String TARGET_ENCODING = "--targetEncoding=";
|
||||
public static final String INCLUDE_CONTENTS = "--includeContents";
|
||||
public static final String NOT_EXTRACT_BOOKMARKS_TEXT = "--notExtractBookmarksText";
|
||||
|
||||
public static final String CSV = "csv";
|
||||
public static final String DOC = "doc";
|
||||
public static final String DOCX = "docx";
|
||||
public static final String HTML = "html";
|
||||
public static final String MSG = "msg";
|
||||
public static final String PDF = "pdf";
|
||||
public static final String PPTX = "pptx";
|
||||
public static final String TXT = "txt";
|
||||
public static final String XHTML = "xhtml";
|
||||
// NOTE(review): "xslx" looks like a transposition of the usual "xlsx" spreadsheet extension - confirm with callers before changing the name or the value.
public static final String XSLX = "xslx";
|
||||
public static final String XML = "xml";
|
||||
public static final String ZIP = "zip";
|
||||
|
||||
public static final Parser packageParser = new PackageParser();
|
||||
public static final Parser pdfParser = new PDFParser();
|
||||
public static final Parser officeParser = new OfficeParser();
|
||||
public final Parser autoDetectParser;
|
||||
public static final Parser ooXmlParser = new OOXMLParser();
|
||||
public static final Parser tikaOfficeDetectParser = new TikaOfficeDetectParser();
|
||||
public final PDFParserConfig pdfParserConfig = new PDFParserConfig();
|
||||
|
||||
public static final DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector()
|
||||
{
|
||||
private final List<String> disabledMediaTypes = ImmutableList.of(MIMETYPE_IMAGE_JPEG,
|
||||
MIMETYPE_IMAGE_TIFF, MIMETYPE_IMAGE_PNG);
|
||||
|
||||
@Override
|
||||
public boolean select(Metadata metadata)
|
||||
{
|
||||
String contentType = metadata.get(Metadata.CONTENT_TYPE);
|
||||
if (contentType == null || contentType.equals("") || disabledMediaTypes == null)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return !disabledMediaTypes.contains(contentType);
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Creates the component, building the auto detect parser from the {@code tika-config.xml} resource.
 *
 * @throws TikaException if reading the Tika configuration fails
 * @throws IOException   if reading the Tika configuration fails
 * @throws SAXException  if reading the Tika configuration fails
 */
public Tika() throws TikaException, IOException, SAXException
{
    this.autoDetectParser = new AutoDetectParser(readTikaConfig());
}
|
||||
|
||||
public static TikaConfig readTikaConfig(Logger logger)
|
||||
{
|
||||
try
|
||||
{
|
||||
return readTikaConfig();
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
logger.error("Failed to read tika-config.xml", e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static TikaConfig readTikaConfig() throws TikaException, IOException, SAXException
|
||||
{
|
||||
ClassLoader classLoader = Tika.class.getClassLoader();
|
||||
URL tikaConfigXml = classLoader.getResource("tika-config.xml");
|
||||
return new TikaConfig(tikaConfigXml);
|
||||
}
|
||||
|
||||
// Extracts parameters form args
|
||||
public void transform(Parser parser, DocumentSelector documentSelector, String[] args)
|
||||
{
|
||||
String transform = null;
|
||||
String targetMimetype = null;
|
||||
String targetEncoding = null;
|
||||
String sourceFilename = null;
|
||||
String targetFilename = null;
|
||||
Boolean includeContents = null;
|
||||
Boolean notExtractBookmarksText = null;
|
||||
|
||||
for (String arg : args)
|
||||
{
|
||||
if (arg.startsWith("--"))
|
||||
{
|
||||
if (INCLUDE_CONTENTS.startsWith(arg))
|
||||
{
|
||||
getValue(arg, false, includeContents, INCLUDE_CONTENTS);
|
||||
includeContents = true;
|
||||
}
|
||||
else if (arg.startsWith(TARGET_ENCODING))
|
||||
{
|
||||
targetEncoding = getValue(arg, true, targetEncoding, TARGET_ENCODING);
|
||||
}
|
||||
else if (arg.startsWith(TARGET_MIMETYPE))
|
||||
{
|
||||
targetMimetype = getValue(arg, true, targetMimetype, TARGET_MIMETYPE);
|
||||
}
|
||||
else if (arg.startsWith(NOT_EXTRACT_BOOKMARKS_TEXT))
|
||||
{
|
||||
getValue(arg, false, notExtractBookmarksText, NOT_EXTRACT_BOOKMARKS_TEXT);
|
||||
notExtractBookmarksText = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new IllegalArgumentException("Unexpected argument " + arg);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (transform == null)
|
||||
{
|
||||
transform = arg;
|
||||
}
|
||||
else if (sourceFilename == null)
|
||||
{
|
||||
sourceFilename = arg;
|
||||
}
|
||||
else if (targetFilename == null)
|
||||
{
|
||||
targetFilename = arg;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new IllegalArgumentException("Unexpected argument " + arg);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (targetFilename == null)
|
||||
{
|
||||
throw new IllegalArgumentException("Missing arguments");
|
||||
}
|
||||
includeContents = includeContents == null ? false : includeContents;
|
||||
notExtractBookmarksText = notExtractBookmarksText == null ? false : notExtractBookmarksText;
|
||||
|
||||
transform(parser, documentSelector, includeContents, notExtractBookmarksText, sourceFilename,
|
||||
targetFilename, targetMimetype, targetEncoding);
|
||||
}
|
||||
|
||||
private String getValue(String arg, boolean valueExpected, Object value, String optionName)
|
||||
{
|
||||
if (value != null)
|
||||
{
|
||||
throw new IllegalArgumentException("Duplicate " + optionName);
|
||||
}
|
||||
String stringValue = arg.substring(optionName.length()).trim();
|
||||
if (!valueExpected && stringValue.length() > 0)
|
||||
{
|
||||
throw new IllegalArgumentException("Unexpected value with " + optionName);
|
||||
}
|
||||
if (valueExpected && stringValue.length() == 0)
|
||||
{
|
||||
throw new IllegalArgumentException("Expected value with " + optionName);
|
||||
}
|
||||
return stringValue;
|
||||
}
|
||||
|
||||
private void transform(Parser parser, DocumentSelector documentSelector,
|
||||
Boolean includeContents,
|
||||
Boolean notExtractBookmarksText,
|
||||
String sourceFilename,
|
||||
String targetFilename, String targetMimetype, String targetEncoding)
|
||||
{
|
||||
|
||||
try (InputStream is = new BufferedInputStream(new FileInputStream(sourceFilename));
|
||||
OutputStream os = new FileOutputStream(targetFilename);
|
||||
Writer ow = new BufferedWriter(new OutputStreamWriter(os, targetEncoding)))
|
||||
{
|
||||
Metadata metadata = new Metadata();
|
||||
ParseContext context = buildParseContext(documentSelector, includeContents,
|
||||
notExtractBookmarksText);
|
||||
ContentHandler handler = getContentHandler(targetMimetype, ow);
|
||||
|
||||
parser.parse(is, handler, metadata, context);
|
||||
}
|
||||
catch (SAXException | TikaException | IOException e)
|
||||
{
|
||||
throw new IllegalStateException(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
private ContentHandler getContentHandler(String targetMimetype, Writer output)
|
||||
{
|
||||
try
|
||||
{
|
||||
ContentHandler handler;
|
||||
if (MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
|
||||
{
|
||||
handler = new BodyContentHandler(output);
|
||||
}
|
||||
else
|
||||
{
|
||||
SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
|
||||
TransformerHandler transformerHandler;
|
||||
transformerHandler = factory.newTransformerHandler();
|
||||
transformerHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
|
||||
transformerHandler.setResult(new StreamResult(output));
|
||||
handler = transformerHandler;
|
||||
|
||||
if (MIMETYPE_HTML.equals(targetMimetype))
|
||||
{
|
||||
transformerHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, HTML);
|
||||
return new ExpandedTitleContentHandler(transformerHandler);
|
||||
}
|
||||
else if (MIMETYPE_XHTML.equals(targetMimetype) ||
|
||||
MIMETYPE_XML.equals(targetMimetype))
|
||||
{
|
||||
transformerHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, XML);
|
||||
}
|
||||
else if (MIMETYPE_TEXT_CSV.equals(targetMimetype))
|
||||
{
|
||||
handler = new CsvContentHandler(output);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new IllegalArgumentException("Invalid target mimetype " + targetMimetype);
|
||||
}
|
||||
}
|
||||
return handler;
|
||||
}
|
||||
catch (TransformerConfigurationException e)
|
||||
{
|
||||
throw new IllegalStateException(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A wrapper around the normal Tika BodyContentHandler for CSV rather encoding than tab separated.
|
||||
*/
|
||||
protected static class CsvContentHandler extends BodyContentHandler
|
||||
{
|
||||
private static final char[] comma = new char[]{','};
|
||||
private static final Pattern all_nums = Pattern.compile("[\\d\\.\\-\\+]+");
|
||||
|
||||
private boolean inCell = false;
|
||||
private boolean needsComma = false;
|
||||
|
||||
protected CsvContentHandler(Writer output)
|
||||
{
|
||||
super(output);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void ignorableWhitespace(char[] ch, int start, int length)
|
||||
throws SAXException
|
||||
{
|
||||
if (length == 1 && ch[0] == '\t')
|
||||
{
|
||||
// Ignore tabs, as they mess up the CSV output
|
||||
}
|
||||
else
|
||||
{
|
||||
super.ignorableWhitespace(ch, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void characters(char[] ch, int start, int length)
|
||||
throws SAXException
|
||||
{
|
||||
if (inCell)
|
||||
{
|
||||
StringBuffer t = new StringBuffer(new String(ch, start, length));
|
||||
|
||||
// Quote if not all numbers
|
||||
if (all_nums.matcher(t).matches())
|
||||
{
|
||||
super.characters(ch, start, length);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = t.length() - 1; i >= 0; i--)
|
||||
{
|
||||
if (t.charAt(i) == '\"')
|
||||
{
|
||||
// Double up double quotes
|
||||
t.insert(i, '\"');
|
||||
i--;
|
||||
}
|
||||
}
|
||||
t.insert(0, '\"');
|
||||
t.append('\"');
|
||||
char[] c = t.toString().toCharArray();
|
||||
super.characters(c, 0, c.length);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
super.characters(ch, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startElement(String uri, String localName, String name,
|
||||
Attributes atts) throws SAXException
|
||||
{
|
||||
if (localName.equals("td"))
|
||||
{
|
||||
inCell = true;
|
||||
if (needsComma)
|
||||
{
|
||||
super.characters(comma, 0, 1);
|
||||
needsComma = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
super.startElement(uri, localName, name, atts);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void endElement(String uri, String localName, String name)
|
||||
throws SAXException
|
||||
{
|
||||
if (localName.equals("td"))
|
||||
{
|
||||
needsComma = true;
|
||||
inCell = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (localName.equals("tr"))
|
||||
{
|
||||
needsComma = false;
|
||||
}
|
||||
super.endElement(uri, localName, name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private ParseContext buildParseContext(DocumentSelector documentSelector,
|
||||
Boolean includeContents, Boolean notExtractBookmarksText)
|
||||
{
|
||||
ParseContext context = new ParseContext();
|
||||
|
||||
if (documentSelector != null)
|
||||
{
|
||||
context.set(DocumentSelector.class, documentSelector);
|
||||
}
|
||||
|
||||
if (notExtractBookmarksText.equals(true))
|
||||
{
|
||||
pdfParserConfig.setExtractBookmarksText(false);
|
||||
// pdfParserConfig is set to override default settings
|
||||
context.set(PDFParserConfig.class, pdfParserConfig);
|
||||
}
|
||||
|
||||
// If Archive transform
|
||||
if (includeContents != null)
|
||||
{
|
||||
context.set(Parser.class, includeContents ? autoDetectParser : new EmptyParser());
|
||||
}
|
||||
|
||||
return context;
|
||||
}
|
||||
}
|
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class TikaAutoTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return tika.autoDetectParser;
|
||||
}
|
||||
}
|
@@ -0,0 +1,12 @@
|
||||
#
|
||||
# DWGMetadataExtracter - default mapping
|
||||
#
|
||||
# author: Nick Burch
|
||||
|
||||
# Namespaces
|
||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
|
||||
# Mappings
|
||||
author=cm:author
|
||||
title=cm:title
|
||||
description=cm:description
|
@@ -0,0 +1,141 @@
|
||||
#
|
||||
# IPTCMetadataExtracter - default mapping
|
||||
#
|
||||
# author: David Edwards
|
||||
|
||||
# Namespaces
|
||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
|
||||
# IPTC
|
||||
namespace.prefix.iptcxmp=http://www.alfresco.org/model/content/metadata/IPTCXMP/1.0
|
||||
namespace.prefix.dc=http://purl.org/dc/elements/1.1/
|
||||
namespace.prefix.Iptc4xmpCore=http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/
|
||||
namespace.prefix.Iptc4xmpExt=http://iptc.org/std/Iptc4xmpExt/2008-02-29/
|
||||
namespace.prefix.photoshop=http://ns.adobe.com/photoshop/1.0/
|
||||
namespace.prefix.plus=http://ns.useplus.org/ldf/xmp/1.0/
|
||||
namespace.prefix.xmpRights=http://ns.adobe.com/xap/1.0/rights/
|
||||
namespace.prefix.stDim=http://ns.adobe.com/xap/1.0/sType/Dimensions
|
||||
|
||||
# Exif
|
||||
namespace.prefix.exif=http://www.alfresco.org/model/exif/1.0
|
||||
|
||||
# Mappings from TikaAutoMetadataExtracter
|
||||
author=cm:author
|
||||
title=cm:title
|
||||
description=cm:description
|
||||
created=cm:created
|
||||
|
||||
geo\:lat=cm:latitude
|
||||
geo\:long=cm:longitude
|
||||
|
||||
tiff\:ImageWidth=exif:pixelXDimension
|
||||
tiff\:ImageLength=exif:pixelYDimension
|
||||
tiff\:Make=exif:manufacturer
|
||||
tiff\:Model=exif:model
|
||||
tiff\:Software=exif:software
|
||||
tiff\:Orientation=exif:orientation
|
||||
tiff\:XResolution=exif:xResolution
|
||||
tiff\:YResolution=exif:yResolution
|
||||
tiff\:ResolutionUnit=exif:resolutionUnit
|
||||
exif\:Flash=exif:flash
|
||||
exif\:ExposureTime=exif:exposureTime
|
||||
exif\:FNumber=exif:fNumber
|
||||
exif\:FocalLength=exif:focalLength
|
||||
exif\:IsoSpeedRatings=exif:isoSpeedRatings
|
||||
exif\:DateTimeOriginal=exif:dateTimeOriginal
|
||||
|
||||
# IPTC Mappings
|
||||
XMP-dc\:Description=dc:description
|
||||
XMP-dc\:Subject=dc:subject
|
||||
XMP-dc\:Creator=dc:creator
|
||||
XMP-dc\:Rights=dc:rights
|
||||
XMP-dc\:Title=dc:title
|
||||
XMP-iptcCore\:CreatorCountry=Iptc4xmpCore:CiAdrCtry
|
||||
XMP-iptcCore\:CountryCode=Iptc4xmpCore:CountryCode
|
||||
XMP-iptcCore\:CreatorAddress=Iptc4xmpCore:CiAdrExtadr
|
||||
XMP-iptcCore\:CreatorCity=Iptc4xmpCore:CiAdrCity
|
||||
XMP-iptcCore\:CreatorPostalCode=Iptc4xmpCore:CiAdrPcode
|
||||
XMP-iptcCore\:CreatorRegion=Iptc4xmpCore:CiAdrRegion
|
||||
XMP-iptcCore\:CreatorWorkEmail=Iptc4xmpCore:CiEmailWork
|
||||
XMP-iptcCore\:CreatorWorkTelephone=Iptc4xmpCore:CiTelWork
|
||||
XMP-iptcCore\:CreatorWorkURL=Iptc4xmpCore:CiUrlWork
|
||||
XMP-iptcCore\:IntellectualGenre=Iptc4xmpCore:IntellectualGenre
|
||||
XMP-iptcCore\:Location=Iptc4xmpCore:Location
|
||||
XMP-iptcCore\:Scene=Iptc4xmpCore:Scene
|
||||
XMP-iptcCore\:SubjectCode=Iptc4xmpCore:SubjectCode
|
||||
XMP-photoshop\:AuthorsPosition=photoshop:AuthorsPosition
|
||||
XMP-photoshop\:CaptionWriter=photoshop:CaptionWriter
|
||||
XMP-photoshop\:Category=photoshop:Category
|
||||
XMP-photoshop\:City=photoshop:City
|
||||
XMP-photoshop\:Country=photoshop:Country
|
||||
XMP-photoshop\:Credit=photoshop:Credit
|
||||
XMP-photoshop\:DateCreated=photoshop:DateCreated
|
||||
XMP-photoshop\:Headline=photoshop:Headline
|
||||
XMP-photoshop\:Instructions=photoshop:Instructions
|
||||
XMP-photoshop\:Source=photoshop:Source
|
||||
XMP-photoshop\:State=photoshop:State
|
||||
XMP-photoshop\:SupplementalCategories=photoshop:SupplementalCategories
|
||||
XMP-photoshop\:TransmissionReference=photoshop:TransmissionReference
|
||||
XMP-photoshop\:Urgency=photoshop:Urgency
|
||||
XMP-xmpRights\:UsageTerms=xmpRights:UsageTerms
|
||||
|
||||
XMP-iptcExt\:AdditionalModelInformation=Iptc4xmpExt:AddlModelInfo
|
||||
XMP-iptcExt\:ArtworkCopyrightNotice=Iptc4xmpExt:AOCopyrightNotice
|
||||
XMP-iptcExt\:ArtworkCreator=Iptc4xmpExt:AOCreator
|
||||
XMP-iptcExt\:ArtworkDateCreated=Iptc4xmpExt:AODateCreated
|
||||
XMP-iptcExt\:ArtworkSource=Iptc4xmpExt:AOSource
|
||||
XMP-iptcExt\:ArtworkSourceInventoryNo=Iptc4xmpExt:AOSourceInvNo
|
||||
XMP-iptcExt\:ArtworkTitle=Iptc4xmpExt:AOTitle
|
||||
XMP-iptcExt\:ControlledVocabularyTerm=Iptc4xmpExt:CVterm
|
||||
XMP-iptcExt\:DigitalImageGUID=Iptc4xmpExt:DigImageGUID
|
||||
XMP-iptcExt\:DigitalSourceFileType=Iptc4xmpExt:DigitalSourcefileType
|
||||
XMP-iptcExt\:DigitalSourceType=Iptc4xmpExt:DigitalSourceType
|
||||
XMP-iptcExt\:Event=Iptc4xmpExt:Event
|
||||
XMP-iptcExt\:IPTCLastEdited=Iptc4xmpExt:IptcLastEdited
|
||||
XMP-iptcExt\:LocationCreatedCity=Iptc4xmpExt:LocationCreatedCity
|
||||
XMP-iptcExt\:LocationCreatedCountryCode=Iptc4xmpExt:LocationCreatedCountryCode
|
||||
XMP-iptcExt\:LocationCreatedCountryName=Iptc4xmpExt:LocationCreatedCountryName
|
||||
XMP-iptcExt\:LocationCreatedProvinceState=Iptc4xmpExt:LocationCreatedProvinceState
|
||||
XMP-iptcExt\:LocationCreatedSublocation=Iptc4xmpExt:LocationCreatedSublocation
|
||||
XMP-iptcExt\:LocationCreatedWorldRegion=Iptc4xmpExt:LocationCreatedWorldRegion
|
||||
XMP-iptcExt\:LocationShownCity=Iptc4xmpExt:LocationShownCity
|
||||
XMP-iptcExt\:LocationShownCountryCode=Iptc4xmpExt:LocationShownCountryCode
|
||||
XMP-iptcExt\:LocationShownCountryName=Iptc4xmpExt:LocationShownCountryName
|
||||
XMP-iptcExt\:LocationShownProvinceState=Iptc4xmpExt:LocationShownProvinceState
|
||||
XMP-iptcExt\:LocationShownSublocation=Iptc4xmpExt:LocationShownSublocation
|
||||
XMP-iptcExt\:LocationShownWorldRegion=Iptc4xmpExt:LocationShownWorldRegion
|
||||
XMP-iptcExt\:MaxAvailHeight=Iptc4xmpExt:MaxAvailHeight
|
||||
XMP-iptcExt\:MaxAvailWidth=Iptc4xmpExt:MaxAvailWidth
|
||||
XMP-iptcExt\:ModelAge=Iptc4xmpExt:ModelAge
|
||||
XMP-iptcExt\:OrganisationInImageCode=Iptc4xmpExt:OrganisationInImageCode
|
||||
XMP-iptcExt\:OrganisationInImageName=Iptc4xmpExt:OrganisationInImageName
|
||||
XMP-iptcExt\:PersonInImage=Iptc4xmpExt:PersonInImage
|
||||
XMP-iptcExt\:RegistryItemID=Iptc4xmpExt:RegItemId
|
||||
XMP-iptcExt\:RegistryOrganisationID=Iptc4xmpExt:RegOrgId
|
||||
XMP-plus\:CopyrightOwnerID=plus:CopyrightOwnerID
|
||||
XMP-plus\:CopyrightOwnerName=plus:CopyrightOwnerName
|
||||
XMP-plus\:ImageCreatorID=plus:ImageCreatorID
|
||||
XMP-plus\:ImageCreatorName=plus:ImageCreatorName
|
||||
XMP-plus\:ImageSupplierID=plus:ImageSupplierID
|
||||
XMP-plus\:ImageSupplierImageID=plus:ImageSupplierImageID
|
||||
XMP-plus\:ImageSupplierName=plus:ImageSupplierName
|
||||
XMP-plus\:LicensorCity=plus:LicensorCity
|
||||
XMP-plus\:LicensorCountry=plus:LicensorCountry
|
||||
XMP-plus\:LicensorEmail=plus:LicensorEmail
|
||||
XMP-plus\:LicensorExtendedAddress=plus:LicensorExtendedAddress
|
||||
XMP-plus\:LicensorID=plus:LicensorID
|
||||
XMP-plus\:LicensorName=plus:LicensorName
|
||||
XMP-plus\:LicensorPostalCode=plus:LicensorPostalCode
|
||||
XMP-plus\:LicensorRegion=plus:LicensorRegion
|
||||
XMP-plus\:LicensorStreetAddress=plus:LicensorStreetAddress
|
||||
XMP-plus\:LicensorTelephone1=plus:LicensorTelephone1
|
||||
XMP-plus\:LicensorTelephone2=plus:LicensorTelephone2
|
||||
XMP-plus\:LicensorURL=plus:LicensorURL
|
||||
XMP-plus\:MinorModelAgeDisclosure=plus:MinorModelAgeDisclosure
|
||||
XMP-plus\:ModelReleaseID=plus:ModelReleaseID
|
||||
XMP-plus\:ModelReleaseStatus=plus:ModelReleaseStatus
|
||||
XMP-plus\:PLUSVersion=plus:Version
|
||||
XMP-plus\:PropertyReleaseID=plus:PropertyReleaseID
|
||||
XMP-plus\:PropertyReleaseStatus=plus:PropertyReleaseStatus
|
||||
|
||||
stDim\:unit=stDim:unit
|
@@ -0,0 +1,30 @@
|
||||
#
|
||||
# MP3MetadataExtracter - default mapping
|
||||
#
|
||||
# author: Derek Hulley
|
||||
|
||||
# Namespaces
|
||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
namespace.prefix.audio=http://www.alfresco.org/model/audio/1.0
|
||||
|
||||
# Core mappings
|
||||
author=cm:author
|
||||
title=cm:title
|
||||
description=cm:description
|
||||
created=cm:created
|
||||
|
||||
# Audio descriptive mappings
|
||||
xmpDM\:album=audio:album
|
||||
xmpDM\:artist=audio:artist
|
||||
xmpDM\:composer=audio:composer
|
||||
xmpDM\:engineer=audio:engineer
|
||||
xmpDM\:genre=audio:genre
|
||||
xmpDM\:trackNumber=audio:trackNumber
|
||||
xmpDM\:releaseDate=audio:releaseDate
|
||||
#xmpDM:logComment
|
||||
|
||||
# Audio specific mappings
|
||||
xmpDM\:audioSampleRate=audio:sampleRate
|
||||
xmpDM\:audioSampleType=audio:sampleType
|
||||
xmpDM\:audioChannelType=audio:channelType
|
||||
xmpDM\:audioCompressor=audio:compressor
|
@@ -0,0 +1,14 @@
|
||||
#
|
||||
# MailMetadataExtracter - default mapping
|
||||
#
|
||||
# author: Derek Hulley
|
||||
|
||||
# Namespaces
|
||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
|
||||
# Mappings
|
||||
sentDate=cm:sentdate
|
||||
originator=cm:originator, cm:author
|
||||
addressee=cm:addressee
|
||||
addressees=cm:addressees
|
||||
subjectLine=cm:subjectline, cm:description
|
@@ -0,0 +1,14 @@
|
||||
#
|
||||
# OfficeMetadataExtracter - default mapping
|
||||
#
|
||||
# author: Derek Hulley
|
||||
|
||||
# Namespaces
|
||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
|
||||
# Mappings
|
||||
author=cm:author
|
||||
title=cm:title
|
||||
subject=cm:description
|
||||
createDateTime=cm:created
|
||||
lastSaveDateTime=cm:modified
|
@@ -0,0 +1,21 @@
|
||||
#
|
||||
# OpenDocumentMetadataExtracter - default mapping
|
||||
#
|
||||
# author: Derek Hulley
|
||||
|
||||
# Namespaces
|
||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
|
||||
# Mappings
|
||||
creationDate=cm:created
|
||||
creator=cm:author
|
||||
date=
|
||||
description=
|
||||
generator=
|
||||
initialCreator=
|
||||
keyword=
|
||||
language=
|
||||
printDate=
|
||||
printedBy=
|
||||
subject=cm:description
|
||||
title=cm:title
|
@@ -0,0 +1,13 @@
|
||||
#
|
||||
# PdfBoxMetadataExtracter - default mapping
|
||||
#
|
||||
# author: Derek Hulley
|
||||
|
||||
# Namespaces
|
||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
|
||||
# Mappings
|
||||
author=cm:author
|
||||
title=cm:title
|
||||
subject=cm:description
|
||||
created=cm:created
|
@@ -0,0 +1,13 @@
|
||||
#
|
||||
# PoiMetadataExtracter - default mapping
|
||||
#
|
||||
# author: Neil McErlean
|
||||
|
||||
# Namespaces
|
||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
|
||||
# Mappings
|
||||
author=cm:author
|
||||
title=cm:title
|
||||
description=cm:description
|
||||
created=cm:created
|
@@ -0,0 +1,34 @@
|
||||
#
|
||||
# TikaAudioMetadataExtracter - audio mapping
|
||||
#
|
||||
# This is used to map from the Tika audio metadata onto your
|
||||
# content model. This will be used for any Audio content
|
||||
# for which an explicit extractor isn't defined
|
||||
#
|
||||
# author: Nick Burch
|
||||
|
||||
# Namespaces
|
||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
namespace.prefix.audio=http://www.alfresco.org/model/audio/1.0
|
||||
|
||||
# Core mappings
|
||||
author=cm:author
|
||||
title=cm:title
|
||||
description=cm:description
|
||||
created=cm:created
|
||||
|
||||
# Audio descriptive mappings
|
||||
xmpDM\:album=audio:album
|
||||
xmpDM\:artist=audio:artist
|
||||
xmpDM\:composer=audio:composer
|
||||
xmpDM\:engineer=audio:engineer
|
||||
xmpDM\:genre=audio:genre
|
||||
xmpDM\:trackNumber=audio:trackNumber
|
||||
xmpDM\:releaseDate=audio:releaseDate
|
||||
#xmpDM:logComment
|
||||
|
||||
# Audio specific mappings
|
||||
xmpDM\:audioSampleRate=audio:sampleRate
|
||||
xmpDM\:audioSampleType=audio:sampleType
|
||||
xmpDM\:audioChannelType=audio:channelType
|
||||
xmpDM\:audioCompressor=audio:compressor
|
@@ -0,0 +1,52 @@
|
||||
#
|
||||
# TikaAutoMetadataExtracter - default mapping
|
||||
#
|
||||
# This is used to map from the Tika and standard namespaces
|
||||
# onto your content model. This will be used for any
|
||||
# content for which an explicit extractor isn't defined,
|
||||
# by using Tika's auto-selection facilities.
|
||||
#
|
||||
# author: Nick Burch
|
||||
|
||||
# Namespaces
|
||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
namespace.prefix.exif=http://www.alfresco.org/model/exif/1.0
|
||||
namespace.prefix.audio=http://www.alfresco.org/model/audio/1.0
|
||||
|
||||
# Mappings
|
||||
author=cm:author
|
||||
title=cm:title
|
||||
description=cm:description
|
||||
created=cm:created
|
||||
|
||||
geo\:lat=cm:latitude
|
||||
geo\:long=cm:longitude
|
||||
|
||||
tiff\:ImageWidth=exif:pixelXDimension
|
||||
tiff\:ImageLength=exif:pixelYDimension
|
||||
tiff\:Make=exif:manufacturer
|
||||
tiff\:Model=exif:model
|
||||
tiff\:Software=exif:software
|
||||
tiff\:Orientation=exif:orientation
|
||||
tiff\:XResolution=exif:xResolution
|
||||
tiff\:YResolution=exif:yResolution
|
||||
tiff\:ResolutionUnit=exif:resolutionUnit
|
||||
exif\:Flash=exif:flash
|
||||
exif\:ExposureTime=exif:exposureTime
|
||||
exif\:FNumber=exif:fNumber
|
||||
exif\:FocalLength=exif:focalLength
|
||||
exif\:IsoSpeedRatings=exif:isoSpeedRatings
|
||||
exif\:DateTimeOriginal=exif:dateTimeOriginal
|
||||
|
||||
xmpDM\:album=audio:album
|
||||
xmpDM\:artist=audio:artist
|
||||
xmpDM\:composer=audio:composer
|
||||
xmpDM\:engineer=audio:engineer
|
||||
xmpDM\:genre=audio:genre
|
||||
xmpDM\:trackNumber=audio:trackNumber
|
||||
xmpDM\:releaseDate=audio:releaseDate
|
||||
#xmpDM:logComment
|
||||
xmpDM\:audioSampleRate=audio:sampleRate
|
||||
xmpDM\:audioSampleType=audio:sampleType
|
||||
xmpDM\:audioChannelType=audio:channelType
|
||||
xmpDM\:audioCompressor=audio:compressor
|
8
engines/tika/src/main/resources/application-default.yaml
Normal file
8
engines/tika/src/main/resources/application-default.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
queue:
|
||||
engineRequestQueue: ${TRANSFORM_ENGINE_REQUEST_QUEUE:org.alfresco.transform.engine.tika.acs}
|
||||
transform:
|
||||
core:
|
||||
version: @project.version@
|
||||
tika:
|
||||
pdfBox:
|
||||
notExtractBookmarksTextDefault: ${PDFBOX_NOTEXTRACTBOOKMARKS_DEFAULT:false}
|
201
engines/tika/src/main/resources/licenses/3rd-party/Apache 2.0.txt
vendored
Normal file
201
engines/tika/src/main/resources/licenses/3rd-party/Apache 2.0.txt
vendored
Normal file
@@ -0,0 +1,201 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
127
engines/tika/src/main/resources/licenses/3rd-party/Perl-Artistic-license.txt
vendored
Normal file
127
engines/tika/src/main/resources/licenses/3rd-party/Perl-Artistic-license.txt
vendored
Normal file
@@ -0,0 +1,127 @@
|
||||
The "Artistic License"
|
||||
|
||||
Preamble
|
||||
|
||||
The intent of this document is to state the conditions under which a
|
||||
Package may be copied, such that the Copyright Holder maintains some
|
||||
semblance of artistic control over the development of the package,
|
||||
while giving the users of the package the right to use and distribute
|
||||
the Package in a more-or-less customary fashion, plus the right to make
|
||||
reasonable modifications.
|
||||
|
||||
Definitions:
|
||||
|
||||
"Package" refers to the collection of files distributed by the
|
||||
Copyright Holder, and derivatives of that collection of files
|
||||
created through textual modification.
|
||||
|
||||
"Standard Version" refers to such a Package if it has not been
|
||||
modified, or has been modified in accordance with the wishes
|
||||
of the Copyright Holder as specified below.
|
||||
|
||||
"Copyright Holder" is whoever is named in the copyright or
|
||||
copyrights for the package.
|
||||
|
||||
"You" is you, if you're thinking about copying or distributing
|
||||
this Package.
|
||||
|
||||
"Reasonable copying fee" is whatever you can justify on the
|
||||
basis of media cost, duplication charges, time of people involved,
|
||||
and so on. (You will not be required to justify it to the
|
||||
Copyright Holder, but only to the computing community at large
|
||||
as a market that must bear the fee.)
|
||||
|
||||
"Freely Available" means that no fee is charged for the item
|
||||
itself, though there may be fees involved in handling the item.
|
||||
It also means that recipients of the item may redistribute it
|
||||
under the same conditions they received it.
|
||||
|
||||
1. You may make and give away verbatim copies of the source form of the
|
||||
Standard Version of this Package without restriction, provided that you
|
||||
duplicate all of the original copyright notices and associated disclaimers.
|
||||
|
||||
2. You may apply bug fixes, portability fixes and other modifications
|
||||
derived from the Public Domain or from the Copyright Holder. A Package
|
||||
modified in such a way shall still be considered the Standard Version.
|
||||
|
||||
3. You may otherwise modify your copy of this Package in any way, provided
|
||||
that you insert a prominent notice in each changed file stating how and
|
||||
when you changed that file, and provided that you do at least ONE of the
|
||||
following:
|
||||
|
||||
a) place your modifications in the Public Domain or otherwise make them
|
||||
Freely Available, such as by posting said modifications to Usenet or
|
||||
an equivalent medium, or placing the modifications on a major archive
|
||||
site such as uunet.uu.net, or by allowing the Copyright Holder to include
|
||||
your modifications in the Standard Version of the Package.
|
||||
|
||||
b) use the modified Package only within your corporation or organization.
|
||||
|
||||
c) rename any non-standard executables so the names do not conflict
|
||||
with standard executables, which must also be provided, and provide
|
||||
a separate manual page for each non-standard executable that clearly
|
||||
documents how it differs from the Standard Version.
|
||||
|
||||
d) make other distribution arrangements with the Copyright Holder.
|
||||
|
||||
4. You may distribute the programs of this Package in object code or
|
||||
executable form, provided that you do at least ONE of the following:
|
||||
|
||||
a) distribute a Standard Version of the executables and library files,
|
||||
together with instructions (in the manual page or equivalent) on where
|
||||
to get the Standard Version.
|
||||
|
||||
b) accompany the distribution with the machine-readable source of
|
||||
the Package with your modifications.
|
||||
|
||||
c) give non-standard executables non-standard names, and clearly
|
||||
document the differences in manual pages (or equivalent), together
|
||||
with instructions on where to get the Standard Version.
|
||||
|
||||
d) make other distribution arrangements with the Copyright Holder.
|
||||
|
||||
5. You may charge a reasonable copying fee for any distribution of this
|
||||
Package. You may charge any fee you choose for support of this
|
||||
Package. You may not charge a fee for this Package itself. However,
|
||||
you may distribute this Package in aggregate with other (possibly
|
||||
commercial) programs as part of a larger (possibly commercial) software
|
||||
distribution provided that you do not advertise this Package as a
|
||||
product of your own. You may embed this Package's interpreter within
|
||||
an executable of yours (by linking); this shall be construed as a mere
|
||||
form of aggregation, provided that the complete Standard Version of the
|
||||
interpreter is so embedded.
|
||||
|
||||
6. The scripts and library files supplied as input to or produced as
|
||||
output from the programs of this Package do not automatically fall
|
||||
under the copyright of this Package, but belong to whoever generated
|
||||
them, and may be sold commercially, and may be aggregated with this
|
||||
Package. If such scripts or library files are aggregated with this
|
||||
Package via the so-called "undump" or "unexec" methods of producing a
|
||||
binary executable image, then distribution of such an image shall
|
||||
neither be construed as a distribution of this Package nor shall it
|
||||
fall under the restrictions of Paragraphs 3 and 4, provided that you do
|
||||
not represent such an executable image as a Standard Version of this
|
||||
Package.
|
||||
|
||||
7. C subroutines (or comparably compiled subroutines in other
|
||||
languages) supplied by you and linked into this Package in order to
|
||||
emulate subroutines and variables of the language defined by this
|
||||
Package shall not be considered part of this Package, but are the
|
||||
equivalent of input as in Paragraph 6, provided these subroutines do
|
||||
not change the language in any way that would cause it to fail the
|
||||
regression tests for the language.
|
||||
|
||||
8. Aggregation of this Package with a commercial distribution is always
|
||||
permitted provided that the use of this Package is embedded; that is,
|
||||
when no overt attempt is made to make this Package's interfaces visible
|
||||
to the end user of the commercial distribution. Such use shall not be
|
||||
construed as a distribution of this Package.
|
||||
|
||||
9. The name of the Copyright Holder may not be used to endorse or promote
|
||||
products derived from this software without specific prior written permission.
|
||||
|
||||
10. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR
|
||||
IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
|
||||
The End
|
35
engines/tika/src/main/resources/parsers/external/config/exiftool-parser.xml
vendored
Normal file
35
engines/tika/src/main/resources/parsers/external/config/exiftool-parser.xml
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<external-parsers>
|
||||
<parser>
|
||||
<check>
|
||||
<command>exiftool -ver</command>
|
||||
<error-codes>126,127</error-codes>
|
||||
</check>
|
||||
<command>env FOO=${OUTPUT} exiftool -args -G1 -sep "|||" ${INPUT}</command>
|
||||
<mime-types>
|
||||
<mime-type>image/x-raw-hasselblad</mime-type>
|
||||
<mime-type>image/x-raw-sony</mime-type>
|
||||
<mime-type>image/x-raw-canon</mime-type>
|
||||
<mime-type>image/x-raw-adobe</mime-type>
|
||||
<mime-type>image/gif</mime-type>
|
||||
<mime-type>image/jp2</mime-type>
|
||||
<mime-type>image/jpeg</mime-type>
|
||||
<mime-type>image/x-raw-kodak</mime-type>
|
||||
<mime-type>image/x-raw-minolta</mime-type>
|
||||
<mime-type>image/x-raw-nikon</mime-type>
|
||||
<mime-type>image/x-raw-olympus</mime-type>
|
||||
<mime-type>image/x-raw-pentax</mime-type>
|
||||
<mime-type>image/png</mime-type>
|
||||
<mime-type>image/x-raw-fuji</mime-type>
|
||||
<mime-type>image/x-raw-panasonic</mime-type>
|
||||
<mime-type>image/tiff</mime-type>
|
||||
<mime-type>image/webp</mime-type>
|
||||
</mime-types>
|
||||
<metadata>
|
||||
<!-- Default output-->
|
||||
<match>\s*([A-Za-z0-9/ \(\)]+\S{1})\s+:\s+([A-Za-z0-9\(\)\[\] \:\-\.]+)\s*</match>
|
||||
<!-- args format-->
|
||||
<match>^-([\S]+)\=(.*)</match>
|
||||
</metadata>
|
||||
</parser>
|
||||
</external-parsers>
|
BIN
engines/tika/src/main/resources/quick.pdf
Normal file
BIN
engines/tika/src/main/resources/quick.pdf
Normal file
Binary file not shown.
28
engines/tika/src/main/resources/templates/transformForm.html
Normal file
28
engines/tika/src/main/resources/templates/transformForm.html
Normal file
@@ -0,0 +1,28 @@
|
||||
<html xmlns:th="http://www.thymeleaf.org">
|
||||
<body>
|
||||
|
||||
<div>
|
||||
<h2>Tika Test Transformations</h2>
|
||||
<form method="POST" enctype="multipart/form-data" action="/transform">
|
||||
<table>
|
||||
<tr><td><div style="text-align:right">file *</div></td><td><input type="file" name="file" /></td></tr>
|
||||
<tr><td><div style="text-align:right">Direct Url</div></td><td><input type="text" name="directAccessUrl"/></td></tr>
|
||||
<tr><td><div style="text-align:right">sourceMimetype *</div></td><td><input type="text" name="sourceMimetype" value="application/msword" /></td></tr>
|
||||
<tr><td><div style="text-align:right">targetExtension *</div></td><td><input type="text" name="targetExtension" value="txt" /></td></tr>
|
||||
<tr><td><div style="text-align:right">targetMimetype *</div></td><td><input type="text" name="targetMimetype" value="text/plain" /></td></tr>
|
||||
<tr><td><div style="text-align:right">targetEncoding *</div></td><td><input type="text" name="targetEncoding" value="UTF-8" /></td></tr>
|
||||
<tr><td><div style="text-align:right">includeContents (archive) *</div></td><td><input type="checkbox" name="includeContents" value="true" /></td></tr>
|
||||
<tr><td><div style="text-align:right">timeout</div></td><td><input type="text" name="timeout" value="" /></td></tr>
|
||||
<tr><td><div style="text-align:right">testDelay</div></td><td><input type="text" name="testDelay" value="" /></td></tr>
|
||||
<tr><td><div style="text-align:right">notExtractBookmarksText</div></td><td><input type="checkbox" name="notExtractBookmarksText" value="true" /></td></tr>
|
||||
<tr><td></td><td><input type="submit" value="Transform" /></td></tr>
|
||||
</table>
|
||||
</form>
|
||||
</div>
|
||||
|
||||
<div>
|
||||
<a href="/log">Log entries</a>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
15
engines/tika/src/main/resources/tika-config.xml
Normal file
15
engines/tika/src/main/resources/tika-config.xml
Normal file
@@ -0,0 +1,15 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<properties>
|
||||
<!-- This property, when set, hides Tika's start-up warnings about missing libraries. -->
|
||||
<!-- See https://issues.apache.org/jira/browse/TIKA-2490 -->
|
||||
<service-loader initializableProblemHandler="ignore"/>
|
||||
|
||||
<parsers>
|
||||
<!-- ATS-816: Use the PackageParser for application/vnd.apple.keynote.13 as that was used in tika-1.21-20190624-alfresco-patched -->
|
||||
<parser class="org.apache.tika.parser.pkg.PackageParser">
|
||||
<mime>application/vnd.apple.keynote.13</mime>
|
||||
</parser>
|
||||
<!-- The DefaultParser must also be listed when the PackageParser is specified here; otherwise only the PackageParser is added. -->
|
||||
<parser class="org.apache.tika.parser.DefaultParser"/>
|
||||
</parsers>
|
||||
</properties>
|
1011
engines/tika/src/main/resources/tika_engine_config.json
Normal file
1011
engines/tika/src/main/resources/tika_engine_config.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,647 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika;
|
||||
|
||||
import org.alfresco.transform.base.AbstractTransformControllerTest;
|
||||
import org.alfresco.transform.base.TransformController;
|
||||
import org.alfresco.transform.base.executors.RuntimeExec;
|
||||
import org.alfresco.transform.base.model.FileRefEntity;
|
||||
import org.alfresco.transform.base.model.FileRefResponse;
|
||||
import org.alfresco.transform.base.probes.ProbeTestTransform;
|
||||
import org.alfresco.transform.client.model.TransformReply;
|
||||
import org.alfresco.transform.client.model.TransformRequest;
|
||||
import org.apache.poi.ooxml.POIXMLProperties;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.stubbing.Answer;
|
||||
import org.springframework.boot.test.autoconfigure.web.servlet.WebMvcTest;
|
||||
import org.springframework.core.io.FileSystemResource;
|
||||
import org.springframework.core.io.Resource;
|
||||
import org.springframework.http.HttpHeaders;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.mock.web.MockMultipartFile;
|
||||
import org.springframework.test.util.ReflectionTestUtils;
|
||||
import org.springframework.test.web.servlet.MvcResult;
|
||||
import org.springframework.test.web.servlet.request.MockHttpServletRequestBuilder;
|
||||
import org.springframework.test.web.servlet.request.MockMvcRequestBuilders;
|
||||
import org.springframework.test.web.servlet.result.MockMvcResultMatchers;
|
||||
|
||||
import javax.servlet.http.HttpServletRequest;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
|
||||
import static java.nio.file.Files.readAllBytes;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_METADATA_EMBED;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENXML_PRESENTATION;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENXML_SPREADSHEET;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENXML_WORDPROCESSING;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OUTLOOK_MSG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_PDF;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_CSV;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_WORD;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XHTML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_ZIP;
|
||||
import static org.alfresco.transform.common.RequestParamMap.ENDPOINT_TRANSFORM;
|
||||
import static org.alfresco.transform.common.RequestParamMap.INCLUDE_CONTENTS;
|
||||
import static org.alfresco.transform.common.RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.ARCHIVE;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.CSV;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.DOC;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.DOCX;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.HTML;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.MSG;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.OFFICE;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.OOXML;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.OUTLOOK_MSG;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.PDF;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.PDF_BOX;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.POI;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.PPTX;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.TEXT_MINING;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.TIKA_AUTO;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.TXT;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.XHTML;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.XML;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.XSLX;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.ZIP;
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.ArgumentMatchers.anyLong;
|
||||
import static org.mockito.Mockito.when;
|
||||
import static org.springframework.http.HttpHeaders.ACCEPT;
|
||||
import static org.springframework.http.HttpHeaders.CONTENT_DISPOSITION;
|
||||
import static org.springframework.http.HttpHeaders.CONTENT_TYPE;
|
||||
import static org.springframework.http.HttpStatus.CREATED;
|
||||
import static org.springframework.http.HttpStatus.INTERNAL_SERVER_ERROR;
|
||||
import static org.springframework.http.HttpStatus.OK;
|
||||
import static org.springframework.http.MediaType.APPLICATION_JSON_VALUE;
|
||||
import static org.springframework.http.MediaType.APPLICATION_PDF_VALUE;
|
||||
import static org.springframework.http.MediaType.TEXT_PLAIN_VALUE;
|
||||
import static org.springframework.util.StringUtils.getFilenameExtension;
|
||||
|
||||
/**
|
||||
* Test the TikaController without a server.
|
||||
* Super class includes tests for the TransformController.
|
||||
*/
|
||||
@WebMvcTest()
|
||||
public class TikaControllerTest extends AbstractTransformControllerTest
|
||||
{
|
||||
private static final String ENGINE_CONFIG_NAME = "tika_engine_config.json";
|
||||
private static final String EXPECTED_XHTML_CONTENT_CONTAINS = "<p>The quick brown fox jumps over the lazy dog</p>";
|
||||
private static final String EXPECTED_TEXT_CONTENT_CONTAINS = "The quick brown fox jumps over the lazy dog";
|
||||
private static final String EXPECTED_MSG_CONTENT_CONTAINS = "Recipients\n" +
|
||||
"\tmark.rogers@alfresco.com; speedy@quick.com; mrquick@nowhere.com\n" +
|
||||
"\n" +
|
||||
"The quick brown fox jumps over the lazy dogs";
|
||||
private static final String EXPECTED_CSV_CONTENT_CONTAINS = "\"The\",\"quick\",\"brown\",\"fox\"";
|
||||
|
||||
@Mock
|
||||
private RuntimeExec.ExecutionResult mockExecutionResult;
|
||||
|
||||
@Mock
|
||||
private RuntimeExec mockTransformCommand;
|
||||
|
||||
@Mock
|
||||
private RuntimeExec mockCheckCommand;
|
||||
|
||||
private String targetEncoding = "UTF-8";
|
||||
private String targetMimetype = MIMETYPE_TEXT_PLAIN;
|
||||
|
||||
@BeforeEach
|
||||
public void before()
|
||||
{
|
||||
sourceExtension = "pdf";
|
||||
targetExtension = "txt";
|
||||
sourceMimetype = MIMETYPE_PDF;
|
||||
targetMimetype = MIMETYPE_TEXT_PLAIN;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getEngineConfigName()
|
||||
{
|
||||
return ENGINE_CONFIG_NAME;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void mockTransformCommand(String sourceExtension,
|
||||
String targetExtension, String sourceMimetype,
|
||||
boolean readTargetFileBytes) throws IOException
|
||||
{
|
||||
this.sourceExtension = sourceExtension;
|
||||
this.targetExtension = targetExtension;
|
||||
this.sourceMimetype = sourceMimetype;
|
||||
|
||||
expectedOptions = null;
|
||||
expectedSourceSuffix = null;
|
||||
expectedSourceFileBytes = readTestFile(sourceExtension);
|
||||
expectedTargetFileBytes = readTargetFileBytes ? readTestFile(targetExtension) : null;
|
||||
sourceFile = new MockMultipartFile("file", "quick." + sourceExtension, sourceMimetype,
|
||||
expectedSourceFileBytes);
|
||||
|
||||
when(mockTransformCommand.execute(any(), anyLong())).thenAnswer(
|
||||
(Answer<RuntimeExec.ExecutionResult>) invocation -> {
|
||||
Map<String, String> actualProperties = invocation.getArgument(0);
|
||||
assertEquals(3, actualProperties.size(),"There should be 3 properties");
|
||||
|
||||
String actualOptions = actualProperties.get("options");
|
||||
String actualSource = actualProperties.get("source");
|
||||
String actualTarget = actualProperties.get("target");
|
||||
String actualTargetExtension = getFilenameExtension(actualTarget);
|
||||
|
||||
assertNotNull(actualSource);
|
||||
assertNotNull(actualTarget);
|
||||
if (expectedSourceSuffix != null)
|
||||
{
|
||||
assertTrue(actualSource.endsWith(expectedSourceSuffix),
|
||||
"The source file \"" + actualSource + "\" should have ended in \"" + expectedSourceSuffix + "\"");
|
||||
actualSource = actualSource.substring(0,
|
||||
actualSource.length() - expectedSourceSuffix.length());
|
||||
}
|
||||
|
||||
assertNotNull(actualOptions);
|
||||
if (expectedOptions != null)
|
||||
{
|
||||
Assertions.assertEquals(expectedOptions, actualOptions, "expectedOptions");
|
||||
}
|
||||
|
||||
Long actualTimeout = invocation.getArgument(1);
|
||||
assertNotNull(actualTimeout);
|
||||
if (expectedTimeout != null)
|
||||
{
|
||||
Assertions.assertEquals(expectedTimeout, actualTimeout, "expectedTimeout");
|
||||
}
|
||||
|
||||
// Copy a test file into the target file location if it exists
|
||||
int i = actualTarget.lastIndexOf('_');
|
||||
if (i >= 0)
|
||||
{
|
||||
String testFilename = actualTarget.substring(i + 1);
|
||||
File testFile = getTestFile(testFilename, false);
|
||||
File targetFile = new File(actualTarget);
|
||||
generateTargetFileFromResourceFile(actualTargetExtension, testFile,
|
||||
targetFile);
|
||||
}
|
||||
|
||||
// Check the supplied source file has not been changed.
|
||||
byte[] actualSourceFileBytes = readAllBytes(new File(actualSource).toPath());
|
||||
Assertions.assertArrayEquals(expectedSourceFileBytes, actualSourceFileBytes,
|
||||
"Source file is not the same");
|
||||
|
||||
return mockExecutionResult;
|
||||
});
|
||||
|
||||
when(mockExecutionResult.getExitValue()).thenReturn(0);
|
||||
when(mockExecutionResult.getStdErr()).thenReturn("STDERROR");
|
||||
when(mockExecutionResult.getStdOut()).thenReturn("STDOUT");
|
||||
}
|
||||
|
||||
private void transform(String transform, String sourceExtension, String targetExtension,
|
||||
String sourceMimetype, String targetMimetype,
|
||||
Boolean includeContents, String expectedContentContains) throws Exception
|
||||
{
|
||||
// We don't use targetFileBytes as some of the transforms contain different date text based on the os being used.
|
||||
mockTransformCommand(sourceExtension, targetExtension, sourceMimetype, false);
|
||||
this.targetMimetype = targetMimetype;
|
||||
|
||||
System.out.println("Test " + transform + " " + sourceExtension + " to " + targetExtension);
|
||||
MockHttpServletRequestBuilder requestBuilder = includeContents == null
|
||||
? mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile,
|
||||
"targetExtension", this.targetExtension)
|
||||
: mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile,
|
||||
"targetExtension", this.targetExtension, INCLUDE_CONTENTS, includeContents.toString());
|
||||
MvcResult result = mockMvc.perform(requestBuilder)
|
||||
.andExpect(MockMvcResultMatchers.status().is(OK.value()))
|
||||
.andExpect(MockMvcResultMatchers.header().string("Content-Disposition",
|
||||
"attachment; filename*= UTF-8''quick." + this.targetExtension)).
|
||||
andReturn();
|
||||
String content = result.getResponse().getContentAsString();
|
||||
assertTrue(content.contains(expectedContentContains),
|
||||
"The content did not include \"" + expectedContentContains);
|
||||
}
|
||||
|
||||
@Override
|
||||
// Add extra required parameters to the request.
|
||||
protected MockHttpServletRequestBuilder mockMvcRequest(String url, MockMultipartFile sourceFile,
|
||||
String... params)
|
||||
{
|
||||
return super.mockMvcRequest(url, sourceFile, params)
|
||||
.param("targetEncoding", targetEncoding)
|
||||
.param("targetMimetype", targetMimetype)
|
||||
.param("sourceMimetype", sourceMimetype);
|
||||
}
|
||||
|
||||
@Mock
|
||||
HttpServletRequest httpServletRequest;
|
||||
|
||||
@Test
|
||||
public void testImmutableEmptyMap()
|
||||
{
|
||||
// See ACS-373
|
||||
TransformController controller = getController();
|
||||
ProbeTestTransform probeTestTransform = getProbeTestTransform();
|
||||
ReflectionTestUtils.setField(probeTestTransform, "livenessTransformEnabled", true);
|
||||
probeTestTransform.doTransformOrNothing(httpServletRequest, true, controller);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void simpleTransformTest() throws Exception
|
||||
{
|
||||
mockTransformCommand(PDF, TXT, MIMETYPE_PDF, true);
|
||||
super.simpleTransformTest();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void testDelayTest() throws Exception
|
||||
{
|
||||
mockTransformCommand(PDF, TXT, MIMETYPE_PDF, true);
|
||||
super.testDelayTest();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void noTargetFileTest()
|
||||
{
|
||||
// Ignore the test in super class as the Tika transforms are real rather than mocked up.
|
||||
// It is the mock that returns a zero length file for other transformers, when we supply an invalid targetExtension.
|
||||
}
|
||||
|
||||
// --- Super class tests (need modified setup) ---
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void dotDotSourceFilenameTest() throws Exception
|
||||
{
|
||||
mockTransformCommand(PDF, TXT, MIMETYPE_PDF, true);
|
||||
super.dotDotSourceFilenameTest();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void noExtensionSourceFilenameTest() throws Exception
|
||||
{
|
||||
mockTransformCommand(PDF, TXT, MIMETYPE_PDF, true);
|
||||
super.noExtensionSourceFilenameTest();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void badSourceFilenameTest() throws Exception
|
||||
{
|
||||
mockTransformCommand(PDF, TXT, MIMETYPE_PDF, true);
|
||||
super.badSourceFilenameTest();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void blankSourceFilenameTest() throws Exception
|
||||
{
|
||||
mockTransformCommand(PDF, TXT, MIMETYPE_PDF, true);
|
||||
super.blankSourceFilenameTest();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void noTargetExtensionTest() throws Exception
|
||||
{
|
||||
mockTransformCommand(PDF, TXT, MIMETYPE_PDF, true);
|
||||
super.noTargetExtensionTest();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void calculateMaxTime() throws Exception
|
||||
{
|
||||
mockTransformCommand(PDF, TXT, MIMETYPE_PDF, true);
|
||||
super.calculateMaxTime();
|
||||
}
|
||||
|
||||
// --- General Tika tests ---
|
||||
|
||||
@Test
|
||||
public void badEncodingTest() throws Exception
|
||||
{
|
||||
mockTransformCommand(PDF, TXT, MIMETYPE_PDF, true);
|
||||
targetEncoding = "rubbish";
|
||||
mockMvc.perform(
|
||||
mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile, "targetExtension", targetExtension))
|
||||
.andExpect(MockMvcResultMatchers.status().is(INTERNAL_SERVER_ERROR.value()));
|
||||
}
|
||||
|
||||
// --- Archive ---
|
||||
|
||||
@Test
|
||||
public void zipToTextArchiveTest() throws Exception
|
||||
{
|
||||
transform(ARCHIVE, ZIP, TXT, MIMETYPE_ZIP, MIMETYPE_TEXT_PLAIN, false,
|
||||
"quick.html\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"quick.pdf\n" +
|
||||
"\n" +
|
||||
"\n");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void zipToTextIncludeArchiveTest() throws Exception
|
||||
{
|
||||
transform(ARCHIVE, ZIP, TXT, MIMETYPE_ZIP, MIMETYPE_TEXT_PLAIN, true,
|
||||
"quick.html\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"The quick brown fox jumps over the lazy dog\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"quick.pdf\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"The quick brown fox jumps over the lazy dog" +
|
||||
"\n" +
|
||||
"\n");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void zipToTextExcludeArchiveTest() throws Exception
|
||||
{
|
||||
transform(ARCHIVE, ZIP, TXT, MIMETYPE_ZIP, MIMETYPE_TEXT_PLAIN,
|
||||
false, "\n" +
|
||||
"folder/subfolder/quick.jpg\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"quick.doc\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"quick.html\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"quick.pdf\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"quick.txt\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"quick.xml\n" +
|
||||
"\n");
|
||||
}
|
||||
|
||||
// --- OutlookMsg ---
|
||||
|
||||
@Test
|
||||
public void msgToTxtOutlookMsgTest() throws Exception
|
||||
{
|
||||
transform(OUTLOOK_MSG, MSG, TXT, MIMETYPE_OUTLOOK_MSG, MIMETYPE_TEXT_PLAIN, null,
|
||||
EXPECTED_MSG_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
// --- PdfBox ---
|
||||
|
||||
@Test
|
||||
public void pdfToTxtPdfBoxTest() throws Exception
|
||||
{
|
||||
transform(PDF_BOX, PDF, TXT, MIMETYPE_PDF, MIMETYPE_TEXT_PLAIN, null,
|
||||
EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void pdfToCsvPdfBoxTest() throws Exception
|
||||
{
|
||||
transform(PDF_BOX, PDF, CSV, MIMETYPE_PDF, MIMETYPE_TEXT_CSV, null,
|
||||
EXPECTED_TEXT_CONTENT_CONTAINS); // Yes it is just text
|
||||
}
|
||||
|
||||
@Test
|
||||
public void pdfToXmlPdfBoxTest() throws Exception
|
||||
{
|
||||
transform(PDF_BOX, PDF, XML, MIMETYPE_PDF, MIMETYPE_XML, null,
|
||||
EXPECTED_XHTML_CONTENT_CONTAINS); // Yes it is just XHTML
|
||||
}
|
||||
|
||||
@Test
|
||||
public void pdfToXhtmlPdfBoxTest() throws Exception
|
||||
{
|
||||
transform(PDF_BOX, PDF, XHTML, MIMETYPE_PDF, MIMETYPE_XHTML, null,
|
||||
EXPECTED_XHTML_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void pdfToHtmlPdfBoxTest() throws Exception
|
||||
{
|
||||
transform(PDF_BOX, PDF, HTML, MIMETYPE_PDF, MIMETYPE_HTML, null,
|
||||
EXPECTED_XHTML_CONTENT_CONTAINS); // Yes it is just XHTML
|
||||
}
|
||||
|
||||
// --- Office ---
|
||||
|
||||
@Test
|
||||
public void msgToTxtOfficeTest() throws Exception
|
||||
{
|
||||
transform(OFFICE, MSG, TXT, MIMETYPE_OUTLOOK_MSG, MIMETYPE_TEXT_PLAIN, null,
|
||||
EXPECTED_MSG_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void docToTxtOfficeTest() throws Exception
|
||||
{
|
||||
transform(OFFICE, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null,
|
||||
EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
// --- Poi ---
|
||||
|
||||
@Test
|
||||
public void xslxToCsvPoiTest() throws Exception
|
||||
{
|
||||
transform(POI, XSLX, CSV, MIMETYPE_OPENXML_SPREADSHEET, MIMETYPE_TEXT_CSV, null,
|
||||
EXPECTED_CSV_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
// --- OOXML ---
|
||||
|
||||
@Test
|
||||
public void docxToTxtOoXmlTest() throws Exception
|
||||
{
|
||||
transform(OOXML, DOCX, TXT, MIMETYPE_OPENXML_WORDPROCESSING, MIMETYPE_TEXT_PLAIN, null,
|
||||
EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void pptxToTxtOoXmlTest() throws Exception
|
||||
{
|
||||
transform(OOXML, PPTX, TXT, MIMETYPE_OPENXML_PRESENTATION, MIMETYPE_TEXT_PLAIN, null,
|
||||
EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
// --- TikaAuto ---
|
||||
|
||||
@Test
|
||||
public void ppxtToTxtTikaAutoTest() throws Exception
|
||||
{
|
||||
transform(TIKA_AUTO, PPTX, TXT, MIMETYPE_OPENXML_PRESENTATION, MIMETYPE_TEXT_PLAIN, null,
|
||||
EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void doctToTxtTikaAutoTest() throws Exception
|
||||
{
|
||||
transform(TIKA_AUTO, DOCX, TXT, MIMETYPE_OPENXML_WORDPROCESSING, MIMETYPE_TEXT_PLAIN, null,
|
||||
EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
// --- TextMining ---
|
||||
|
||||
@Test
|
||||
public void docToTxtTextMiningTest() throws Exception
|
||||
{
|
||||
transform(TEXT_MINING, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null,
|
||||
EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xlsxEmbedTest() throws Exception
|
||||
{
|
||||
mockTransformCommand(XSLX, XSLX, MIMETYPE_OPENXML_SPREADSHEET, false);
|
||||
|
||||
String metadata =
|
||||
"{\"{http://www.alfresco.org/model/content/1.0}author\":\"author1\"," +
|
||||
"\"{http://www.alfresco.org/model/content/1.0}title\":\"title1\"," +
|
||||
"\"{http://www.alfresco.org/model/content/1.0}description\":[\"desc1\",\"desc2\"]," +
|
||||
"\"{http://www.alfresco.org/model/content/1.0}created\":\"created1\"}";
|
||||
|
||||
MockHttpServletRequestBuilder requestBuilder =
|
||||
super.mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile,
|
||||
"targetExtension", XSLX,
|
||||
"metadata", metadata,
|
||||
"targetMimetype", MIMETYPE_METADATA_EMBED,
|
||||
"sourceMimetype", MIMETYPE_OPENXML_SPREADSHEET);
|
||||
|
||||
MvcResult result = mockMvc.perform(requestBuilder)
|
||||
.andExpect(MockMvcResultMatchers.status().is(OK.value()))
|
||||
.andExpect(MockMvcResultMatchers.header().string("Content-Disposition",
|
||||
"attachment; filename*= UTF-8''quick." + targetExtension)).
|
||||
andReturn();
|
||||
|
||||
byte[] bytes = result.getResponse().getContentAsByteArray();
|
||||
ByteArrayInputStream inputStream = new ByteArrayInputStream(bytes);
|
||||
XSSFWorkbook workbook = new XSSFWorkbook(inputStream);
|
||||
POIXMLProperties props = workbook.getProperties();
|
||||
POIXMLProperties.CoreProperties coreProp = props.getCoreProperties();
|
||||
POIXMLProperties.CustomProperties custProp = props.getCustomProperties();
|
||||
|
||||
assertEquals("author1", coreProp.getCreator());
|
||||
assertEquals("title1", coreProp.getTitle());
|
||||
assertEquals("desc1, desc2", coreProp.getDescription()); // multi value
|
||||
assertEquals("created1", custProp.getProperty("created").getLpwstr());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void pdfToTxtExtractBookmarksTest() throws Exception
|
||||
{
|
||||
mockTransformCommand(PDF, TXT, MIMETYPE_PDF, true);
|
||||
mockMvc.perform(
|
||||
mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile, "targetExtension", targetExtension).param(
|
||||
NOT_EXTRACT_BOOKMARKS_TEXT, "true"))
|
||||
.andExpect(MockMvcResultMatchers.status().is(OK.value()))
|
||||
.andExpect(MockMvcResultMatchers.header().string("Content-Disposition",
|
||||
"attachment; filename*= UTF-8''quick." + targetExtension));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void updateTransformRequestWithSpecificOptions(TransformRequest transformRequest)
|
||||
{
|
||||
transformRequest.setSourceExtension(sourceExtension);
|
||||
transformRequest.setTargetExtension(targetExtension);
|
||||
transformRequest.setSourceMediaType(APPLICATION_PDF_VALUE);
|
||||
transformRequest.setTargetMediaType(TEXT_PLAIN_VALUE);
|
||||
transformRequest.getTransformRequestOptions().put("targetEncoding", "UTF-8");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPojoTransform() throws Exception
|
||||
{
|
||||
// Files
|
||||
String sourceFileRef = UUID.randomUUID().toString();
|
||||
File sourceFile = getTestFile("quick." + sourceExtension, true);
|
||||
String targetFileRef = UUID.randomUUID().toString();
|
||||
|
||||
TransformRequest transformRequest = createTransformRequest(sourceFileRef, sourceFile);
|
||||
|
||||
// HTTP Request
|
||||
HttpHeaders headers = new HttpHeaders();
|
||||
headers.set(CONTENT_DISPOSITION, "attachment; filename=quick." + sourceExtension);
|
||||
ResponseEntity<Resource> response = new ResponseEntity<>(new FileSystemResource(
|
||||
sourceFile), headers, OK);
|
||||
|
||||
when(alfrescoSharedFileStoreClient.retrieveFile(sourceFileRef)).thenReturn(response);
|
||||
when(alfrescoSharedFileStoreClient.saveFile(any()))
|
||||
.thenReturn(new FileRefResponse(new FileRefEntity(targetFileRef)));
|
||||
when(mockExecutionResult.getExitValue()).thenReturn(0);
|
||||
|
||||
// Update the Transformation Request with any specific params before sending it
|
||||
updateTransformRequestWithSpecificOptions(transformRequest);
|
||||
|
||||
// Serialize and call the transformer
|
||||
String tr = objectMapper.writeValueAsString(transformRequest);
|
||||
String transformationReplyAsString = mockMvc
|
||||
.perform(MockMvcRequestBuilders
|
||||
.post(ENDPOINT_TRANSFORM)
|
||||
.header(ACCEPT, APPLICATION_JSON_VALUE)
|
||||
.header(CONTENT_TYPE, APPLICATION_JSON_VALUE)
|
||||
.content(tr))
|
||||
.andExpect(MockMvcResultMatchers.status().is(CREATED.value()))
|
||||
.andReturn().getResponse().getContentAsString();
|
||||
|
||||
TransformReply transformReply = objectMapper.readValue(transformationReplyAsString,
|
||||
TransformReply.class);
|
||||
|
||||
// Assert the reply
|
||||
assertEquals(transformRequest.getRequestId(), transformReply.getRequestId());
|
||||
assertEquals(transformRequest.getClientData(), transformReply.getClientData());
|
||||
assertEquals(transformRequest.getSchema(), transformReply.getSchema());
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void httpTransformRequestUsingDirectAccessUrlTest() throws Exception
|
||||
{
|
||||
expectedTargetFileBytes = readTestFile(targetExtension);
|
||||
super.httpTransformRequestUsingDirectAccessUrlTest();
|
||||
}
|
||||
}
|
@@ -0,0 +1,69 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika;
|
||||
|
||||
import org.alfresco.transform.base.AbstractHttpRequestTest;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.boot.test.context.SpringBootTest.WebEnvironment;
|
||||
import org.springframework.util.LinkedMultiValueMap;
|
||||
|
||||
/**
|
||||
* Tests TikaController with a server test harness.
|
||||
*/
|
||||
@SpringBootTest(webEnvironment = WebEnvironment.RANDOM_PORT)
|
||||
public class TikaHttpRequestTest extends AbstractHttpRequestTest
|
||||
{
|
||||
@Override
|
||||
protected String getTransformerName()
|
||||
{
|
||||
return "Tika";
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getSourceExtension()
|
||||
{
|
||||
return "pdf";
|
||||
}
|
||||
|
||||
// Override method as Tika requires sourceMimetype
|
||||
// If not provided then sourceMimetype request parameter error will be thrown.
|
||||
@Override
|
||||
protected void assertTransformError(boolean addFile,
|
||||
String errorMessage,
|
||||
LinkedMultiValueMap<String, Object> additionalParams)
|
||||
{
|
||||
LinkedMultiValueMap<String, Object> parameters = new LinkedMultiValueMap<>();
|
||||
parameters.add("sourceMimetype", "application/pdf");
|
||||
|
||||
if (additionalParams != null)
|
||||
{
|
||||
parameters.addAll(additionalParams);
|
||||
}
|
||||
|
||||
super.assertTransformError(addFile, errorMessage, parameters);
|
||||
}
|
||||
}
|
@@ -0,0 +1,577 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika;
|
||||
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_APP_DWG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OUTLOOK_MSG;
|
||||
import static org.alfresco.transform.base.TestFileInfo.testFile;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_AUDIO_MP4;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_EXCEL;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_BMP;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_GIF;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_PNG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_TIFF;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IWORK_KEYNOTE;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IWORK_NUMBERS;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IWORK_PAGES;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_MP3;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENDOCUMENT_FORMULA;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENDOCUMENT_GRAPHICS;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENDOCUMENT_GRAPHICS_TEMPLATE;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENDOCUMENT_PRESENTATION;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENDOCUMENT_PRESENTATION_TEMPLATE;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENDOCUMENT_SPREADSHEET;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENDOCUMENT_SPREADSHEET_TEMPLATE;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENDOCUMENT_TEXT;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENDOCUMENT_TEXT_TEMPLATE;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENOFFICE1_WRITER;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENXML_PRESENTATION;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENXML_SPREADSHEET;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENXML_WORDPROCESSING;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_PDF;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_PPT;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_VIDEO_3GP;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_VIDEO_3GP2;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_VIDEO_FLV;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_VIDEO_MP4;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_VIDEO_QUICKTIME;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_VISIO;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_VORBIS;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_WORD;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_ZIP;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_RAW_RAF;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_RAW_ARW;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_RAW_CR2;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_RAW_RW2;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_RAW_NEF;
|
||||
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.alfresco.transform.base.AbstractMetadataExtractsIT;
|
||||
import org.alfresco.transform.base.TestFileInfo;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
|
||||
/**
|
||||
* Metadata integration tests in the Tika T-Engine.
|
||||
*
|
||||
* @author adavis
|
||||
* @author dedwards
|
||||
*/
|
||||
public class TikaMetadataExtractsIT extends AbstractMetadataExtractsIT
|
||||
{
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("engineTransformations")
|
||||
@Override
|
||||
public void testTransformation(TestFileInfo testFileInfo)
|
||||
{
|
||||
super.testTransformation(testFileInfo);
|
||||
}
|
||||
|
||||
private static Stream<TestFileInfo> engineTransformations()
|
||||
{
|
||||
// The following files are the ones tested in the content repository.
|
||||
// There are many more mimetypes supported by these extractors.
|
||||
|
||||
// Where a line has been commented out, the repository code tries to test it but stops because there is
|
||||
// either no quick file or the target extension has not been registered.
|
||||
|
||||
return Stream.of(
|
||||
//IPTCMetadataExtractor
|
||||
TestFileInfo.testFile(MIMETYPE_IMAGE_JPEG, "jpg", "quick.jpg"),
|
||||
TestFileInfo.testFile(MIMETYPE_IMAGE_JPEG, "jpg", "quickIPTC-EXT.jpg"),
|
||||
TestFileInfo.testFile(MIMETYPE_IMAGE_JPEG, "jpg", "quickIPTC-multi-creator.jpg"),
|
||||
TestFileInfo.testFile(MIMETYPE_IMAGE_JPEG, "jpg", "testJPEG_IPTC_EXT.jpg"),
|
||||
TestFileInfo.testFile(MIMETYPE_IMAGE_GIF, "gif", "quickIPTC.gif"),
|
||||
TestFileInfo.testFile(MIMETYPE_IMAGE_PNG, "png", "quickIPTC.png"),
|
||||
TestFileInfo.testFile(MIMETYPE_IMAGE_RAW_ARW, "arw", "20140614_163822_Photogrpahy_Class.ARW"),
|
||||
TestFileInfo.testFile(MIMETYPE_IMAGE_RAW_CR2, "cr2", "20141227_134519_Palace.CR2"),
|
||||
TestFileInfo.testFile(MIMETYPE_IMAGE_RAW_RW2, "rw2", "20140629_145035_Flower.RW2"),
|
||||
TestFileInfo.testFile(MIMETYPE_IMAGE_RAW_NEF, "nef", "20150408_074941_Bush.NEF"),
|
||||
TestFileInfo.testFile(MIMETYPE_IMAGE_RAW_RAF, "raf", "20160502_190928_London_Underground.RAF"),
|
||||
|
||||
// DWGMetadataExtractor
|
||||
TestFileInfo.testFile(MIMETYPE_APP_DWG, "dwg", "quick2010CustomProps.dwg"),
|
||||
|
||||
// MailMetadataExtractor
|
||||
TestFileInfo.testFile(MIMETYPE_OUTLOOK_MSG, "msg", "quick.msg"),
|
||||
|
||||
// MP3MetadataExtractor
|
||||
TestFileInfo.testFile(MIMETYPE_MP3, "mp3", "quick.mp3"),
|
||||
|
||||
// OfficeMetadataExtractor
|
||||
TestFileInfo.testFile(MIMETYPE_WORD, "doc", "quick.doc"),
|
||||
//testFile("application/x-tika-msoffice-embedded; format=ole10_native", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_VISIO, "vsd", "quick.vsd"),
|
||||
//testFile("application/vnd.ms-project", "mpp", ""),
|
||||
//testFile("application/x-tika-msworks-spreadsheet", "", ""),
|
||||
//testFile("application/x-mspublisher", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_PPT, "ppt", "quick.ppt"),
|
||||
//testFile("application/x-tika-msoffice", "", ""),
|
||||
//testFile(MIMETYPE_VISIO_2013, "vsdx", ""),
|
||||
//testFile("application/sldworks", "", ""),
|
||||
//testFile(MIMETYPE_ENCRYPTED_OFFICE, "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_EXCEL, "xls", "quick.xls"),
|
||||
|
||||
// OpenDocumentMetadataExtractor
|
||||
//testFile("application/x-vnd.oasis.opendocument.presentation", "", ""),
|
||||
//testFile(MIMETYPE_OPENDOCUMENT_CHART, "odc", ""),
|
||||
//testFile(MIMETYPE_OPENDOCUMENT_IMAGE_TEMPLATE, "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.text-web", "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.image", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENDOCUMENT_GRAPHICS_TEMPLATE, "otg", "quick.otg"),
|
||||
//testFile(MIMETYPE_OPENDOCUMENT_TEXT_WEB, "oth", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.spreadsheet-template", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENDOCUMENT_SPREADSHEET_TEMPLATE, "ots", "quick.ots"),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENOFFICE1_WRITER, "sxw", "quick.sxw"),
|
||||
//testFile("application/x-vnd.oasis.opendocument.graphics-template", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENDOCUMENT_GRAPHICS, "odg", "quick.odg"),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENDOCUMENT_SPREADSHEET, "ods", "quick.ods"),
|
||||
//testFile("application/x-vnd.oasis.opendocument.chart", "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.spreadsheet", "", ""),
|
||||
//testFile(MIMETYPE_OPENDOCUMENT_IMAGE, "odi", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.text", "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.text-template", "", ""),
|
||||
//testFile("application/vnd.oasis.opendocument.formula-template", "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.formula", "", ""),
|
||||
//testFile("application/vnd.oasis.opendocument.image-template", "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.image-template", "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.presentation-template", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENDOCUMENT_PRESENTATION_TEMPLATE, "otp", "quick.otp"),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENDOCUMENT_TEXT, "odt", "quick.odt"),
|
||||
//testFile(MIMETYPE_OPENDOCUMENT_FORMULA_TEMPLATE, "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENDOCUMENT_TEXT_TEMPLATE, "ott", "quick.ott"),
|
||||
//testFile("application/vnd.oasis.opendocument.chart-template", "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.chart-template", "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.formula-template", "", ""),
|
||||
//testFile(MIMETYPE_OPENDOCUMENT_DATABASE, "odb", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.text-master", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENDOCUMENT_PRESENTATION, "odp", "quick.odp"),
|
||||
//testFile(MIMETYPE_OPENDOCUMENT_CHART_TEMPLATE, "", ""),
|
||||
//testFile("application/x-vnd.oasis.opendocument.graphics", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENDOCUMENT_FORMULA, "odf", "quick.odf"),
|
||||
//testFile(MIMETYPE_OPENDOCUMENT_TEXT_MASTER, "odm", ""),
|
||||
|
||||
// PdfBoxMetadataExtractor
|
||||
TestFileInfo.testFile(MIMETYPE_PDF, "pdf", "quick.pdf"),
|
||||
//testFile(MIMETYPE_APPLICATION_ILLUSTRATOR, "ai", ""),
|
||||
|
||||
// PoiMetadataExtractor
|
||||
//testFile(MIMETYPE_OPENXML_PRESENTATION_TEMPLATE_MACRO, "potm", ""),
|
||||
//testFile(MIMETYPE_OPENXML_SPREADSHEET_ADDIN_MACRO, "xlam", ""),
|
||||
//testFile(MIMETYPE_OPENXML_WORD_TEMPLATE, "dotx", ""),
|
||||
//testFile(MIMETYPE_OPENXML_SPREADSHEET_BINARY_MACRO, "xlsb", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENXML_WORDPROCESSING, "docx", "quick.docx"),
|
||||
//testFile(MIMETYPE_OPENXML_PRESENTATION_SLIDE_MACRO, "sldm", ""),
|
||||
//testFile("application/vnd.ms-visio.drawing", "", ""),
|
||||
//testFile(MIMETYPE_OPENXML_PRESENTATION_SLIDESHOW_MACRO, "ppsm", ""),
|
||||
//testFile(MIMETYPE_OPENXML_PRESENTATION_MACRO, "pptm", ""),
|
||||
//testFile(MIMETYPE_OPENXML_PRESENTATION_SLIDE, "sldx", ""),
|
||||
//testFile(MIMETYPE_OPENXML_SPREADSHEET_MACRO, "xlsm", ""),
|
||||
//testFile(MIMETYPE_OPENXML_WORD_TEMPLATE_MACRO, "dotm", ""),
|
||||
//testFile(MIMETYPE_OPENXML_WORDPROCESSING_MACRO, "docm", ""),
|
||||
//testFile(MIMETYPE_OPENXML_PRESENTATION_ADDIN, "ppam", ""),
|
||||
//testFile(MIMETYPE_OPENXML_SPREADSHEET_TEMPLATE, "xltx", ""),
|
||||
//testFile("application/vnd.ms-xpsdocument", "", ""),
|
||||
//testFile("application/vnd.ms-visio.drawing.macroenabled.12", "", ""),
|
||||
//testFile("application/vnd.ms-visio.template.macroenabled.12", "", ""),
|
||||
//testFile("model/vnd.dwfx+xps", "", ""),
|
||||
//testFile(MIMETYPE_OPENXML_PRESENTATION_TEMPLATE, "potx", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENXML_PRESENTATION, "pptx", "quick.pptx"),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENXML_SPREADSHEET, "xlsx", "quick.xlsx"),
|
||||
//testFile("application/vnd.ms-visio.stencil", "", ""),
|
||||
//testFile("application/vnd.ms-visio.template", "", ""),
|
||||
//testFile(MIMETYPE_OPENXML_PRESENTATION_SLIDESHOW, "ppsx", ""),
|
||||
//testFile("application/vnd.ms-visio.stencil.macroenabled.12", "", ""),
|
||||
//testFile(MIMETYPE_OPENXML_SPREADSHEET_TEMPLATE_MACRO, "xltm", ""),
|
||||
|
||||
// TikaAudioMetadataExtractor
|
||||
TestFileInfo.testFile("video/x-m4v", "m4v", "quick.m4v"),
|
||||
//testFile("audio/x-oggflac", "", ""),
|
||||
//testFile("application/mp4", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_VORBIS, "ogg", "quick.ogg"),
|
||||
TestFileInfo.testFile(MIMETYPE_VIDEO_3GP, "3gp", "quick.3gp"),
|
||||
//testFile(MIMETYPE_FLAC, "flac", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_VIDEO_3GP2, "3g2", "quick.3g2"),
|
||||
TestFileInfo.testFile(MIMETYPE_VIDEO_QUICKTIME, "mov", "quick.mov"),
|
||||
TestFileInfo.testFile(MIMETYPE_AUDIO_MP4, "m4a", "quick.m4a"),
|
||||
TestFileInfo.testFile(MIMETYPE_VIDEO_MP4, "mp4", "quick.mp4"),
|
||||
|
||||
// TikaAutoMetadataExtractor
|
||||
|
||||
// The following <source>_metadata.json files contain null values against author and title.
|
||||
// This is not new and will be the case in the content repository, but was not tested.
|
||||
//
|
||||
// The expected ones are: txt, xml, zip, tar
|
||||
//
|
||||
// The unexpected ones are: quick.key, quick.numbers and quick.pages.
|
||||
//
|
||||
// quick.bmp, quick.gif, quick.png, quick.3g2, quick.3gp, quick.flv, quick.m4v, quick.mov & quick.mp4
|
||||
// contain one or more values, but also include nulls. Again this may be correct, a bug or just the
|
||||
// example quick file rather than a problem with the extractor.
|
||||
|
||||
//testFile("application/vnd.ms-htmlhelp", "", ""),
|
||||
//testFile(MIMETYPE_ATOM, "", ""),
|
||||
//testFile("audio/midi", "", ""),
|
||||
//testFile("application/aaigrid", "", ""),
|
||||
//testFile("application/x-bag", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_IWORK_KEYNOTE, "key", "quick.key"),
|
||||
//testFile("application/x-quattro-pro; version=9", "", ""),
|
||||
//testFile("application/x-ibooks+zip", "", ""),
|
||||
//testFile("audio/wave", "", ""),
|
||||
//testFile("application/x-midi", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_XML, "xml", "quick.xml"),
|
||||
//testFile(MIMETYPE_RSS, "rss", ""),
|
||||
//testFile("application/x-netcdf", "cdf", ""),
|
||||
//testFile("video/x-daala", "", ""),
|
||||
//testFile("application/matlab-mat", "", ""),
|
||||
//testFile("audio/aiff", "", ""),
|
||||
//testFile("application/jaxa-pal-sar", "", ""),
|
||||
//testFile("image/x-pcraster", "", ""),
|
||||
//testFile("image/arg", "", ""),
|
||||
//testFile("application/x-kro", "", ""),
|
||||
//testFile("image/x-hdf5-image", "", ""),
|
||||
//testFile("audio/speex", "", ""),
|
||||
//testFile("image/big-gif", "", ""),
|
||||
//testFile("application/zlib", "", ""),
|
||||
//testFile("application/x-cosar", "", ""),
|
||||
//testFile("application/x-ntv2", "", ""),
|
||||
//testFile("application/x-archive", "", ""),
|
||||
//testFile("application/java-archive", "jar", ""),
|
||||
//testFile("application/x-vnd.sun.xml.writer", "", ""),
|
||||
//testFile("application/x-gmt", "", ""),
|
||||
//testFile("application/x-xml", "", ""),
|
||||
//testFile("application/gzip-compressed", "", ""),
|
||||
//testFile("image/ida", "", ""),
|
||||
//testFile("text/x-groovy", "", ""),
|
||||
//testFile("image/x-emf", "", ""),
|
||||
//testFile("application/x-rar", "", ""),
|
||||
//testFile("image/sar-ceos", "", ""),
|
||||
//testFile("application/acad", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_ZIP, "zip", "quick.zip"),
|
||||
//testFile(MIMETYPE_IMAGE_PSD, "psd", ""),
|
||||
//testFile("application/x-sharedlib", "", ""),
|
||||
//testFile("audio/x-m4a", "", ""),
|
||||
//testFile("image/webp", "", ""),
|
||||
//testFile("application/vnd.wap.xhtml+xml", "", ""),
|
||||
//testFile("audio/x-aiff", "aiff", ""),
|
||||
//testFile("application/vnd.ms-spreadsheetml", "", ""),
|
||||
//testFile("image/x-airsar", "", ""),
|
||||
//testFile("application/x-pcidsk", "", ""),
|
||||
//testFile("application/x-java-pack200", "", ""),
|
||||
//testFile("image/x-fujibas", "", ""),
|
||||
//testFile("application/x-zmap", "", ""),
|
||||
//testFile("image/x-bmp", "", ""),
|
||||
//testFile("image/bpg", "", ""),
|
||||
//testFile(MIMETYPE_RTF, "rtf", ""),
|
||||
//testFile("application/x-xz", "", ""),
|
||||
//testFile("application/x-speex", "", ""),
|
||||
//testFile("audio/ogg; codecs=speex", "", ""),
|
||||
//testFile("application/x-l1b", "", ""),
|
||||
//testFile("application/x-gsbg", "", ""),
|
||||
//testFile("application/x-sdat", "", ""),
|
||||
//testFile("application/vnd.ms-visio", "", ""),
|
||||
//testFile("application/x-coredump", "", ""),
|
||||
//testFile("application/x-msaccess", "", ""),
|
||||
//testFile("application/x-dods", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_IMAGE_PNG, "png", "quick.png"),
|
||||
//testFile("application/vnd.ms-outlook-pst", "", ""),
|
||||
//testFile("image/bsb", "", ""),
|
||||
//testFile("application/x-cpio", "cpio", ""),
|
||||
//testFile("audio/ogg", "oga", ""),
|
||||
TestFileInfo.testFile("application/x-tar", "tar", "quick.tar"),
|
||||
//testFile("application/x-dbf", "", ""),
|
||||
//testFile("video/x-ogm", "", ""),
|
||||
//testFile("application/x-los-las", "", ""),
|
||||
//testFile("application/autocad_dwg", "", ""),
|
||||
//testFile("application/vnd.ms-excel.workspace.3", "", ""),
|
||||
//testFile("application/vnd.ms-excel.workspace.4", "", ""),
|
||||
//testFile("image/x-bpg", "", ""),
|
||||
//testFile("gzip/document", "", ""),
|
||||
//testFile("text/x-java", "", ""),
|
||||
//testFile("application/x-brotli", "", ""),
|
||||
//testFile("application/elas", "", ""),
|
||||
//testFile("image/x-jb2", "", ""),
|
||||
//testFile("application/x-cappi", "", ""),
|
||||
//testFile("application/epub+zip", "", ""),
|
||||
//testFile("application/x-ace2", "", ""),
|
||||
//testFile("application/x-sas-data", "", ""),
|
||||
//testFile("application/x-hdf", "hdf", ""),
|
||||
//testFile("image/x-mff", "", ""),
|
||||
//testFile("image/x-srp", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_IMAGE_BMP, "bmp", "quick.bmp"),
|
||||
//testFile("video/x-ogguvs", "", ""),
|
||||
//testFile("drawing/dwg", "", ""),
|
||||
//testFile("application/x-doq2", "", ""),
|
||||
//testFile("application/x-acad", "", ""),
|
||||
//testFile("application/x-kml", "", ""),
|
||||
//testFile("application/x-autocad", "", ""),
|
||||
//testFile("image/x-mff2", "", ""),
|
||||
//testFile("application/x-snodas", "", ""),
|
||||
//testFile("application/terragen", "", ""),
|
||||
//testFile("application/x-wcs", "", ""),
|
||||
//testFile("text/x-c++src", "", ""),
|
||||
//testFile("application/timestamped-data", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_IMAGE_TIFF, "tiff", "quick.tiff"),
|
||||
//testFile("application/msexcel", "", ""),
|
||||
//testFile("application/x-asp", "", ""),
|
||||
//testFile("application/x-rar-compressed", "rar", ""),
|
||||
//testFile("application/x-envi-hdr", "", ""),
|
||||
//testFile("text/iso19139+xml", "", ""),
|
||||
//testFile("application/vnd.ms-tnef", "", ""),
|
||||
//testFile("application/x-ecrg-toc", "", ""),
|
||||
//testFile("application/aig", "", ""),
|
||||
//testFile("audio/x-wav", "wav", ""),
|
||||
//testFile("image/emf", "", ""),
|
||||
//testFile("application/x-bzip", "", ""),
|
||||
//testFile("application/jdem", "", ""),
|
||||
//testFile("application/x-webp", "", ""),
|
||||
//testFile("application/x-arj", "", ""),
|
||||
//testFile("application/x-lzma", "", ""),
|
||||
//testFile("application/x-java-vm", "", ""),
|
||||
//testFile("image/envisat", "", ""),
|
||||
//testFile("application/x-doq1", "", ""),
|
||||
//testFile("audio/vnd.wave", "", ""),
|
||||
//testFile("application/x-ppi", "", ""),
|
||||
//testFile("image/ilwis", "", ""),
|
||||
//testFile("application/x-gunzip", "", ""),
|
||||
//testFile("image/x-icon", "", ""),
|
||||
//testFile("application/ogg", "ogx", ""),
|
||||
//testFile(MIMETYPE_IMAGE_SVG, "svg", ""),
|
||||
//testFile("application/x-ms-owner", "", ""),
|
||||
//testFile("application/x-grib", "", ""),
|
||||
//testFile("application/ms-tnef", "", ""),
|
||||
//testFile("image/fits", "", ""),
|
||||
//testFile("audio/x-mpeg", "", ""),
|
||||
//testFile("application/x-bzip2", "", ""),
|
||||
//testFile("text/tsv", "", ""),
|
||||
//testFile("application/x-fictionbook+xml", "", ""),
|
||||
//testFile("application/x-p-aux", "", ""),
|
||||
//testFile("application/x-font-ttf", "", ""),
|
||||
//testFile("image/x-xcf", "", ""),
|
||||
//testFile("image/x-ms-bmp", "", ""),
|
||||
//testFile("image/wmf", "", ""),
|
||||
//testFile("image/eir", "", ""),
|
||||
//testFile("application/x-matlab-data", "", ""),
|
||||
//testFile("application/deflate64", "", ""),
|
||||
//testFile("audio/wav", "", ""),
|
||||
//testFile("application/x-rs2", "", ""),
|
||||
//testFile("application/vnd.ms-word", "", ""),
|
||||
//testFile("application/x-tsx", "", ""),
|
||||
//testFile("application/x-lcp", "", ""),
|
||||
//testFile("application/x-mbtiles", "", ""),
|
||||
//testFile("audio/x-oggpcm", "", ""),
|
||||
//testFile("application/x-epsilon", "", ""),
|
||||
//testFile("application/x-msgn", "", ""),
|
||||
//testFile(MIMETYPE_TEXT_CSV, "csv", ""),
|
||||
//testFile("image/x-dimap", "", ""),
|
||||
//testFile("image/vnd.microsoft.icon", "", ""),
|
||||
//testFile("application/x-envi", "", ""),
|
||||
//testFile("application/x-dwg", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_IWORK_NUMBERS, "numbers", "quick.numbers"),
|
||||
//testFile("application/vnd.ms-word2006ml", "", ""),
|
||||
//testFile("application/x-bt", "", ""),
|
||||
//testFile("application/x-font-adobe-metric", "", ""),
|
||||
//testFile("application/x-rst", "", ""),
|
||||
//testFile("application/vrt", "", ""),
|
||||
//testFile("application/x-ctg", "", ""),
|
||||
//testFile("application/x-e00-grid", "", ""),
|
||||
//testFile("audio/x-ogg-flac", "", ""),
|
||||
//testFile("application/x-compress", "z", ""),
|
||||
//testFile("image/x-psd", "", ""),
|
||||
//testFile("text/rss", "", ""),
|
||||
//testFile("application/sdts-raster", "", ""),
|
||||
//testFile("application/oxps", "", ""),
|
||||
//testFile("application/leveller", "", ""),
|
||||
//testFile("application/x-ingr", "", ""),
|
||||
//testFile("image/sgi", "", ""),
|
||||
//testFile("application/x-pnm", "", ""),
|
||||
//testFile("image/raster", "", ""),
|
||||
//testFile("audio/x-ogg-pcm", "", ""),
|
||||
//testFile("audio/ogg; codecs=opus", "", ""),
|
||||
//testFile("application/fits", "", ""),
|
||||
//testFile("application/x-r", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_IMAGE_GIF, "gif", "quick.gif"),
|
||||
//testFile("application/java-vm", "", ""),
|
||||
//testFile("application/mspowerpoint", "", ""),
|
||||
//testFile("application/x-http", "", ""),
|
||||
//testFile("application/x-rmf", "", ""),
|
||||
//testFile("application/x-ogg", "", ""),
|
||||
//testFile("video/ogg", "ogv", "quick.ogv"),
|
||||
//testFile(MIMETYPE_APPLEFILE, "", ""),
|
||||
//testFile("text/rtf", "", ""),
|
||||
//testFile("image/adrg", "", ""),
|
||||
//testFile("video/x-ogg-rgb", "", ""),
|
||||
//testFile("application/x-ngs-geoid", "", ""),
|
||||
//testFile("application/x-map", "", ""),
|
||||
//testFile("image/ceos", "", ""),
|
||||
//testFile("application/xpm", "", ""),
|
||||
//testFile("application/x-ers", "", ""),
|
||||
//testFile("video/x-ogg-yuv", "", ""),
|
||||
//testFile("application/x-isis2", "", ""),
|
||||
//testFile("application/x-nwt-grd", "", ""),
|
||||
//testFile("application/x-isis3", "", ""),
|
||||
//testFile("application/x-nwt-grc", "", ""),
|
||||
//testFile("video/daala", "", ""),
|
||||
//testFile("application/x-blx", "", ""),
|
||||
//testFile("application/x-tnef", "", ""),
|
||||
//testFile("video/x-dirac", "", ""),
|
||||
//testFile("application/x-ndf", "", ""),
|
||||
//testFile("image/vnd.wap.wbmp", "", ""),
|
||||
//testFile("video/theora", "", ""),
|
||||
//testFile("application/kate", "", ""),
|
||||
//testFile("application/pkcs7-mime", "", ""),
|
||||
//testFile("image/fit", "", ""),
|
||||
//testFile("application/x-ctable2", "", ""),
|
||||
//testFile("application/x-executable", "", ""),
|
||||
//testFile("application/x-isatab", "", ""),
|
||||
//testFile("application/grass-ascii-grid", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_TEXT_PLAIN, "txt", "quick.txt"),
|
||||
//testFile("application/gzipped", "", ""),
|
||||
//testFile("application/x-gxf", "", ""),
|
||||
//testFile("application/x-cpg", "", ""),
|
||||
//testFile("application/x-lan", "", ""),
|
||||
//testFile("application/x-xyz", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_IWORK_PAGES, "pages", "quick.pages"),
|
||||
//testFile("image/x-jbig2", "", ""),
|
||||
//testFile("image/nitf", "", ""),
|
||||
//testFile("application/mbox", "", ""),
|
||||
//testFile("application/chm", "", ""),
|
||||
//testFile("application/x-fast", "", ""),
|
||||
//testFile("application/x-gsc", "", ""),
|
||||
//testFile("application/x-deflate", "", ""),
|
||||
//testFile("application/x-grib2", "", ""),
|
||||
//testFile("image/x-ozi", "", ""),
|
||||
//testFile("application/x-pds", "", ""),
|
||||
//testFile("application/vnd.apple.iwork", "", ""),
|
||||
//testFile("application/x-usgs-dem", "", ""),
|
||||
//testFile("application/vnd.ms-excel.sheet.2", "", ""),
|
||||
//testFile("application/vnd.ms-excel.sheet.3", "", ""),
|
||||
//testFile("application/dif+xml", "", ""),
|
||||
//testFile("application/vnd.ms-excel.sheet.4", "", ""),
|
||||
//testFile("application/x-java", "", ""),
|
||||
//testFile("image/geotiff", "", ""),
|
||||
//testFile("application/x-gsag", "", ""),
|
||||
//testFile("application/x-snappy", "", ""),
|
||||
//testFile("video/x-theora", "", ""),
|
||||
//testFile("image/ntf", "", ""),
|
||||
//testFile("application/x-pdf", "", ""),
|
||||
//testFile("application/xml", "", ""),
|
||||
//testFile("application/vnd.wordperfect; version=6.x", "", ""),
|
||||
//testFile("application/pkcs7-signature", "", ""),
|
||||
//testFile("application/vnd.wordperfect; version=5.1", "", ""),
|
||||
//testFile("application/vnd.wordperfect; version=5.0", "", ""),
|
||||
//testFile("application/x-arj-compressed", "", ""),
|
||||
//testFile("application/geotopic", "", ""),
|
||||
//testFile("text/x-java-source", "java", ""),
|
||||
//testFile("audio/basic", "au", ""),
|
||||
//testFile("application/pcisdk", "", ""),
|
||||
//testFile("application/x-rik", "", ""),
|
||||
//testFile("audio/opus", "", ""),
|
||||
//testFile(MIMETYPE_IMAGE_JP2, "jp2", ""),
|
||||
//testFile("application/x-gtx", "", ""),
|
||||
//testFile("application/x-object", "", ""),
|
||||
//testFile("application/vnd.ms-wordml", "", ""),
|
||||
//testFile("image/x-wmf", "", ""),
|
||||
//testFile("application/x-rpf-toc", "", ""),
|
||||
//testFile("application/x-srtmhgt", "", ""),
|
||||
//testFile("application/x-generic-bin", "", ""),
|
||||
//testFile("text/vnd.iptc.anpa", "", ""),
|
||||
//testFile("application/x-msmetafile", "", ""),
|
||||
//testFile("application/x-wms", "", ""),
|
||||
//testFile("video/x-oggrgb", "", ""),
|
||||
//testFile("image/xcf", "", ""),
|
||||
//testFile("application/photoshop", "", ""),
|
||||
//testFile("application/x-lz4", "", ""),
|
||||
//testFile("application/x-7z-compressed", "", ""),
|
||||
//testFile("application/gff", "", ""),
|
||||
//testFile("video/x-oggyuv", "", ""),
|
||||
//testFile("application/x-msdownload", "", ""),
|
||||
//testFile("image/icns", "", ""),
|
||||
//testFile("application/x-emf", "", ""),
|
||||
//testFile("application/x-geo-pdf", "", ""),
|
||||
//testFile("video/x-ogg-uvs", "", ""),
|
||||
TestFileInfo.testFile(MIMETYPE_VIDEO_FLV, "flv", "quick.flv"),
|
||||
//testFile("application/x-zip-compressed", "", ""),
|
||||
//testFile("application/gzip", "", ""),
|
||||
//testFile("application/x-tika-unix-dump", "", ""),
|
||||
//testFile("application/x-coasp", "", ""),
|
||||
//testFile("application/x-dipex", "", ""),
|
||||
//testFile("application/x-til", "", ""),
|
||||
//testFile("application/x-gzip", "gzip", ""),
|
||||
//testFile("application/x-gs7bg", "", ""),
|
||||
//testFile("application/x-unix-archive", "", ""),
|
||||
//testFile("application/x-elf", "", ""),
|
||||
//testFile("application/dted", "", ""),
|
||||
//testFile("application/x-rasterlite", "", ""),
|
||||
//testFile("audio/x-mp4a", "", ""),
|
||||
//testFile("application/x-gzip-compressed", "", ""),
|
||||
//testFile("application/x-chm", "", ""),
|
||||
//testFile("image/hfa", "", ""),
|
||||
|
||||
// Special test cases from the repo tests
|
||||
// ======================================
|
||||
|
||||
// Test for MNT-577: Alfresco is running 100% CPU for over 10 minutes while extracting metadata for
|
||||
// Word office document
|
||||
// testFile(MIMETYPE_OPENXML_WORDPROCESSING, "docx", "problemFootnotes2.docx")
|
||||
|
||||
// Test MNT-15219 Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may
|
||||
// cause OutOfMemory in Tika Note - doesn't use extractFromMimetype
|
||||
TestFileInfo.testFile(MIMETYPE_OPENXML_SPREADSHEET, "xlsx", "dmsu1332-reproduced.xlsx")
|
||||
|
||||
);
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("tika2_2_1_upgradeFailures")
|
||||
public void testTika_2_2_1_upgradeFailures(TestFileInfo testFileInfo)
|
||||
{
|
||||
super.testTransformation(testFileInfo);
|
||||
}
|
||||
|
||||
private static Stream<TestFileInfo> tika2_2_1_upgradeFailures()
|
||||
{
|
||||
// When we upgraded to Tika 2.2.1 from 2.2.0:
|
||||
// - the original OfficeOpenXMLCore.SUBJECT raw metadata value started being null.
|
||||
// - the replacement TikaCoreProperties.SUBJECT raw metadata changed into a multi value
|
||||
// The following test files were the ones that failed.
|
||||
return Stream.of(
|
||||
TestFileInfo.testFile(MIMETYPE_OPENDOCUMENT_GRAPHICS_TEMPLATE, "otg", "quick.otg"),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENOFFICE1_WRITER, "sxw", "quick.sxw"),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENDOCUMENT_GRAPHICS, "odg", "quick.odg"),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENDOCUMENT_TEXT, "odt", "quick.odt"),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENDOCUMENT_TEXT_TEMPLATE, "ott", "quick.ott"),
|
||||
TestFileInfo.testFile(MIMETYPE_OPENDOCUMENT_FORMULA, "odf", "quick.odf"),
|
||||
TestFileInfo.testFile(MIMETYPE_PDF, "pdf", "quick.pdf")
|
||||
);
|
||||
}
|
||||
}
|
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika;
|
||||
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENXML_WORDPROCESSING;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
import org.alfresco.transform.client.model.TransformRequest;
|
||||
import org.alfresco.transform.base.AbstractQueueTransformServiceIT;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
|
||||
/**
|
||||
* @author Lucian Tuca
|
||||
* created on 15/01/2019
|
||||
*/
|
||||
@SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT,
|
||||
properties = {"activemq.url=nio://localhost:61616"})
|
||||
public class TikaQueueTransformServiceIT extends AbstractQueueTransformServiceIT
|
||||
{
|
||||
@Override
|
||||
protected TransformRequest buildRequest()
|
||||
{
|
||||
return TransformRequest
|
||||
.builder()
|
||||
.withRequestId(UUID.randomUUID().toString())
|
||||
.withSourceMediaType(MIMETYPE_OPENXML_WORDPROCESSING)
|
||||
.withTargetMediaType(MIMETYPE_TEXT_PLAIN)
|
||||
.withTargetExtension("txt")
|
||||
.withSchema(1)
|
||||
.withClientData("ACS")
|
||||
.withSourceReference(UUID.randomUUID().toString())
|
||||
.withSourceSize(32L).build();
|
||||
}
|
||||
}
|
@@ -0,0 +1,177 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika;
|
||||
|
||||
import static java.text.MessageFormat.format;
|
||||
import static java.util.function.Function.identity;
|
||||
import static org.alfresco.transform.base.EngineClient.sendTRequest;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.fail;
|
||||
import static org.springframework.http.HttpStatus.OK;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
|
||||
import org.alfresco.transform.base.EngineClient;
|
||||
import org.apache.commons.lang3.tuple.Triple;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.core.io.Resource;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
|
||||
/**
|
||||
* @author Cezar Leahu
|
||||
*/
|
||||
public class TikaTransformationIT
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(TikaTransformationIT.class);
|
||||
private static final String ENGINE_URL = "http://localhost:8090";
|
||||
private static final Map<String, String> extensionMimetype = ImmutableMap.of(
|
||||
"html", "text/html",
|
||||
"txt", "text/plain",
|
||||
"xhtml", "application/xhtml+xml",
|
||||
"xml", "text/xml");
|
||||
|
||||
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("engineTransformations")
|
||||
public void testTransformation(Triple<String, String, String> entry)
|
||||
{
|
||||
final String sourceFile = entry.getLeft();
|
||||
final String sourceMimetype = entry.getRight();
|
||||
final String targetExtension = entry.getMiddle();
|
||||
String targetMimetype;
|
||||
//Single test to cover pdf-->csv
|
||||
if (sourceFile.contains("pdf") && targetExtension.contains("csv"))
|
||||
{
|
||||
targetMimetype = "text/csv";
|
||||
}
|
||||
else
|
||||
{
|
||||
targetMimetype = extensionMimetype.get(entry.getMiddle());
|
||||
}
|
||||
|
||||
|
||||
final String descriptor = format("Transform ({0}, {1} -> {2}, {3})",
|
||||
sourceFile, sourceMimetype, targetMimetype, targetExtension);
|
||||
try
|
||||
{
|
||||
final ResponseEntity<Resource> response = EngineClient.sendTRequest(ENGINE_URL, sourceFile, null,
|
||||
targetMimetype, targetExtension, ImmutableMap.of(
|
||||
"targetEncoding", "UTF-8",
|
||||
"sourceMimetype", sourceMimetype));
|
||||
assertEquals(OK, response.getStatusCode(), descriptor);
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
fail(descriptor + " exception: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
private static Stream<Triple<String, String, String>> allTargets(final String sourceFile,
|
||||
final String sourceMimetype)
|
||||
{
|
||||
return extensionMimetype
|
||||
.keySet()
|
||||
.stream()
|
||||
.map(k -> Triple.of(sourceFile, k, sourceMimetype));
|
||||
}
|
||||
|
||||
// TODO unit tests for the following file types (for which is difficult to find file samples):
|
||||
// *.ogx (application/ogg)
|
||||
// *.cpio (application/x-cpio)
|
||||
// *.cdf (application/x-netcdf)
|
||||
// *.hdf (application/x-hdf)
|
||||
public static Stream<Triple<String, String, String>> engineTransformations()
|
||||
{
|
||||
return Stream
|
||||
.of(
|
||||
allTargets("quick.doc", "application/msword"),
|
||||
allTargets("quick.docx",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
|
||||
allTargets("quick.html", "text/html"),
|
||||
allTargets("quick.jar", "application/java-archive"),
|
||||
allTargets("quick.java", "text/x-java-source"),
|
||||
Stream.of(
|
||||
Triple.of("quick.key", "html", "application/vnd.apple.keynote"),
|
||||
// Does not work, alfresco-docker-sourceMimetype-misc can handle this target mimetype, removed from engine_config.json
|
||||
Triple.of("quick.key", "txt", "application/vnd.apple.keynote"),
|
||||
Triple.of("quick.key", "xhtml", "application/vnd.apple.keynote"),
|
||||
Triple.of("quick.key", "xml", "application/vnd.apple.keynote")
|
||||
),
|
||||
allTargets("quick.msg", "application/vnd.ms-outlook"),
|
||||
Stream.of(
|
||||
Triple.of("quick.numbers", "html", "application/vnd.apple.numbers"),
|
||||
// Does not work, alfresco-docker-sourceMimetype-misc can handle this target mimetype, removed from engine_config.json
|
||||
// Triple.of("quick.numbers", "txt", "TikaAuto"),
|
||||
Triple.of("quick.numbers", "xhtml", "application/vnd.apple.numbers"),
|
||||
Triple.of("quick.numbers", "xml", "application/vnd.apple.numbers")
|
||||
),
|
||||
Stream.of(
|
||||
Triple.of("quick.pdf", "csv", "application/pdf")
|
||||
),
|
||||
allTargets("quick.odp", "application/vnd.oasis.opendocument.presentation"),
|
||||
allTargets("quick.ods", "application/vnd.oasis.opendocument.spreadsheet"),
|
||||
allTargets("quick.odt", "application/vnd.oasis.opendocument.text"),
|
||||
allTargets("quick.otp", "application/vnd.oasis.opendocument.presentation-template"),
|
||||
allTargets("quick.ots", "application/vnd.oasis.opendocument.spreadsheet-template"),
|
||||
allTargets("quick.ott", "application/vnd.oasis.opendocument.text-template"),
|
||||
Stream.of(
|
||||
Triple.of("quick.pages", "html", "application/vnd.apple.pages"),
|
||||
// Does not work, alfresco-docker-sourceMimetype-misc can handle this target mimetype, removed from engine_config.json
|
||||
// Triple.of("quick.pages", "txt", "TikaAuto"),
|
||||
Triple.of("quick.pages", "xhtml", "application/vnd.apple.pages"),
|
||||
Triple.of("quick.pages", "xml", "application/vnd.apple.pages")
|
||||
),
|
||||
allTargets("quick.pdf", "application/pdf"),
|
||||
allTargets("quick.ppt", "application/vnd.ms-powerpoint"),
|
||||
allTargets("quick.pptx",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation"),
|
||||
allTargets("quick.sxw", "application/vnd.sun.xml.writer"),
|
||||
allTargets("quick.txt", "text/plain"),
|
||||
allTargets("quick.vsd", "application/vnd.visio"),
|
||||
allTargets("quick.xls", "application/vnd.ms-excel"),
|
||||
allTargets("quick.xslx",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
|
||||
allTargets("quick.zip", "application/zip"),
|
||||
allTargets("quick.tar", "application/x-tar"),
|
||||
allTargets("sample.rtf", "application/rtf"),
|
||||
allTargets("quick.xml", "text/xml"),
|
||||
allTargets("sample.xhtml.txt", "application/xhtml+xml"),
|
||||
allTargets("sample.rss", "application/rss+xml"),
|
||||
//allTargets("quick.rar", "application/x-rar-compressed"),
|
||||
allTargets("quick.z", "application/x-compress"),
|
||||
allTargets("quick.csv", "text/csv"),
|
||||
allTargets("quick.tar.gz", "application/x-gzip"))
|
||||
.flatMap(identity());
|
||||
}
|
||||
}
|
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class IPTCMetadataExtractorTest
|
||||
{
|
||||
IPTCMetadataExtractor extractor = new IPTCMetadataExtractor();
|
||||
|
||||
@Test
|
||||
public void testIptcToIso8601DateStrings() {
|
||||
String[] testStrings = { "1890:01:01", "1901:02:01 00:00:00.000Z", "1901-02-01 00:00:00.000Z",
|
||||
"1901-02-01T00:00:00.000Z", "1901:02:01T00:00+00:00", "1901:02:01 00:00+00:00" };
|
||||
String[] expected = { "1890-01-01", "1901-02-01T00:00:00.000Z", "1901-02-01T00:00:00.000Z",
|
||||
"1901-02-01T00:00:00.000Z", "1901-02-01T00:00+00:00", "1901-02-01T00:00+00:00" };
|
||||
|
||||
assertArrayEquals(expected, extractor.iptcToIso8601DateStrings(testStrings));
|
||||
}
|
||||
}
|
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.parsers;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class ExifToolParserTest {
|
||||
|
||||
ExifToolParser exifToolParser = new ExifToolParser();
|
||||
|
||||
@Test
|
||||
public void testFindSeparator() {
|
||||
|
||||
String testCommand = "env FOO=${OUTPUT} exiftool -args -G1 " + ExifToolParser.SEPARATOR_SETTING
|
||||
+ " \"|||\" ${INPUT}";
|
||||
String expected = "|||";
|
||||
String actual = exifToolParser.findSeparator(testCommand);
|
||||
assertEquals(expected, actual);
|
||||
|
||||
expected = "TESTWITHOUTQUOTES";
|
||||
testCommand = "nothing matters until the " + ExifToolParser.SEPARATOR_SETTING + " " + expected
|
||||
+ " now all this extra should be ignored";
|
||||
actual = exifToolParser.findSeparator(testCommand);
|
||||
assertEquals(expected, actual);
|
||||
|
||||
expected = "Test something bonkers 112!£$%^£$^";
|
||||
testCommand = ExifToolParser.SEPARATOR_SETTING + " \""+expected+"\"";
|
||||
actual = exifToolParser.findSeparator(testCommand);
|
||||
assertEquals(expected, actual);
|
||||
|
||||
}
|
||||
|
||||
}
|
@@ -0,0 +1,140 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import static org.alfresco.transform.tika.transformers.Tika.NOT_EXTRACT_BOOKMARKS_TEXT;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.TARGET_ENCODING;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.TARGET_MIMETYPE;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.Mockito.clearInvocations;
|
||||
import static org.mockito.Mockito.lenient;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.spy;
|
||||
import static org.mockito.Mockito.times;
|
||||
import static org.mockito.Mockito.verify;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
public class GenericTikaTransformerTest
|
||||
{
|
||||
private class TikaTestTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
TikaTestTransformer(boolean notExtractBookmarksTextDefault)
|
||||
{
|
||||
this.notExtractBookmarksTextDefault = notExtractBookmarksTextDefault;
|
||||
}
|
||||
};
|
||||
|
||||
@Test
|
||||
public void testNotExtractBookmarkTextDefault() throws Exception
|
||||
{
|
||||
GenericTikaTransformer executorSpyDefaultTrue = spy(new TikaTestTransformer(true));
|
||||
GenericTikaTransformer executorSpyDefaultFalse = spy(new TikaTestTransformer(false));
|
||||
|
||||
File mockSourceFile = mock(File.class);
|
||||
File mockTargetFile = mock(File.class);
|
||||
String transformName = "transformName";
|
||||
String sourceMimetype = "sourceMimetype";
|
||||
String targetMimetype = "targetMimetype";
|
||||
String defaultEncoding = "UTF-8";
|
||||
|
||||
// no need to continue execution passed here or check values as we're checking the correct params passed to this method later.
|
||||
lenient().doNothing().when(executorSpyDefaultTrue).call(any(), any(), any(), any(), any(), any(), any());
|
||||
lenient().doNothing().when(executorSpyDefaultFalse).call(any(), any(), any(), any(), any(), any(), any());
|
||||
|
||||
Map<String, String> transformOptions = new HashMap<>();
|
||||
|
||||
// use empty transformOptions to test defaults
|
||||
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions,
|
||||
mockSourceFile, mockTargetFile);
|
||||
executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions,
|
||||
mockSourceFile, mockTargetFile);
|
||||
|
||||
// when default set to true, with no options passed we should get a call method with NOT_EXTRACT_BOOKMARKS_TEXT
|
||||
verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null,
|
||||
NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
// when default set to false, with no options passed we should get a call method without NOT_EXTRACT_BOOKMARKS_TEXT
|
||||
verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
// use transforms with notExtractBookmarksText set to true
|
||||
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
|
||||
transformOptions.put("notExtractBookmarksText", "true");
|
||||
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions,
|
||||
mockSourceFile, mockTargetFile);
|
||||
executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions,
|
||||
mockSourceFile, mockTargetFile);
|
||||
|
||||
// both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT
|
||||
verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null,
|
||||
NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null,
|
||||
NOT_EXTRACT_BOOKMARKS_TEXT, TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
// use transforms with notExtractBookmarksText set to false
|
||||
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
|
||||
transformOptions.replace("notExtractBookmarksText", "true", "false");
|
||||
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
|
||||
executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
|
||||
|
||||
// both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT
|
||||
verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + defaultEncoding);
|
||||
|
||||
// use full set of pdfbox transformOptions just to be safe
|
||||
clearInvocations(executorSpyDefaultTrue, executorSpyDefaultFalse);
|
||||
transformOptions.put("targetEncoding", "anyEncoding");
|
||||
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
|
||||
executorSpyDefaultFalse.transform(transformName, sourceMimetype, targetMimetype, transformOptions, mockSourceFile, mockTargetFile);
|
||||
|
||||
// both call methods should have NOT_EXTRACT_BOOKMARKS_TEXT but the encoding will change
|
||||
verify(executorSpyDefaultTrue, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + "anyEncoding");
|
||||
|
||||
verify(executorSpyDefaultFalse, times(1)).call(mockSourceFile, mockTargetFile, transformName, null, null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + "anyEncoding");
|
||||
}
|
||||
}
|
Binary file not shown.
@@ -0,0 +1,92 @@
|
||||
{
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedSublocation" : "Kidlington",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrCity" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}CopyrightOwnerName" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}DigitalSourceType" : "ARW",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}IptcLastEdited" : "2021:05:27",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCountryCode" : "+44",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrCtry" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}ModelAge" : [ "25", "153", "3" ],
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedProvinceState" : "New York",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorStreetAddress" : "63 Windsor Road",
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : null,
|
||||
"{http://ns.adobe.com/photoshop/1.0/}TransmissionReference" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownSublocation" : "Kidlington",
|
||||
"{http://purl.org/dc/elements/1.1/}description" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOSource" : "Fox sources",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}State" : "New York",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorRegion" : "Maidenhead",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}RegOrgId" : "id48485",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiTelWork" : "01865 513465",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}DigitalSourcefileType" : "ARW",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOCopyrightNotice" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOSourceInvNo" : "4845484",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageCreatorName" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}CVterm" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCountryName" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCity" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}CopyrightOwnerID" : "id54154",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Category" : "Comedy",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}OrganisationInImageName" : "Alfresco",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierName" : "Alfresco",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}SupplementalCategories" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}Scene" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrExtadr" : "Kidlington",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Headline" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}SubjectCode" : "fodijfsij5454",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}City" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorEmail" : "DiegoS@alfrescodev.onmicrosoft.com",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}Version" : "1.1",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AddlModelInfo" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownWorldRegion" : "Soth West",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageCreatorID" : "id548454",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ModelReleaseID" : "id4845151",
|
||||
"{http://purl.org/dc/elements/1.1/}subject" : "-fox -dog -lazy -jumping",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}DigImageGUID" : "49848484",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}CaptionWriter" : "Ayman",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorExtendedAddress" : "DiegoS@alfrescodev.onmicrosoft.com",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOTitle" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}DateCreated" : "2021-05-27",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCity" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierImageID" : "id8454841",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedWorldRegion" : "South West",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorTelephone1" : "01865 465986",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Source" : "Fox sources",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierID" : "id659568",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorTelephone2" : "01865 465986",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrRegion" : "Oxon",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrPcode" : "OX132EN",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorPostalCode" : "OX136XN",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Urgency " : "5 (normal urgency)",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorID" : "id87515454",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiEmailWork" : "DiegoS@alfrescodev.onmicrosoft.com",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCountryName" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}OrganisationInImageCode" : "546851381",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AODateCreated" : "2021-05-27",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}PersonInImage" : "Ayman",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorCity" : "Maidenhead",
|
||||
"{http://purl.org/dc/elements/1.1/}creator" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownProvinceState" : "New York",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CountryCode" : "+44",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}Event" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}RegItemId" : "id48485",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorCountry" : "England",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}AuthorsPosition" : "Left Center",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Credit" : "Visa",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}IntellectualGenre" : "Comedy",
|
||||
"{http://purl.org/dc/elements/1.1/}rights" : "Fox rights",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}MaxAvailWidth" : "300",
|
||||
"{http://purl.org/dc/elements/1.1/}title" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCountryCode" : "+44",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}MaxAvailHeight" : "400",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : null,
|
||||
"{http://ns.adobe.com/xap/1.0/rights/}UsageTerms" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}Location" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}PropertyReleaseID" : "id841584",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Country" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOCreator" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiUrlWork" : "01865 513465",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Instructions" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorName" : "Alfresco",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorURL" : "www.alfresco.com"
|
||||
}
|
BIN
engines/tika/src/test/resources/20140629_145035_Flower.RW2
Normal file
BIN
engines/tika/src/test/resources/20140629_145035_Flower.RW2
Normal file
Binary file not shown.
@@ -0,0 +1,92 @@
|
||||
{
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedSublocation" : "Kidlington",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrCity" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}CopyrightOwnerName" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}DigitalSourceType" : "RW2",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}IptcLastEdited" : "2021:05:27",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCountryCode" : "+44",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrCtry" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}ModelAge" : [ "25", "153", "3" ],
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedProvinceState" : "New York",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorStreetAddress" : "63 Windsor Road",
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : null,
|
||||
"{http://ns.adobe.com/photoshop/1.0/}TransmissionReference" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownSublocation" : "Kidlington",
|
||||
"{http://purl.org/dc/elements/1.1/}description" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOSource" : "Fox sources",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}State" : "New York",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorRegion" : "Maidenhead",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}RegOrgId" : "id48485",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiTelWork" : "01865 513465",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}DigitalSourcefileType" : "RW2",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOCopyrightNotice" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOSourceInvNo" : "4845484",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageCreatorName" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}CVterm" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCountryName" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCity" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}CopyrightOwnerID" : "id54154",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Category" : "Comedy",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}OrganisationInImageName" : "Alfresco",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierName" : "Alfresco",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}SupplementalCategories" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}Scene" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrExtadr" : "Kidlington",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Headline" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}SubjectCode" : "fodijfsij5454",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}City" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorEmail" : "DiegoS@alfrescodev.onmicrosoft.com",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}Version" : "1.1",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AddlModelInfo" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownWorldRegion" : "Soth West",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageCreatorID" : "id548454",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ModelReleaseID" : "id4845151",
|
||||
"{http://purl.org/dc/elements/1.1/}subject" : "-fox -dog -lazy -jumping",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}DigImageGUID" : "49848484",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}CaptionWriter" : "Ayman",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorExtendedAddress" : "DiegoS@alfrescodev.onmicrosoft.com",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOTitle" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}DateCreated" : "2021-05-27",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCity" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierImageID" : "id8454841",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedWorldRegion" : "South West",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorTelephone1" : "01865 465986",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Source" : "Fox sources",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierID" : "id659568",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorTelephone2" : "01865 465986",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrRegion" : "Oxon",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrPcode" : "OX132EN",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorPostalCode" : "OX136XN",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Urgency " : "5 (normal urgency)",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorID" : "id87515454",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiEmailWork" : "DiegoS@alfrescodev.onmicrosoft.com",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCountryName" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}OrganisationInImageCode" : "546851381",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AODateCreated" : "2021-05-27",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}PersonInImage" : "Ayman",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorCity" : "Maidenhead",
|
||||
"{http://purl.org/dc/elements/1.1/}creator" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownProvinceState" : "New York",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CountryCode" : "+44",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}Event" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}RegItemId" : "id48485",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorCountry" : "England",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}AuthorsPosition" : "Left Center",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Credit" : "Visa",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}IntellectualGenre" : "Comedy",
|
||||
"{http://purl.org/dc/elements/1.1/}rights" : "Fox rights",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}MaxAvailWidth" : "300",
|
||||
"{http://purl.org/dc/elements/1.1/}title" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCountryCode" : "+44",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}MaxAvailHeight" : "400",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : null,
|
||||
"{http://ns.adobe.com/xap/1.0/rights/}UsageTerms" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}Location" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}PropertyReleaseID" : "id841584",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Country" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOCreator" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiUrlWork" : "01865 513465",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Instructions" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorName" : "Alfresco",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorURL" : "www.alfresco.com"
|
||||
}
|
BIN
engines/tika/src/test/resources/20141227_134519_Palace.CR2
Normal file
BIN
engines/tika/src/test/resources/20141227_134519_Palace.CR2
Normal file
Binary file not shown.
@@ -0,0 +1,92 @@
|
||||
{
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedSublocation" : "Kidlington",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrCity" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}CopyrightOwnerName" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}DigitalSourceType" : "CR2",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}IptcLastEdited" : "2021:05:27",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCountryCode" : "+44",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrCtry" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}ModelAge" : [ "25", "153", "3" ],
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedProvinceState" : "New York",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorStreetAddress" : "63 Windsor Road",
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : null,
|
||||
"{http://ns.adobe.com/photoshop/1.0/}TransmissionReference" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownSublocation" : "Kidlington",
|
||||
"{http://purl.org/dc/elements/1.1/}description" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOSource" : "Fox sources",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}State" : "New York",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorRegion" : "Maidenhead",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}RegOrgId" : "id48485",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiTelWork" : "01865 513465",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}DigitalSourcefileType" : "CR2",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOCopyrightNotice" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOSourceInvNo" : "4845484",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageCreatorName" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}CVterm" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCountryName" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCity" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}CopyrightOwnerID" : "id54154",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Category" : "Comedy",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}OrganisationInImageName" : "Alfresco",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierName" : "Alfresco",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}SupplementalCategories" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}Scene" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrExtadr" : "Kidlington",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Headline" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}SubjectCode" : "fodijfsij5454",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}City" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorEmail" : "DiegoS@alfrescodev.onmicrosoft.com",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}Version" : "1.1",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AddlModelInfo" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownWorldRegion" : "Soth West",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageCreatorID" : "id548454",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ModelReleaseID" : "id4845151",
|
||||
"{http://purl.org/dc/elements/1.1/}subject" : "-fox -dog -lazy -jumping",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}DigImageGUID" : "49848484",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}CaptionWriter" : "Ayman",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorExtendedAddress" : "DiegoS@alfrescodev.onmicrosoft.com",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOTitle" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}DateCreated" : "2021-05-27",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCity" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierImageID" : "id8454841",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedWorldRegion" : "South West",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorTelephone1" : "01865 465986",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Source" : "Fox sources",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierID" : "id659568",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorTelephone2" : "01865 465986",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrRegion" : "Oxon",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrPcode" : "OX132EN",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorPostalCode" : "OX136XN",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Urgency " : "5 (normal urgency)",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorID" : "id87515454",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiEmailWork" : "DiegoS@alfrescodev.onmicrosoft.com",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCountryName" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}OrganisationInImageCode" : "546851381",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AODateCreated" : "2021-05-27",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}PersonInImage" : "Ayman",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorCity" : "Maidenhead",
|
||||
"{http://purl.org/dc/elements/1.1/}creator" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownProvinceState" : "New York",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CountryCode" : "+44",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}Event" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}RegItemId" : "id48485",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorCountry" : "England",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}AuthorsPosition" : "Left Center",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Credit" : "Visa",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}IntellectualGenre" : "Comedy",
|
||||
"{http://purl.org/dc/elements/1.1/}rights" : "Fox rights",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}MaxAvailWidth" : "300",
|
||||
"{http://purl.org/dc/elements/1.1/}title" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCountryCode" : "+44",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}MaxAvailHeight" : "400",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : null,
|
||||
"{http://ns.adobe.com/xap/1.0/rights/}UsageTerms" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}Location" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}PropertyReleaseID" : "id841584",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Country" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOCreator" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiUrlWork" : "01865 513465",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Instructions" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorName" : "Alfresco",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorURL" : "www.alfresco.com"
|
||||
}
|
BIN
engines/tika/src/test/resources/20150408_074941_Bush.NEF
Normal file
BIN
engines/tika/src/test/resources/20150408_074941_Bush.NEF
Normal file
Binary file not shown.
@@ -0,0 +1,92 @@
|
||||
{
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedSublocation" : "Kidlington",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrCity" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}CopyrightOwnerName" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}DigitalSourceType" : "NEF",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}IptcLastEdited" : "2021:05:27",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCountryCode" : "+44",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrCtry" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}ModelAge" : [ "25", "153", "3" ],
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedProvinceState" : "New York",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorStreetAddress" : "63 Windsor Road",
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : null,
|
||||
"{http://ns.adobe.com/photoshop/1.0/}TransmissionReference" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownSublocation" : "Kidlington",
|
||||
"{http://purl.org/dc/elements/1.1/}description" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOSource" : "Fox sources",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}State" : "New York",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorRegion" : "Maidenhead",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}RegOrgId" : "id48485",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiTelWork" : "01865 513465",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}DigitalSourcefileType" : "NEF",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOCopyrightNotice" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOSourceInvNo" : "4845484",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageCreatorName" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}CVterm" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCountryName" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCity" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}CopyrightOwnerID" : "id54154",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Category" : "Comedy",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}OrganisationInImageName" : "Alfresco",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierName" : "Alfresco",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}SupplementalCategories" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}Scene" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrExtadr" : "Kidlington",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Headline" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}SubjectCode" : "fodijfsij5454",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}City" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorEmail" : "DiegoS@alfrescodev.onmicrosoft.com",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}Version" : "1.1",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AddlModelInfo" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownWorldRegion" : "Soth West",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageCreatorID" : "id548454",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ModelReleaseID" : "id4845151",
|
||||
"{http://purl.org/dc/elements/1.1/}subject" : "-fox -dog -lazy -jumping",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}DigImageGUID" : "49848484",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}CaptionWriter" : "Ayman",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorExtendedAddress" : "DiegoS@alfrescodev.onmicrosoft.com",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOTitle" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}DateCreated" : "2021-05-27",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCity" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierImageID" : "id8454841",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedWorldRegion" : "South West",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorTelephone1" : "01865 465986",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Source" : "Fox sources",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierID" : "id659568",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorTelephone2" : "01865 465986",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrRegion" : "Oxon",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrPcode" : "OX132EN",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorPostalCode" : "OX136XN",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Urgency " : "5 (normal urgency)",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorID" : "id87515454",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiEmailWork" : "DiegoS@alfrescodev.onmicrosoft.com",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCountryName" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}OrganisationInImageCode" : "546851381",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AODateCreated" : "2021-05-27",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}PersonInImage" : "Ayman",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorCity" : "Maidenhead",
|
||||
"{http://purl.org/dc/elements/1.1/}creator" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownProvinceState" : "New York",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CountryCode" : "+44",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}Event" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}RegItemId" : "id48485",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorCountry" : "England",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}AuthorsPosition" : "Left Center",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Credit" : "Visa",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}IntellectualGenre" : "Comedy",
|
||||
"{http://purl.org/dc/elements/1.1/}rights" : "Fox rights",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}MaxAvailWidth" : "300",
|
||||
"{http://purl.org/dc/elements/1.1/}title" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCountryCode" : "+44",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}MaxAvailHeight" : "400",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : null,
|
||||
"{http://ns.adobe.com/xap/1.0/rights/}UsageTerms" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}Location" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}PropertyReleaseID" : "id841584",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Country" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOCreator" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiUrlWork" : "01865 513465",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Instructions" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorName" : "Alfresco",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorURL" : "www.alfresco.com"
|
||||
}
|
Binary file not shown.
@@ -0,0 +1,92 @@
|
||||
{
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedSublocation" : "Kidlington",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrCity" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}CopyrightOwnerName" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}DigitalSourceType" : "RAF",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}IptcLastEdited" : "2021:05:27",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCountryCode" : "+44",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrCtry" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}ModelAge" : [ "25", "153", "3" ],
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedProvinceState" : "New York",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorStreetAddress" : "63 Windsor Road",
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : null,
|
||||
"{http://ns.adobe.com/photoshop/1.0/}TransmissionReference" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownSublocation" : "Kidlington",
|
||||
"{http://purl.org/dc/elements/1.1/}description" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOSource" : "Fox sources",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}State" : "New York",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorRegion" : "Maidenhead",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}RegOrgId" : "id48485",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiTelWork" : "01865 513465",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}DigitalSourcefileType" : "RAF",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOCopyrightNotice" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOSourceInvNo" : "4845484",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageCreatorName" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}CVterm" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCountryName" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCity" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}CopyrightOwnerID" : "id54154",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Category" : "Comedy",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}OrganisationInImageName" : "Alfresco",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierName" : "Alfresco",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}SupplementalCategories" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}Scene" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrExtadr" : "Kidlington",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Headline" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}SubjectCode" : "fodijfsij5454",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}City" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorEmail" : "DiegoS@alfrescodev.onmicrosoft.com",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}Version" : "1.1",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AddlModelInfo" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownWorldRegion" : "Soth West",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageCreatorID" : "id548454",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ModelReleaseID" : "id4845151",
|
||||
"{http://purl.org/dc/elements/1.1/}subject" : "-fox -dog -lazy -jumping",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}DigImageGUID" : "49848484",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}CaptionWriter" : "Ayman",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorExtendedAddress" : "DiegoS@alfrescodev.onmicrosoft.com",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOTitle" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}DateCreated" : "2021-05-27",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedCity" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierImageID" : "id8454841",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationCreatedWorldRegion" : "South West",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorTelephone1" : "01865 465986",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Source" : "Fox sources",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}ImageSupplierID" : "id659568",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorTelephone2" : "01865 465986",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrRegion" : "Oxon",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiAdrPcode" : "OX132EN",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorPostalCode" : "OX136XN",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Urgency " : "5 (normal urgency)",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorID" : "id87515454",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiEmailWork" : "DiegoS@alfrescodev.onmicrosoft.com",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCountryName" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}OrganisationInImageCode" : "546851381",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AODateCreated" : "2021-05-27",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}PersonInImage" : "Ayman",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorCity" : "Maidenhead",
|
||||
"{http://purl.org/dc/elements/1.1/}creator" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownProvinceState" : "New York",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CountryCode" : "+44",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}Event" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}RegItemId" : "id48485",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorCountry" : "England",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}AuthorsPosition" : "Left Center",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Credit" : "Visa",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}IntellectualGenre" : "Comedy",
|
||||
"{http://purl.org/dc/elements/1.1/}rights" : "Fox rights",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}MaxAvailWidth" : "300",
|
||||
"{http://purl.org/dc/elements/1.1/}title" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}LocationShownCountryCode" : "+44",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}MaxAvailHeight" : "400",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : null,
|
||||
"{http://ns.adobe.com/xap/1.0/rights/}UsageTerms" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}Location" : "Oxford",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}PropertyReleaseID" : "id841584",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Country" : "England",
|
||||
"{http://iptc.org/std/Iptc4xmpExt/2008-02-29/}AOCreator" : "Ayman",
|
||||
"{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}CiUrlWork" : "01865 513465",
|
||||
"{http://ns.adobe.com/photoshop/1.0/}Instructions" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorName" : "Alfresco",
|
||||
"{http://ns.useplus.org/ldf/xmp/1.0/}LicensorURL" : "www.alfresco.com"
|
||||
}
|
BIN
engines/tika/src/test/resources/dmsu1332-reproduced.xlsx
Normal file
BIN
engines/tika/src/test/resources/dmsu1332-reproduced.xlsx
Normal file
Binary file not shown.
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/content/1.0}created" : "2016-03-29T21:01:55Z",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : "Udintsev, Anton (external - Project)",
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : null
|
||||
}
|
22
engines/tika/src/test/resources/engine_config_complete.json
Normal file
22
engines/tika/src/test/resources/engine_config_complete.json
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"transformOptions": {
|
||||
"engineXOptions": [
|
||||
{"value": {"name": "page"}},
|
||||
{"value": {"name": "width"}},
|
||||
{"group": {"transformOptions": [
|
||||
{"value": {"name": "cropGravity"}}
|
||||
]}}
|
||||
]
|
||||
},
|
||||
"transformers": [
|
||||
{
|
||||
"transformerName": "engineX",
|
||||
"supportedSourceAndTargetList": [
|
||||
{"sourceMediaType": "application/pdf", "targetMediaType": "image/png" }
|
||||
],
|
||||
"transformOptions": [
|
||||
"engineXOptions"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"transformOptions": {},
|
||||
"transformers": [
|
||||
{
|
||||
"supportedSourceAndTargetList": [
|
||||
{"sourceMediaType": "application/pdf", "targetMediaType": "image/png" }
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"transformers": [
|
||||
{
|
||||
"transformerName": "engineX",
|
||||
"supportedSourceAndTargetList": [
|
||||
{"sourceMediaType": "application/pdf", "targetMediaType": "image/png" }
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"transformOptions": {
|
||||
"engineXOptions": [
|
||||
{"value": {"name": "page"}},
|
||||
{"value": {"name": "page"}},
|
||||
{"value": {"name": "width"}},
|
||||
{"group": {"transformOptions": [
|
||||
{"value": {"name": "cropGravity"}}
|
||||
]}}
|
||||
]
|
||||
},
|
||||
"transformers": [
|
||||
{
|
||||
"transformerName": "engineX",
|
||||
"supportedSourceAndTargetList": [
|
||||
{"sourceMediaType": "application/pdf", "targetMediaType": "image/png" },
|
||||
{"sourceMediaType": "application/pdf", "targetMediaType": "image/png" },
|
||||
{"sourceMediaType": "application/pdf", "targetMediaType": "image/png" }
|
||||
],
|
||||
"transformOptions": [
|
||||
"engineXOptions",
|
||||
"engineXOptions"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
BIN
engines/tika/src/test/resources/problemFootnotes2.docx
Normal file
BIN
engines/tika/src/test/resources/problemFootnotes2.docx
Normal file
Binary file not shown.
BIN
engines/tika/src/test/resources/quick.3g2
Normal file
BIN
engines/tika/src/test/resources/quick.3g2
Normal file
Binary file not shown.
9
engines/tika/src/test/resources/quick.3g2_metadata.json
Normal file
9
engines/tika/src/test/resources/quick.3g2_metadata.json
Normal file
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/content/1.0}description" : null,
|
||||
"{http://www.alfresco.org/model/audio/1.0}releaseDate" : null,
|
||||
"{http://www.alfresco.org/model/content/1.0}created" : null,
|
||||
"{http://www.alfresco.org/model/audio/1.0}sampleRate" : "8000",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : null,
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : null,
|
||||
"{http://www.alfresco.org/model/audio/1.0}channelType" : "Stereo"
|
||||
}
|
BIN
engines/tika/src/test/resources/quick.3gp
Normal file
BIN
engines/tika/src/test/resources/quick.3gp
Normal file
Binary file not shown.
9
engines/tika/src/test/resources/quick.3gp_metadata.json
Normal file
9
engines/tika/src/test/resources/quick.3gp_metadata.json
Normal file
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/content/1.0}description" : null,
|
||||
"{http://www.alfresco.org/model/audio/1.0}releaseDate" : null,
|
||||
"{http://www.alfresco.org/model/content/1.0}created" : null,
|
||||
"{http://www.alfresco.org/model/audio/1.0}sampleRate" : "8000",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : null,
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : null,
|
||||
"{http://www.alfresco.org/model/audio/1.0}channelType" : "Stereo"
|
||||
}
|
3888
engines/tika/src/test/resources/quick.ai
Normal file
3888
engines/tika/src/test/resources/quick.ai
Normal file
File diff suppressed because one or more lines are too long
5
engines/tika/src/test/resources/quick.ai_metadata.json
Normal file
5
engines/tika/src/test/resources/quick.ai_metadata.json
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/content/1.0}created" : "2011-05-17T13:34:11Z",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : "Nevin Nollop",
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : "test file cs5"
|
||||
}
|
BIN
engines/tika/src/test/resources/quick.bmp
Normal file
BIN
engines/tika/src/test/resources/quick.bmp
Normal file
Binary file not shown.
After Width: | Height: | Size: 110 KiB |
6
engines/tika/src/test/resources/quick.bmp_metadata.json
Normal file
6
engines/tika/src/test/resources/quick.bmp_metadata.json
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/exif/1.0}pixelYDimension" : "92",
|
||||
"{http://www.alfresco.org/model/exif/1.0}pixelXDimension" : "409",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : null,
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : null
|
||||
}
|
1
engines/tika/src/test/resources/quick.csv
Normal file
1
engines/tika/src/test/resources/quick.csv
Normal file
@@ -0,0 +1 @@
|
||||
"The quick brown fox jumps over the lazy dog"
|
|
BIN
engines/tika/src/test/resources/quick.doc
Normal file
BIN
engines/tika/src/test/resources/quick.doc
Normal file
Binary file not shown.
7
engines/tika/src/test/resources/quick.doc_metadata.json
Normal file
7
engines/tika/src/test/resources/quick.doc_metadata.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/content/1.0}modified" : "2005-09-20T17:25:00Z",
|
||||
"{http://www.alfresco.org/model/content/1.0}description" : "Gym class featuring a brown fox and lazy dog",
|
||||
"{http://www.alfresco.org/model/content/1.0}created" : "2005-05-26T12:57:00Z",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : "Nevin Nollop",
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : "The quick brown fox jumps over the lazy dog"
|
||||
}
|
BIN
engines/tika/src/test/resources/quick.docx
Normal file
BIN
engines/tika/src/test/resources/quick.docx
Normal file
Binary file not shown.
6
engines/tika/src/test/resources/quick.docx_metadata.json
Normal file
6
engines/tika/src/test/resources/quick.docx_metadata.json
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/content/1.0}description" : "Gym class featuring a brown fox and lazy dog",
|
||||
"{http://www.alfresco.org/model/content/1.0}created" : "2010-01-06T17:32:00Z",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : "Nevin Nollop",
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : "The quick brown fox jumps over the lazy dog"
|
||||
}
|
BIN
engines/tika/src/test/resources/quick.flv
Normal file
BIN
engines/tika/src/test/resources/quick.flv
Normal file
Binary file not shown.
4
engines/tika/src/test/resources/quick.flv_metadata.json
Normal file
4
engines/tika/src/test/resources/quick.flv_metadata.json
Normal file
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : null,
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : null
|
||||
}
|
BIN
engines/tika/src/test/resources/quick.gif
Normal file
BIN
engines/tika/src/test/resources/quick.gif
Normal file
Binary file not shown.
After Width: | Height: | Size: 11 KiB |
6
engines/tika/src/test/resources/quick.gif_metadata.json
Normal file
6
engines/tika/src/test/resources/quick.gif_metadata.json
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/exif/1.0}pixelYDimension" : "92",
|
||||
"{http://www.alfresco.org/model/exif/1.0}pixelXDimension" : "409",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : null,
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : null
|
||||
}
|
17
engines/tika/src/test/resources/quick.html
Normal file
17
engines/tika/src/test/resources/quick.html
Normal file
@@ -0,0 +1,17 @@
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
|
||||
<title>The quick brown fox jumps over the lazy dog</title>
|
||||
<meta name="author" content="Nevin Nollop">
|
||||
<meta name="keywords" content="Pangram, fox, dog">
|
||||
<meta name="description" content="Gym class featuring a brown fox and lazy dog">
|
||||
</head>
|
||||
|
||||
<body lang=EN-US>
|
||||
|
||||
The quick brown fox jumps over the lazy dog
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
BIN
engines/tika/src/test/resources/quick.jar
Normal file
BIN
engines/tika/src/test/resources/quick.jar
Normal file
Binary file not shown.
31
engines/tika/src/test/resources/quick.java
Normal file
31
engines/tika/src/test/resources/quick.java
Normal file
@@ -0,0 +1,31 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
|
||||
public class quick
|
||||
{
|
||||
|
||||
}
|
BIN
engines/tika/src/test/resources/quick.jpg
Normal file
BIN
engines/tika/src/test/resources/quick.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 26 KiB |
6
engines/tika/src/test/resources/quick.jpg_metadata.json
Normal file
6
engines/tika/src/test/resources/quick.jpg_metadata.json
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/exif/1.0}pixelYDimension" : "92",
|
||||
"{http://www.alfresco.org/model/exif/1.0}pixelXDimension" : "409",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : null,
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : null
|
||||
}
|
BIN
engines/tika/src/test/resources/quick.key
Normal file
BIN
engines/tika/src/test/resources/quick.key
Normal file
Binary file not shown.
4
engines/tika/src/test/resources/quick.key_metadata.json
Normal file
4
engines/tika/src/test/resources/quick.key_metadata.json
Normal file
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : null,
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : null
|
||||
}
|
BIN
engines/tika/src/test/resources/quick.m4a
Normal file
BIN
engines/tika/src/test/resources/quick.m4a
Normal file
Binary file not shown.
13
engines/tika/src/test/resources/quick.m4a_metadata.json
Normal file
13
engines/tika/src/test/resources/quick.m4a_metadata.json
Normal file
@@ -0,0 +1,13 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/audio/1.0}compressor" : "M4A",
|
||||
"{http://www.alfresco.org/model/audio/1.0}artist" : "Hauskaz",
|
||||
"{http://www.alfresco.org/model/audio/1.0}genre" : "Foxtrot",
|
||||
"{http://www.alfresco.org/model/content/1.0}description" : "The quick brown fox jumps over the lazy dog - About a dog and a fox (Hauskaz)",
|
||||
"{http://www.alfresco.org/model/audio/1.0}releaseDate" : 1230768000000,
|
||||
"{http://www.alfresco.org/model/audio/1.0}channelType" : "Stereo",
|
||||
"{http://www.alfresco.org/model/content/1.0}created" : 1230768000000,
|
||||
"{http://www.alfresco.org/model/audio/1.0}sampleRate" : "44100",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : "Hauskaz",
|
||||
"{http://www.alfresco.org/model/audio/1.0}album" : "About a dog and a fox",
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : "The quick brown fox jumps over the lazy dog"
|
||||
}
|
BIN
engines/tika/src/test/resources/quick.m4v
Normal file
BIN
engines/tika/src/test/resources/quick.m4v
Normal file
Binary file not shown.
9
engines/tika/src/test/resources/quick.m4v_metadata.json
Normal file
9
engines/tika/src/test/resources/quick.m4v_metadata.json
Normal file
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/content/1.0}description" : null,
|
||||
"{http://www.alfresco.org/model/audio/1.0}releaseDate" : null,
|
||||
"{http://www.alfresco.org/model/content/1.0}created" : null,
|
||||
"{http://www.alfresco.org/model/audio/1.0}sampleRate" : "22050",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : null,
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : null,
|
||||
"{http://www.alfresco.org/model/audio/1.0}channelType" : "Stereo"
|
||||
}
|
BIN
engines/tika/src/test/resources/quick.mov
Normal file
BIN
engines/tika/src/test/resources/quick.mov
Normal file
Binary file not shown.
9
engines/tika/src/test/resources/quick.mov_metadata.json
Normal file
9
engines/tika/src/test/resources/quick.mov_metadata.json
Normal file
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/content/1.0}description" : null,
|
||||
"{http://www.alfresco.org/model/audio/1.0}releaseDate" : null,
|
||||
"{http://www.alfresco.org/model/content/1.0}created" : null,
|
||||
"{http://www.alfresco.org/model/audio/1.0}sampleRate" : "22050",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : null,
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : null,
|
||||
"{http://www.alfresco.org/model/audio/1.0}channelType" : "Mono"
|
||||
}
|
BIN
engines/tika/src/test/resources/quick.mp3
Normal file
BIN
engines/tika/src/test/resources/quick.mp3
Normal file
Binary file not shown.
13
engines/tika/src/test/resources/quick.mp3_metadata.json
Normal file
13
engines/tika/src/test/resources/quick.mp3_metadata.json
Normal file
@@ -0,0 +1,13 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/audio/1.0}compressor" : "MP3",
|
||||
"{http://www.alfresco.org/model/audio/1.0}artist" : "Hauskaz",
|
||||
"{http://www.alfresco.org/model/audio/1.0}genre" : "Foxtrot",
|
||||
"{http://www.alfresco.org/model/content/1.0}description" : "The quick brown fox jumps over the lazy dog - About a dog and a fox (Hauskaz)",
|
||||
"{http://www.alfresco.org/model/audio/1.0}releaseDate" : 1230768000000,
|
||||
"{http://www.alfresco.org/model/audio/1.0}channelType" : "Stereo",
|
||||
"{http://www.alfresco.org/model/content/1.0}created" : 1230768000000,
|
||||
"{http://www.alfresco.org/model/audio/1.0}sampleRate" : "44100",
|
||||
"{http://www.alfresco.org/model/content/1.0}author" : "Hauskaz",
|
||||
"{http://www.alfresco.org/model/audio/1.0}album" : "About a dog and a fox",
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : "The quick brown fox jumps over the lazy dog"
|
||||
}
|
BIN
engines/tika/src/test/resources/quick.mp4
Normal file
BIN
engines/tika/src/test/resources/quick.mp4
Normal file
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user