headers)
{
- putRawValue(KEY_ORIGINATOR, metadata.get(Metadata.AUTHOR), properties);
- putRawValue(KEY_SUBJECT, metadata.get(Metadata.TITLE), properties);
- putRawValue(KEY_DESCRIPTION, metadata.get(Metadata.SUBJECT), properties);
- putRawValue(KEY_SENT_DATE, metadata.get(Metadata.LAST_SAVED), properties);
+ putRawValue(KEY_ORIGINATOR, metadata.get(TikaCoreProperties.CREATOR), properties);
+ putRawValue(KEY_SUBJECT, metadata.get(TikaCoreProperties.TITLE), properties);
+ putRawValue(KEY_DESCRIPTION, metadata.get(TikaCoreProperties.SUBJECT), properties);
+ putRawValue(KEY_SENT_DATE, metadata.get(TikaCoreProperties.MODIFIED), properties);
// Store the TO, but not cc/bcc in the addressee field
- putRawValue(KEY_ADDRESSEE, metadata.get(Metadata.MESSAGE_TO), properties);
+ putRawValue(KEY_ADDRESSEE, metadata.get(Message.MESSAGE_TO), properties);
// Store each of To, CC and BCC in their own fields
- putRawValue(KEY_TO_NAMES, metadata.getValues(Metadata.MESSAGE_TO), properties);
- putRawValue(KEY_CC_NAMES, metadata.getValues(Metadata.MESSAGE_CC), properties);
- putRawValue(KEY_BCC_NAMES, metadata.getValues(Metadata.MESSAGE_BCC), properties);
+ putRawValue(KEY_TO_NAMES, metadata.getValues(Message.MESSAGE_TO), properties);
+ putRawValue(KEY_CC_NAMES, metadata.getValues(Message.MESSAGE_CC), properties);
+ putRawValue(KEY_BCC_NAMES, metadata.getValues(Message.MESSAGE_BCC), properties);
// But store all email addresses (to/cc/bcc) in the addresses field
- putRawValue(KEY_ADDRESSEES, metadata.getValues(Metadata.MESSAGE_RECIPIENT_ADDRESS), properties);
+ putRawValue(KEY_ADDRESSEES, metadata.getValues(Message.MESSAGE_RECIPIENT_ADDRESS), properties);
return properties;
}
diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/OfficeMetadataExtractor.java b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/OfficeMetadataExtractor.java
index dfca577e..7612a386 100644
--- a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/OfficeMetadataExtractor.java
+++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/OfficeMetadataExtractor.java
@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
- * Copyright (C) 2005 - 2020 Alfresco Software Limited
+ * Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -27,6 +27,8 @@
package org.alfresco.transformer.metadataExtractors;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.slf4j.Logger;
@@ -40,7 +42,7 @@ import java.util.Map;
*
* Configuration: (see OfficeMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
*
- * This extracter uses the POI library to extract the following:
+ * This extractor uses the POI library to extract the following:
*
* author: -- cm:author
* title: -- cm:title
@@ -91,23 +93,20 @@ public class OfficeMetadataExtractor extends AbstractTikaMetadataExtractor
return new OfficeParser();
}
- @SuppressWarnings("deprecation")
@Override
protected Map extractSpecific(Metadata metadata,
Map properties, Map headers)
{
- putRawValue(KEY_CREATE_DATETIME, metadata.get(Metadata.CREATION_DATE), properties);
- putRawValue(KEY_LAST_SAVE_DATETIME, metadata.get(Metadata.LAST_SAVED), properties);
- putRawValue(KEY_EDIT_TIME, metadata.get(Metadata.EDIT_TIME), properties);
- putRawValue(KEY_FORMAT, metadata.get(Metadata.FORMAT), properties);
- putRawValue(KEY_KEYWORDS, metadata.get(Metadata.KEYWORDS), properties);
- putRawValue(KEY_LAST_AUTHOR, metadata.get(Metadata.LAST_AUTHOR), properties);
- putRawValue(KEY_LAST_PRINTED, metadata.get(Metadata.LAST_PRINTED), properties);
-// putRawValue(KEY_OS_VERSION, metadata.get(Metadata.OS_VERSION), properties);
-// putRawValue(KEY_THUMBNAIL, metadata.get(Metadata.THUMBNAIL), properties);
- putRawValue(KEY_PAGE_COUNT, metadata.get(Metadata.PAGE_COUNT), properties);
- putRawValue(KEY_PARAGRAPH_COUNT, metadata.get(Metadata.PARAGRAPH_COUNT), properties);
- putRawValue(KEY_WORD_COUNT, metadata.get(Metadata.WORD_COUNT), properties);
+ putRawValue(KEY_CREATE_DATETIME, metadata.get(TikaCoreProperties.CREATED), properties);
+ putRawValue(KEY_LAST_SAVE_DATETIME, metadata.get(TikaCoreProperties.MODIFIED), properties);
+ putRawValue(KEY_EDIT_TIME, metadata.get(org.apache.tika.metadata.OfficeOpenXMLExtended.TOTAL_TIME), properties); // edit time, not the last-save date already stored above
+ putRawValue(KEY_FORMAT, metadata.get(TikaCoreProperties.FORMAT), properties);
+ putRawValue(KEY_KEYWORDS, metadata.get(TikaCoreProperties.SUBJECT), properties);
+ putRawValue(KEY_LAST_AUTHOR, metadata.get(TikaCoreProperties.MODIFIER), properties);
+ putRawValue(KEY_LAST_PRINTED, metadata.get(TikaCoreProperties.PRINT_DATE), properties);
+ putRawValue(KEY_PAGE_COUNT, metadata.get(Office.PAGE_COUNT), properties);
+ putRawValue(KEY_PARAGRAPH_COUNT, metadata.get(Office.PARAGRAPH_COUNT), properties);
+ putRawValue(KEY_WORD_COUNT, metadata.get(Office.WORD_COUNT), properties);
return properties;
}
}
diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/OpenDocumentMetadataExtractor.java b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/OpenDocumentMetadataExtractor.java
index 4de536da..8014802b 100644
--- a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/OpenDocumentMetadataExtractor.java
+++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/OpenDocumentMetadataExtractor.java
@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
- * Copyright (C) 2005 - 2020 Alfresco Software Limited
+ * Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -26,18 +26,28 @@
*/
package org.alfresco.transformer.metadataExtractors;
+import static org.apache.tika.metadata.DublinCore.NAMESPACE_URI_DC;
+
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.odf.OpenDocumentMetaParser;
import org.apache.tika.parser.odf.OpenDocumentParser;
+import org.apache.tika.parser.xml.ElementMetadataHandler;
+import org.apache.tika.sax.TeeContentHandler;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
import java.io.Serializable;
import java.util.Date;
+import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.stream.Collectors;
/**
* {@code "application/vnd.oasis.opendocument..."} and {@code "applicationvnd.oasis.opendocument..."} metadata extractor.
@@ -77,6 +87,7 @@ public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
private static final String KEY_INITIAL_CREATOR = "initialCreator";
private static final String KEY_KEYWORD = "keyword";
private static final String KEY_LANGUAGE = "language";
+ private static final String KEY_ALFRESCO_CREATOR = "_alfresco:creator";
private static final String CUSTOM_PREFIX = "custom:";
@@ -90,22 +101,33 @@ public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
@Override
protected Parser getParser()
{
- return new OpenDocumentParser();
+ OpenDocumentParser parser = new OpenDocumentParser();
+ parser.setMetaParser(new OpenDocumentMetaParser() { // replace the meta parser with one that adds an extra handler
+ @Override
+ protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context)
+ {
+ final ContentHandler superHandler = super.getContentHandler(ch, md, context); // keep the stock metadata handling
+ final ContentHandler creatorHandler = new ElementMetadataHandler(NAMESPACE_URI_DC, KEY_CREATOR, md, KEY_ALFRESCO_CREATOR); // presumably stores the DC-namespace KEY_CREATOR element under KEY_ALFRESCO_CREATOR -- confirm against Tika docs
+ return new TeeContentHandler(superHandler, creatorHandler); // combine both handlers into one
+ }
+ });
+ return parser;
}
- @SuppressWarnings("deprecation")
@Override
protected Map extractSpecific(Metadata metadata,
Map properties, Map headers)
{
- putRawValue(KEY_CREATION_DATE, getDateOrNull(metadata.get(Metadata.CREATION_DATE)), properties);
- putRawValue(KEY_CREATOR, metadata.get(Metadata.CREATOR), properties);
- putRawValue(KEY_DATE, getDateOrNull(metadata.get(Metadata.DATE)), properties);
- putRawValue(KEY_DESCRIPTION, metadata.get(Metadata.DESCRIPTION), properties);
+ putRawValue(KEY_CREATION_DATE, getDateOrNull(metadata.get(TikaCoreProperties.CREATED)), properties);
+ final String creator = getCreator(metadata);
+ putRawValue(KEY_CREATOR, creator, properties);
+ putRawValue(KEY_AUTHOR, creator, properties);
+ putRawValue(KEY_DATE, getDateOrNull(metadata.get(TikaCoreProperties.MODIFIED)), properties);
+ putRawValue(KEY_DESCRIPTION, metadata.get(TikaCoreProperties.DESCRIPTION), properties);
putRawValue(KEY_GENERATOR, metadata.get("generator"), properties);
putRawValue(KEY_INITIAL_CREATOR, metadata.get("initial-creator"), properties);
- putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), properties);
- putRawValue(KEY_LANGUAGE, metadata.get(Metadata.LANGUAGE), properties);
+ putRawValue(KEY_KEYWORD, metadata.get(TikaCoreProperties.SUBJECT), properties);
+ putRawValue(KEY_LANGUAGE, metadata.get(TikaCoreProperties.LANGUAGE), properties);
// Handle user-defined properties dynamically
Map> mapping = super.getExtractMapping();
@@ -120,6 +142,18 @@ public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
return properties;
}
+ private String getCreator(Metadata metadata) // picks the creator value to report for the document
+ {
+ final List creators = distinct(metadata.getValues(TikaCoreProperties.CREATOR)) // distinct() is defined outside this hunk; presumably de-duplicates the values
+ .collect(Collectors.toUnmodifiableList());
+ if (creators.size() == 1) // exactly one creator value: use it directly
+ {
+ return creators.get(0);
+ }
+
+ return metadata.get(KEY_ALFRESCO_CREATOR); // none or several: fall back to the value stored under KEY_ALFRESCO_CREATOR
+ }
+
private Date getDateOrNull(String dateString)
{
if (dateString != null && dateString.length() != 0)
diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/TikaAudioMetadataExtractor.java b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/TikaAudioMetadataExtractor.java
index 1a8a4a84..e7933ef3 100644
--- a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/TikaAudioMetadataExtractor.java
+++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/TikaAudioMetadataExtractor.java
@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
- * Copyright (C) 2005 - 2020 Alfresco Software Limited
+ * Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -28,6 +28,7 @@ package org.alfresco.transformer.metadataExtractors;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMPDM;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.Parser;
@@ -148,13 +149,12 @@ public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractor
* @param metadata the metadata extracted from the file
* @return the description
*/
- @SuppressWarnings("deprecation")
private String generateDescription(Metadata metadata)
{
StringBuilder result = new StringBuilder();
- if (metadata.get(Metadata.TITLE) != null)
+ if (metadata.get(TikaCoreProperties.TITLE) != null)
{
- result.append(metadata.get(Metadata.TITLE));
+ result.append(metadata.get(TikaCoreProperties.TITLE));
if (metadata.get(XMPDM.ALBUM) != null)
{
result
diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/tika/parsers/ExifToolParser.java b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/tika/parsers/ExifToolParser.java
index e43677a4..9e15731e 100644
--- a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/tika/parsers/ExifToolParser.java
+++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/tika/parsers/ExifToolParser.java
@@ -44,9 +44,9 @@ import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.output.NullOutputStream;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.IOUtils;
-import org.apache.tika.io.NullOutputStream;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -57,7 +57,7 @@ import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.external.ExternalParsersFactory;
import org.apache.tika.parser.image.ImageParser;
import org.apache.tika.parser.image.TiffParser;
-import org.apache.tika.parser.jpeg.JpegParser;
+import org.apache.tika.parser.image.JpegParser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -263,7 +263,7 @@ public class ExifToolParser extends ExternalParser {
* stream of the given process to the given XHTML content handler.
* The standard output stream is closed once fully processed.
*
- * @param process process
+ * @param stream stream of the process to forward to the XHTML content handler
* @param xhtml XHTML content handler
* @throws SAXException if the XHTML SAX events could not be handled
* @throws IOException if an input error occurred
@@ -315,13 +315,13 @@ public class ExifToolParser extends ExternalParser {
* standard stream of the given process. Potential exceptions
* are ignored, and the stream is closed once fully processed.
*
- * @param process process
+ * @param stream stream of the process to read and discard
*/
private void ignoreStream(final InputStream stream) {
Thread t = new Thread() {
public void run() {
try {
- IOUtils.copy(stream, new NullOutputStream());
+ IOUtils.copy(stream, NullOutputStream.NULL_OUTPUT_STREAM);
} catch (IOException e) {
} finally {
IOUtils.closeQuietly(stream);
diff --git a/pom.xml b/pom.xml
index e0916686..edef853b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -27,7 +27,7 @@
${dependency.jackson.version}
4.13.2
3.5.0
- 1.26
+ 2.1.0
4.1.2
1.4