Restore ATS-969 Tika upgrade 1.x -> 2.x (#493)

This reverts commit 9776577a452444dad634117d349635604fa9a9a8.

It was not possible to perform the release of 2.5.5-A1 with this upgrade of Tika. This was possibly related to the upgrade forcing a change in the following files, which were then deleted in the build:
D	alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/license/THIRD-PARTY.properties
D	alfresco-transform-core-aio/alfresco-transform-core-aio/src/license/THIRD-PARTY.properties
D	alfresco-transform-tika/alfresco-transform-tika-boot/src/license/THIRD-PARTY.properties
D	alfresco-transform-tika/alfresco-transform-tika/src/license/THIRD-PARTY.properties
alandavis 2022-01-05 21:56:41 +00:00
parent 2fd8361a78
commit 1cd673de63
16 changed files with 135 additions and 108 deletions

View File

@@ -65,13 +65,9 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>${dependency.tika.version}</version>
<exclusions>
<exclusion>
<groupId>com.tdunning</groupId>
<artifactId>json</artifactId>
</exclusion>
<exclusion>
<groupId>org.bouncycastle</groupId>
<artifactId>bcprov-jdk15on</artifactId>
@@ -80,10 +76,9 @@
<groupId>org.bouncycastle</groupId>
<artifactId>bcmail-jdk15on</artifactId>
</exclusion>
<!-- TODO ATS-534 check transformations not affected by this missing quartz lib -->
<exclusion>
<groupId>org.quartz-scheduler</groupId>
<artifactId>quartz</artifactId>
<groupId>xml-apis</groupId>
<artifactId>xml-apis</artifactId>
</exclusion>
</exclusions>
</dependency>
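
For context, a minimal sketch (not part of this commit) of how one might smoke-test that the tika-parsers-standard-package artifact above still registers the parsers the transformers rely on after the exclusions; the class name and the media types checked are illustrative assumptions.

import java.util.Set;

import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;

public class TikaParserSmokeCheck
{
    public static void main(String[] args)
    {
        // AutoDetectParser is built from the parsers registered on the classpath,
        // so the standard package's parsers show up in its supported types.
        Set<MediaType> types = new AutoDetectParser().getSupportedTypes(new ParseContext());
        System.out.println("audio/mpeg supported: " + types.contains(MediaType.parse("audio/mpeg")));
        System.out.println("application/msword supported: " + types.contains(MediaType.parse("application/msword")));
    }
}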

View File

@@ -2,7 +2,8 @@
"{http://www.alfresco.org/model/content/1.0}description" : null,
"{http://www.alfresco.org/model/audio/1.0}releaseDate" : null,
"{http://www.alfresco.org/model/content/1.0}created" : null,
"{http://www.alfresco.org/model/audio/1.0}sampleRate" : "1000",
"{http://www.alfresco.org/model/audio/1.0}sampleRate" : "8000",
"{http://www.alfresco.org/model/content/1.0}author" : null,
"{http://www.alfresco.org/model/content/1.0}title" : null
"{http://www.alfresco.org/model/content/1.0}title" : null,
"{http://www.alfresco.org/model/audio/1.0}channelType" : "Stereo"
}

View File

@@ -2,7 +2,8 @@
"{http://www.alfresco.org/model/content/1.0}description" : null,
"{http://www.alfresco.org/model/audio/1.0}releaseDate" : null,
"{http://www.alfresco.org/model/content/1.0}created" : null,
"{http://www.alfresco.org/model/audio/1.0}sampleRate" : "1000",
"{http://www.alfresco.org/model/audio/1.0}sampleRate" : "8000",
"{http://www.alfresco.org/model/content/1.0}author" : null,
"{http://www.alfresco.org/model/content/1.0}title" : null
"{http://www.alfresco.org/model/content/1.0}title" : null,
"{http://www.alfresco.org/model/audio/1.0}channelType" : "Stereo"
}

View File

@@ -2,7 +2,8 @@
"{http://www.alfresco.org/model/content/1.0}description" : null,
"{http://www.alfresco.org/model/audio/1.0}releaseDate" : null,
"{http://www.alfresco.org/model/content/1.0}created" : null,
"{http://www.alfresco.org/model/audio/1.0}sampleRate" : "1000",
"{http://www.alfresco.org/model/audio/1.0}sampleRate" : "22050",
"{http://www.alfresco.org/model/content/1.0}author" : null,
"{http://www.alfresco.org/model/content/1.0}title" : null
"{http://www.alfresco.org/model/content/1.0}title" : null,
"{http://www.alfresco.org/model/audio/1.0}channelType" : "Stereo"
}

View File

@@ -2,7 +2,8 @@
"{http://www.alfresco.org/model/content/1.0}description" : null,
"{http://www.alfresco.org/model/audio/1.0}releaseDate" : null,
"{http://www.alfresco.org/model/content/1.0}created" : null,
"{http://www.alfresco.org/model/audio/1.0}sampleRate" : "1000",
"{http://www.alfresco.org/model/audio/1.0}sampleRate" : "22050",
"{http://www.alfresco.org/model/content/1.0}author" : null,
"{http://www.alfresco.org/model/content/1.0}title" : null
"{http://www.alfresco.org/model/content/1.0}title" : null,
"{http://www.alfresco.org/model/audio/1.0}channelType" : "Mono"
}

View File

@@ -2,7 +2,8 @@
"{http://www.alfresco.org/model/content/1.0}description" : null,
"{http://www.alfresco.org/model/audio/1.0}releaseDate" : null,
"{http://www.alfresco.org/model/content/1.0}created" : null,
"{http://www.alfresco.org/model/audio/1.0}sampleRate" : "90000",
"{http://www.alfresco.org/model/audio/1.0}sampleRate" : "22050",
"{http://www.alfresco.org/model/content/1.0}author" : null,
"{http://www.alfresco.org/model/content/1.0}title" : null
"{http://www.alfresco.org/model/content/1.0}title" : null,
"{http://www.alfresco.org/model/audio/1.0}channelType" : "Mono"
}

View File

@@ -27,13 +27,9 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>${dependency.tika.version}</version>
<exclusions>
<exclusion>
<groupId>com.tdunning</groupId>
<artifactId>json</artifactId>
</exclusion>
<exclusion>
<groupId>org.bouncycastle</groupId>
<artifactId>bcprov-jdk15on</artifactId>
@@ -42,11 +38,6 @@
<groupId>org.bouncycastle</groupId>
<artifactId>bcmail-jdk15on</artifactId>
</exclusion>
<!-- TODO ATS-534 check transformations not affected by this missing quartz lib -->
<exclusion>
<groupId>org.quartz-scheduler</groupId>
<artifactId>quartz</artifactId>
</exclusion>
<exclusion>
<groupId>xml-apis</groupId>
<artifactId>xml-apis</artifactId>

View File

@@ -28,7 +28,11 @@ package org.alfresco.transformer.metadataExtractors;
import org.apache.tika.embedder.Embedder;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.ContentHandlerDecorator;
@@ -58,7 +62,10 @@ import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* The parent of all Metadata Extractors which use Apache Tika under the hood. This handles all the
@@ -83,7 +90,7 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
protected static final String KEY_CREATED = "created";
protected static final String KEY_DESCRIPTION = "description";
protected static final String KEY_COMMENTS = "comments";
protected static final String KEY_TAGS = "dc:subject";
protected static final String KEY_TAGS = DublinCore.SUBJECT.getName();
private static final String METADATA_SEPARATOR = ",";
@@ -208,7 +215,6 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
}
@Override
@SuppressWarnings( "deprecation" )
public Map<String, Serializable> extractMetadata(String sourceMimetype, Map<String, String> transformOptions,
File sourceFile) throws Exception
{
@@ -245,7 +251,7 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
for (String tikaKey : metadata.names())
{
// TODO review this change (part of MNT-15267) - should we really force string concatenation here !?
putRawValue(tikaKey, getMetadataValue(metadata, tikaKey), rawProperties);
putRawValue(tikaKey, getMetadataValue(metadata, Property.internalText(tikaKey)), rawProperties);
}
// Now, map the common Tika metadata keys onto
@@ -254,17 +260,17 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
// to work without needing any changes
// The simple ones
putRawValue(KEY_AUTHOR, getMetadataValue(metadata, Metadata.AUTHOR), rawProperties);
putRawValue(KEY_TITLE, getMetadataValue(metadata, Metadata.TITLE), rawProperties);
putRawValue(KEY_COMMENTS, getMetadataValue(metadata, Metadata.COMMENTS), rawProperties);
putRawValue(KEY_AUTHOR, getMetadataValue(metadata, TikaCoreProperties.CREATOR), rawProperties);
putRawValue(KEY_TITLE, getMetadataValue(metadata, TikaCoreProperties.TITLE), rawProperties);
putRawValue(KEY_COMMENTS, getMetadataValue(metadata, TikaCoreProperties.COMMENTS), rawProperties);
// Tags
putRawValue(KEY_TAGS, getMetadataValues(metadata, KEY_TAGS), rawProperties);
// Get the subject and description, despite things not
// being nearly as consistent as one might hope
String subject = getMetadataValue(metadata, Metadata.SUBJECT);
String description = getMetadataValue(metadata, Metadata.DESCRIPTION);
String subject = getMetadataValue(metadata, OfficeOpenXMLCore.SUBJECT);
String description = getMetadataValue(metadata, TikaCoreProperties.DESCRIPTION);
if(subject != null && description != null)
{
putRawValue(KEY_DESCRIPTION, description, rawProperties);
@@ -282,13 +288,13 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
}
// Try for the dates two different ways too
if(metadata.get(Metadata.CREATION_DATE) != null)
if(metadata.get(TikaCoreProperties.CREATED) != null)
{
putRawValue(KEY_CREATED, metadata.get(Metadata.CREATION_DATE), rawProperties);
putRawValue(KEY_CREATED, metadata.get(TikaCoreProperties.CREATED), rawProperties);
}
else if(metadata.get(Metadata.DATE) != null)
else if(metadata.get(TikaCoreProperties.MODIFIED) != null)
{
putRawValue(KEY_CREATED, metadata.get(Metadata.DATE), rawProperties);
putRawValue(KEY_CREATED, metadata.get(TikaCoreProperties.MODIFIED), rawProperties);
}
// If people created a specific instance
@@ -388,24 +394,11 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
return values.length == 0 ? null : (values.length == 1 ? values[0] : values);
}
private String getMetadataValue(Metadata metadata, String key)
private String getMetadataValue(Metadata metadata, Property key)
{
if (metadata.isMultiValued(key))
{
String[] parts = metadata.getValues(key);
// use Set to prevent duplicates
Set<String> value = new LinkedHashSet<>(parts.length);
for (int i = 0; i < parts.length; i++)
{
value.add(parts[i]);
}
String valueStr = value.toString();
// remove leading/trailing braces []
return valueStr.substring(1, valueStr.length() - 1);
return distinct(metadata.getValues(key)).collect(Collectors.joining(", "));
}
else
{
@@ -413,6 +406,15 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
}
}
protected static Stream<String> distinct(final String[] strings)
{
return Stream.of(strings)
.filter(Objects::nonNull)
.map(String::strip)
.filter(s -> !s.isEmpty())
.distinct();
}
/**
* This content handler will capture entries from within
* the header of the Tika content XHTML, but ignore the

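As an aside, a small self-contained sketch (invented values, not part of the diff) of the Property-based lookup and the de-duplication that the new getMetadataValue/distinct pair above performs:

import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;

public class MultiValueJoinExample
{
    public static void main(String[] args)
    {
        Metadata metadata = new Metadata();
        metadata.add(TikaCoreProperties.CREATOR, "Alice");
        metadata.add(TikaCoreProperties.CREATOR, "Alice"); // duplicate, dropped by distinct()
        metadata.add(TikaCoreProperties.CREATOR, "Bob");

        // Same joining logic as the extractor: strip, drop blanks, de-duplicate, comma-join.
        String joined = Stream.of(metadata.getValues(TikaCoreProperties.CREATOR))
                .filter(Objects::nonNull)
                .map(String::strip)
                .filter(s -> !s.isEmpty())
                .distinct()
                .collect(Collectors.joining(", "));

        System.out.println(joined); // Alice, Bob
    }
}
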
View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -27,6 +27,7 @@
package org.alfresco.transformer.metadataExtractors;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.dwg.DWGParser;
import org.slf4j.Logger;
@@ -64,13 +65,12 @@ public class DWGMetadataExtractor extends AbstractTikaMetadataExtractor
super(logger);
}
@SuppressWarnings("deprecation")
@Override
protected Map<String, Serializable> extractSpecific(Metadata metadata,
Map<String, Serializable> properties, Map<String,String> headers)
{
putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), properties);
putRawValue(KEY_LAST_AUTHOR, metadata.get(Metadata.LAST_AUTHOR), properties);
putRawValue(KEY_KEYWORD, metadata.get(TikaCoreProperties.SUBJECT), properties);
putRawValue(KEY_LAST_AUTHOR, metadata.get(TikaCoreProperties.MODIFIED), properties);
return properties;
}

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -27,6 +27,7 @@
package org.alfresco.transformer.metadataExtractors;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMPDM;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.mp3.Mp3Parser;
@@ -86,7 +87,6 @@ public class MP3MetadataExtractor extends TikaAudioMetadataExtractor
return new Mp3Parser();
}
@SuppressWarnings("deprecation")
@Override
protected Map<String, Serializable> extractSpecific(Metadata metadata,
Map<String, Serializable> properties, Map<String,String> headers)
@@ -98,7 +98,7 @@ public class MP3MetadataExtractor extends TikaAudioMetadataExtractor
// We only need these for people who had pre-existing mapping
// properties from before the proper audio model was added
putRawValue(KEY_ALBUM_TITLE, metadata.get(XMPDM.ALBUM), properties);
putRawValue(KEY_SONG_TITLE, metadata.get(Metadata.TITLE), properties);
putRawValue(KEY_SONG_TITLE, metadata.get(TikaCoreProperties.TITLE), properties);
putRawValue(KEY_ARTIST, metadata.get(XMPDM.ARTIST), properties);
putRawValue(KEY_COMMENT, metadata.get(XMPDM.LOG_COMMENT), properties);
putRawValue(KEY_TRACK_NUMBER, metadata.get(XMPDM.TRACK_NUMBER), properties);

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -26,7 +26,9 @@
*/
package org.alfresco.transformer.metadataExtractors;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.slf4j.Logger;
@@ -82,26 +84,25 @@ public class MailMetadataExtractor extends AbstractTikaMetadataExtractor
return new OfficeParser();
}
@SuppressWarnings("deprecation")
@Override
protected Map<String, Serializable> extractSpecific(Metadata metadata,
Map<String, Serializable> properties, Map<String,String> headers)
{
putRawValue(KEY_ORIGINATOR, metadata.get(Metadata.AUTHOR), properties);
putRawValue(KEY_SUBJECT, metadata.get(Metadata.TITLE), properties);
putRawValue(KEY_DESCRIPTION, metadata.get(Metadata.SUBJECT), properties);
putRawValue(KEY_SENT_DATE, metadata.get(Metadata.LAST_SAVED), properties);
putRawValue(KEY_ORIGINATOR, metadata.get(TikaCoreProperties.CREATOR), properties);
putRawValue(KEY_SUBJECT, metadata.get(TikaCoreProperties.TITLE), properties);
putRawValue(KEY_DESCRIPTION, metadata.get(TikaCoreProperties.SUBJECT), properties);
putRawValue(KEY_SENT_DATE, metadata.get(TikaCoreProperties.MODIFIED), properties);
// Store the TO, but not cc/bcc in the addressee field
putRawValue(KEY_ADDRESSEE, metadata.get(Metadata.MESSAGE_TO), properties);
putRawValue(KEY_ADDRESSEE, metadata.get(Message.MESSAGE_TO), properties);
// Store each of To, CC and BCC in their own fields
putRawValue(KEY_TO_NAMES, metadata.getValues(Metadata.MESSAGE_TO), properties);
putRawValue(KEY_CC_NAMES, metadata.getValues(Metadata.MESSAGE_CC), properties);
putRawValue(KEY_BCC_NAMES, metadata.getValues(Metadata.MESSAGE_BCC), properties);
putRawValue(KEY_TO_NAMES, metadata.getValues(Message.MESSAGE_TO), properties);
putRawValue(KEY_CC_NAMES, metadata.getValues(Message.MESSAGE_CC), properties);
putRawValue(KEY_BCC_NAMES, metadata.getValues(Message.MESSAGE_BCC), properties);
// But store all email addresses (to/cc/bcc) in the addresses field
putRawValue(KEY_ADDRESSEES, metadata.getValues(Metadata.MESSAGE_RECIPIENT_ADDRESS), properties);
putRawValue(KEY_ADDRESSEES, metadata.getValues(Message.MESSAGE_RECIPIENT_ADDRESS), properties);
return properties;
}

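For illustration only (the addresses are invented, and it is assumed these Message keys accept multiple values, as the getValues calls above imply), the recipient metadata is now read through org.apache.tika.metadata.Message rather than the deprecated Metadata constants:

import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;

public class MailRecipientsExample
{
    public static void main(String[] args)
    {
        Metadata metadata = new Metadata();
        metadata.add(Message.MESSAGE_TO, "alice@example.com");
        metadata.add(Message.MESSAGE_TO, "bob@example.com");
        metadata.add(Message.MESSAGE_CC, "carol@example.com");

        // First TO recipient vs. all TO recipients, matching the extractor's usage above.
        System.out.println(metadata.get(Message.MESSAGE_TO));              // alice@example.com
        System.out.println(metadata.getValues(Message.MESSAGE_TO).length); // 2
    }
}
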
View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -27,6 +27,8 @@
package org.alfresco.transformer.metadataExtractors;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.slf4j.Logger;
@@ -40,7 +42,7 @@ import java.util.Map;
*
* Configuration: (see OfficeMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
*
* This extracter uses the POI library to extract the following:
* This extractor uses the POI library to extract the following:
* <pre>
* <b>author:</b> -- cm:author
* <b>title:</b> -- cm:title
@@ -91,23 +93,20 @@ public class OfficeMetadataExtractor extends AbstractTikaMetadataExtractor
return new OfficeParser();
}
@SuppressWarnings("deprecation")
@Override
protected Map<String, Serializable> extractSpecific(Metadata metadata,
Map<String, Serializable> properties, Map<String,String> headers)
{
putRawValue(KEY_CREATE_DATETIME, metadata.get(Metadata.CREATION_DATE), properties);
putRawValue(KEY_LAST_SAVE_DATETIME, metadata.get(Metadata.LAST_SAVED), properties);
putRawValue(KEY_EDIT_TIME, metadata.get(Metadata.EDIT_TIME), properties);
putRawValue(KEY_FORMAT, metadata.get(Metadata.FORMAT), properties);
putRawValue(KEY_KEYWORDS, metadata.get(Metadata.KEYWORDS), properties);
putRawValue(KEY_LAST_AUTHOR, metadata.get(Metadata.LAST_AUTHOR), properties);
putRawValue(KEY_LAST_PRINTED, metadata.get(Metadata.LAST_PRINTED), properties);
// putRawValue(KEY_OS_VERSION, metadata.get(Metadata.OS_VERSION), properties);
// putRawValue(KEY_THUMBNAIL, metadata.get(Metadata.THUMBNAIL), properties);
putRawValue(KEY_PAGE_COUNT, metadata.get(Metadata.PAGE_COUNT), properties);
putRawValue(KEY_PARAGRAPH_COUNT, metadata.get(Metadata.PARAGRAPH_COUNT), properties);
putRawValue(KEY_WORD_COUNT, metadata.get(Metadata.WORD_COUNT), properties);
putRawValue(KEY_CREATE_DATETIME, metadata.get(TikaCoreProperties.CREATED), properties);
putRawValue(KEY_LAST_SAVE_DATETIME, metadata.get(TikaCoreProperties.MODIFIED), properties);
putRawValue(KEY_EDIT_TIME, metadata.get(TikaCoreProperties.MODIFIED), properties);
putRawValue(KEY_FORMAT, metadata.get(TikaCoreProperties.FORMAT), properties);
putRawValue(KEY_KEYWORDS, metadata.get(TikaCoreProperties.SUBJECT), properties);
putRawValue(KEY_LAST_AUTHOR, metadata.get(TikaCoreProperties.MODIFIER), properties);
putRawValue(KEY_LAST_PRINTED, metadata.get(TikaCoreProperties.PRINT_DATE), properties);
putRawValue(KEY_PAGE_COUNT, metadata.get(Office.PAGE_COUNT), properties);
putRawValue(KEY_PARAGRAPH_COUNT, metadata.get(Office.PARAGRAPH_COUNT), properties);
putRawValue(KEY_WORD_COUNT, metadata.get(Office.WORD_COUNT), properties);
return properties;
}
}
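
A minimal sketch (invented values, not from this commit) of the Tika 2.x properties that replace the deprecated Metadata constants used here before the upgrade:

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;

public class OfficePropertiesExample
{
    public static void main(String[] args)
    {
        Metadata metadata = new Metadata();
        metadata.set(TikaCoreProperties.MODIFIER, "Jane Doe"); // Tika 1.x: Metadata.LAST_AUTHOR
        metadata.set(Office.PAGE_COUNT, 12);                   // Tika 1.x: Metadata.PAGE_COUNT

        System.out.println(metadata.get(TikaCoreProperties.MODIFIER)); // Jane Doe
        System.out.println(metadata.get(Office.PAGE_COUNT));           // 12
    }
}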

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -26,18 +26,28 @@
*/
package org.alfresco.transformer.metadataExtractors;
import static org.apache.tika.metadata.DublinCore.NAMESPACE_URI_DC;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.odf.OpenDocumentMetaParser;
import org.apache.tika.parser.odf.OpenDocumentParser;
import org.apache.tika.parser.xml.ElementMetadataHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import java.io.Serializable;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
/**
* {@code "application/vnd.oasis.opendocument..."} and {@code "applicationvnd.oasis.opendocument..."} metadata extractor.
@@ -77,6 +87,7 @@ public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
private static final String KEY_INITIAL_CREATOR = "initialCreator";
private static final String KEY_KEYWORD = "keyword";
private static final String KEY_LANGUAGE = "language";
private static final String KEY_ALFRESCO_CREATOR = "_alfresco:creator";
private static final String CUSTOM_PREFIX = "custom:";
@@ -90,22 +101,33 @@ public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
@Override
protected Parser getParser()
{
return new OpenDocumentParser();
OpenDocumentParser parser = new OpenDocumentParser();
parser.setMetaParser(new OpenDocumentMetaParser() {
@Override
protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context)
{
final ContentHandler superHandler = super.getContentHandler(ch, md, context);
final ContentHandler creatorHandler = new ElementMetadataHandler(NAMESPACE_URI_DC, KEY_CREATOR, md, KEY_ALFRESCO_CREATOR);
return new TeeContentHandler(superHandler, creatorHandler);
}
});
return parser;
}
@SuppressWarnings("deprecation")
@Override
protected Map<String, Serializable> extractSpecific(Metadata metadata,
Map<String, Serializable> properties, Map<String, String> headers)
{
putRawValue(KEY_CREATION_DATE, getDateOrNull(metadata.get(Metadata.CREATION_DATE)), properties);
putRawValue(KEY_CREATOR, metadata.get(Metadata.CREATOR), properties);
putRawValue(KEY_DATE, getDateOrNull(metadata.get(Metadata.DATE)), properties);
putRawValue(KEY_DESCRIPTION, metadata.get(Metadata.DESCRIPTION), properties);
putRawValue(KEY_CREATION_DATE, getDateOrNull(metadata.get(TikaCoreProperties.CREATED)), properties);
final String creator = getCreator(metadata);
putRawValue(KEY_CREATOR, creator, properties);
putRawValue(KEY_AUTHOR, creator, properties);
putRawValue(KEY_DATE, getDateOrNull(metadata.get(TikaCoreProperties.MODIFIED)), properties);
putRawValue(KEY_DESCRIPTION, metadata.get(TikaCoreProperties.DESCRIPTION), properties);
putRawValue(KEY_GENERATOR, metadata.get("generator"), properties);
putRawValue(KEY_INITIAL_CREATOR, metadata.get("initial-creator"), properties);
putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), properties);
putRawValue(KEY_LANGUAGE, metadata.get(Metadata.LANGUAGE), properties);
putRawValue(KEY_KEYWORD, metadata.get(TikaCoreProperties.SUBJECT), properties);
putRawValue(KEY_LANGUAGE, metadata.get(TikaCoreProperties.LANGUAGE), properties);
// Handle user-defined properties dynamically
Map<String, Set<String>> mapping = super.getExtractMapping();
@@ -120,6 +142,18 @@ public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
return properties;
}
private String getCreator(Metadata metadata)
{
final List<String> creators = distinct(metadata.getValues(TikaCoreProperties.CREATOR))
.collect(Collectors.toUnmodifiableList());
if (creators.size() == 1)
{
return creators.get(0);
}
return metadata.get(KEY_ALFRESCO_CREATOR);
}
private Date getDateOrNull(String dateString)
{
if (dateString != null && dateString.length() != 0)

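To make the creator fallback above concrete, a hedged sketch with invented values: when Tika reports more than one distinct dc:creator, getCreator(...) falls back to the value that the extra ElementMetadataHandler stored under the internal _alfresco:creator key.

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;

public class CreatorFallbackExample
{
    public static void main(String[] args)
    {
        Metadata metadata = new Metadata();
        metadata.add(TikaCoreProperties.CREATOR, "Alice");
        metadata.add(TikaCoreProperties.CREATOR, "Build Bot");
        metadata.set("_alfresco:creator", "Alice"); // as captured by the TeeContentHandler above

        // Two distinct creators, so the extractor would use the _alfresco:creator value instead.
        System.out.println(metadata.get("_alfresco:creator")); // Alice
    }
}
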
View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -28,6 +28,7 @@ package org.alfresco.transformer.metadataExtractors;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMPDM;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.Parser;
@@ -148,13 +149,12 @@ public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractor
* @param metadata the metadata extracted from the file
* @return the description
*/
@SuppressWarnings("deprecation")
private String generateDescription(Metadata metadata)
{
StringBuilder result = new StringBuilder();
if (metadata.get(Metadata.TITLE) != null)
if (metadata.get(TikaCoreProperties.TITLE) != null)
{
result.append(metadata.get(Metadata.TITLE));
result.append(metadata.get(TikaCoreProperties.TITLE));
if (metadata.get(XMPDM.ALBUM) != null)
{
result

View File

@@ -44,9 +44,9 @@ import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.NullOutputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
import org.apache.tika.io.NullOutputStream;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -57,7 +57,7 @@ import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.external.ExternalParsersFactory;
import org.apache.tika.parser.image.ImageParser;
import org.apache.tika.parser.image.TiffParser;
import org.apache.tika.parser.jpeg.JpegParser;
import org.apache.tika.parser.image.JpegParser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -263,7 +263,7 @@ public class ExifToolParser extends ExternalParser {
* stream of the given process to the given XHTML content handler.
* The standard output stream is closed once fully processed.
*
* @param process process
* @param stream stream
* @param xhtml XHTML content handler
* @throws SAXException if the XHTML SAX events could not be handled
* @throws IOException if an input error occurred
@@ -315,13 +315,13 @@ public class ExifToolParser extends ExternalParser {
* standard stream of the given process. Potential exceptions
* are ignored, and the stream is closed once fully processed.
*
* @param process process
* @param stream stream
*/
private void ignoreStream(final InputStream stream) {
Thread t = new Thread() {
public void run() {
try {
IOUtils.copy(stream, new NullOutputStream());
IOUtils.copy(stream, NullOutputStream.NULL_OUTPUT_STREAM);
} catch (IOException e) {
} finally {
IOUtils.closeQuietly(stream);

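The stream draining above now relies on Apache Commons IO because Tika 2.x no longer ships its own IOUtils and NullOutputStream; a small standalone sketch of the same calls (the sample input is invented):

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.NullOutputStream;

public class DrainStreamExample
{
    public static void main(String[] args) throws Exception
    {
        InputStream stream = new ByteArrayInputStream(
                "output we do not care about".getBytes(StandardCharsets.UTF_8));
        // Drain and discard everything, as ignoreStream(...) does above.
        IOUtils.copy(stream, NullOutputStream.NULL_OUTPUT_STREAM);
        IOUtils.closeQuietly(stream);
    }
}
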
View File

@@ -27,7 +27,7 @@
<dependency.jackson-databind.version>${dependency.jackson.version}</dependency.jackson-databind.version>
<dependency.junit.version>4.13.2</dependency.junit.version>
<dependency.cxf.version>3.5.0</dependency.cxf.version>
<dependency.tika.version>1.26</dependency.tika.version>
<dependency.tika.version>2.1.0</dependency.tika.version>
<dependency.poi.version>4.1.2</dependency.poi.version>
<dependency.ooxml-schemas.version>1.4</dependency.ooxml-schemas.version>