Mirror of https://github.com/Alfresco/alfresco-transform-core.git
Synced 2025-05-12 17:04:48 +00:00
Restore ATS-969 Tika upgrade 1.x -> 2.x (#493)
This reverts commit 9776577a452444dad634117d349635604fa9a9a8. It was not possible to perform the release of 2.5.5-A1 with this upgrade of Tika, possibly because it forced a change in the following files, which were then deleted in the build:
D alfresco-transform-core-aio/alfresco-transform-core-aio-boot/src/license/THIRD-PARTY.properties
D alfresco-transform-core-aio/alfresco-transform-core-aio/src/license/THIRD-PARTY.properties
D alfresco-transform-tika/alfresco-transform-tika-boot/src/license/THIRD-PARTY.properties
D alfresco-transform-tika/alfresco-transform-tika/src/license/THIRD-PARTY.properties
This commit is contained in:
parent 2fd8361a78
commit 1cd673de63
@@ -65,13 +65,9 @@
         </dependency>
         <dependency>
             <groupId>org.apache.tika</groupId>
-            <artifactId>tika-parsers</artifactId>
+            <artifactId>tika-parsers-standard-package</artifactId>
             <version>${dependency.tika.version}</version>
             <exclusions>
-                <exclusion>
-                    <groupId>com.tdunning</groupId>
-                    <artifactId>json</artifactId>
-                </exclusion>
                 <exclusion>
                     <groupId>org.bouncycastle</groupId>
                     <artifactId>bcprov-jdk15on</artifactId>
@@ -80,10 +76,9 @@
                     <groupId>org.bouncycastle</groupId>
                     <artifactId>bcmail-jdk15on</artifactId>
                 </exclusion>
-                <!-- TODO ATS-534 check transformations not affected by this missing quartz lib -->
                 <exclusion>
-                    <groupId>org.quartz-scheduler</groupId>
-                    <artifactId>quartz</artifactId>
+                    <groupId>xml-apis</groupId>
+                    <artifactId>xml-apis</artifactId>
                 </exclusion>
             </exclusions>
         </dependency>
@@ -2,7 +2,8 @@
   "{http://www.alfresco.org/model/content/1.0}description" : null,
   "{http://www.alfresco.org/model/audio/1.0}releaseDate" : null,
   "{http://www.alfresco.org/model/content/1.0}created" : null,
-  "{http://www.alfresco.org/model/audio/1.0}sampleRate" : "1000",
+  "{http://www.alfresco.org/model/audio/1.0}sampleRate" : "8000",
   "{http://www.alfresco.org/model/content/1.0}author" : null,
-  "{http://www.alfresco.org/model/content/1.0}title" : null
+  "{http://www.alfresco.org/model/content/1.0}title" : null,
+  "{http://www.alfresco.org/model/audio/1.0}channelType" : "Stereo"
 }
@@ -2,7 +2,8 @@
   "{http://www.alfresco.org/model/content/1.0}description" : null,
   "{http://www.alfresco.org/model/audio/1.0}releaseDate" : null,
   "{http://www.alfresco.org/model/content/1.0}created" : null,
-  "{http://www.alfresco.org/model/audio/1.0}sampleRate" : "1000",
+  "{http://www.alfresco.org/model/audio/1.0}sampleRate" : "8000",
   "{http://www.alfresco.org/model/content/1.0}author" : null,
-  "{http://www.alfresco.org/model/content/1.0}title" : null
+  "{http://www.alfresco.org/model/content/1.0}title" : null,
+  "{http://www.alfresco.org/model/audio/1.0}channelType" : "Stereo"
 }
@@ -2,7 +2,8 @@
   "{http://www.alfresco.org/model/content/1.0}description" : null,
   "{http://www.alfresco.org/model/audio/1.0}releaseDate" : null,
   "{http://www.alfresco.org/model/content/1.0}created" : null,
-  "{http://www.alfresco.org/model/audio/1.0}sampleRate" : "1000",
+  "{http://www.alfresco.org/model/audio/1.0}sampleRate" : "22050",
   "{http://www.alfresco.org/model/content/1.0}author" : null,
-  "{http://www.alfresco.org/model/content/1.0}title" : null
+  "{http://www.alfresco.org/model/content/1.0}title" : null,
+  "{http://www.alfresco.org/model/audio/1.0}channelType" : "Stereo"
 }
@@ -2,7 +2,8 @@
   "{http://www.alfresco.org/model/content/1.0}description" : null,
   "{http://www.alfresco.org/model/audio/1.0}releaseDate" : null,
   "{http://www.alfresco.org/model/content/1.0}created" : null,
-  "{http://www.alfresco.org/model/audio/1.0}sampleRate" : "1000",
+  "{http://www.alfresco.org/model/audio/1.0}sampleRate" : "22050",
   "{http://www.alfresco.org/model/content/1.0}author" : null,
-  "{http://www.alfresco.org/model/content/1.0}title" : null
+  "{http://www.alfresco.org/model/content/1.0}title" : null,
+  "{http://www.alfresco.org/model/audio/1.0}channelType" : "Mono"
 }
@@ -2,7 +2,8 @@
   "{http://www.alfresco.org/model/content/1.0}description" : null,
   "{http://www.alfresco.org/model/audio/1.0}releaseDate" : null,
   "{http://www.alfresco.org/model/content/1.0}created" : null,
-  "{http://www.alfresco.org/model/audio/1.0}sampleRate" : "90000",
+  "{http://www.alfresco.org/model/audio/1.0}sampleRate" : "22050",
   "{http://www.alfresco.org/model/content/1.0}author" : null,
-  "{http://www.alfresco.org/model/content/1.0}title" : null
+  "{http://www.alfresco.org/model/content/1.0}title" : null,
+  "{http://www.alfresco.org/model/audio/1.0}channelType" : "Mono"
 }
@@ -27,13 +27,9 @@
         </dependency>
         <dependency>
             <groupId>org.apache.tika</groupId>
-            <artifactId>tika-parsers</artifactId>
+            <artifactId>tika-parsers-standard-package</artifactId>
             <version>${dependency.tika.version}</version>
             <exclusions>
-                <exclusion>
-                    <groupId>com.tdunning</groupId>
-                    <artifactId>json</artifactId>
-                </exclusion>
                 <exclusion>
                     <groupId>org.bouncycastle</groupId>
                     <artifactId>bcprov-jdk15on</artifactId>
@@ -42,11 +38,6 @@
                     <groupId>org.bouncycastle</groupId>
                     <artifactId>bcmail-jdk15on</artifactId>
                 </exclusion>
-                <!-- TODO ATS-534 check transformations not affected by this missing quartz lib -->
-                <exclusion>
-                    <groupId>org.quartz-scheduler</groupId>
-                    <artifactId>quartz</artifactId>
-                </exclusion>
                 <exclusion>
                     <groupId>xml-apis</groupId>
                     <artifactId>xml-apis</artifactId>
@@ -28,7 +28,11 @@ package org.alfresco.transformer.metadataExtractors;
 
 import org.apache.tika.embedder.Embedder;
 import org.apache.tika.extractor.DocumentSelector;
+import org.apache.tika.metadata.DublinCore;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.ContentHandlerDecorator;
@@ -58,7 +62,10 @@ import java.util.HashMap;
 import java.util.LinkedHashSet;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Objects;
 import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 /**
  * The parent of all Metadata Extractors which use Apache Tika under the hood. This handles all the
@@ -83,7 +90,7 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
     protected static final String KEY_CREATED = "created";
     protected static final String KEY_DESCRIPTION = "description";
     protected static final String KEY_COMMENTS = "comments";
-    protected static final String KEY_TAGS = "dc:subject";
+    protected static final String KEY_TAGS = DublinCore.SUBJECT.getName();
 
     private static final String METADATA_SEPARATOR = ",";
 
@@ -208,7 +215,6 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
     }
 
     @Override
-    @SuppressWarnings( "deprecation" )
     public Map<String, Serializable> extractMetadata(String sourceMimetype, Map<String, String> transformOptions,
                                                      File sourceFile) throws Exception
     {
@@ -245,7 +251,7 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
         for (String tikaKey : metadata.names())
         {
             // TODO review this change (part of MNT-15267) - should we really force string concatenation here !?
-            putRawValue(tikaKey, getMetadataValue(metadata, tikaKey), rawProperties);
+            putRawValue(tikaKey, getMetadataValue(metadata, Property.internalText(tikaKey)), rawProperties);
         }
 
         // Now, map the common Tika metadata keys onto
@@ -254,17 +260,17 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
         // to work without needing any changes
 
         // The simple ones
-        putRawValue(KEY_AUTHOR, getMetadataValue(metadata, Metadata.AUTHOR), rawProperties);
-        putRawValue(KEY_TITLE, getMetadataValue(metadata, Metadata.TITLE), rawProperties);
-        putRawValue(KEY_COMMENTS, getMetadataValue(metadata, Metadata.COMMENTS), rawProperties);
+        putRawValue(KEY_AUTHOR, getMetadataValue(metadata, TikaCoreProperties.CREATOR), rawProperties);
+        putRawValue(KEY_TITLE, getMetadataValue(metadata, TikaCoreProperties.TITLE), rawProperties);
+        putRawValue(KEY_COMMENTS, getMetadataValue(metadata, TikaCoreProperties.COMMENTS), rawProperties);
 
         // Tags
         putRawValue(KEY_TAGS, getMetadataValues(metadata, KEY_TAGS), rawProperties);
 
         // Get the subject and description, despite things not
         // being nearly as consistent as one might hope
-        String subject = getMetadataValue(metadata, Metadata.SUBJECT);
-        String description = getMetadataValue(metadata, Metadata.DESCRIPTION);
+        String subject = getMetadataValue(metadata, OfficeOpenXMLCore.SUBJECT);
+        String description = getMetadataValue(metadata, TikaCoreProperties.DESCRIPTION);
         if(subject != null && description != null)
         {
             putRawValue(KEY_DESCRIPTION, description, rawProperties);
@@ -282,13 +288,13 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
         }
 
         // Try for the dates two different ways too
-        if(metadata.get(Metadata.CREATION_DATE) != null)
+        if(metadata.get(TikaCoreProperties.CREATED) != null)
         {
-            putRawValue(KEY_CREATED, metadata.get(Metadata.CREATION_DATE), rawProperties);
+            putRawValue(KEY_CREATED, metadata.get(TikaCoreProperties.CREATED), rawProperties);
         }
-        else if(metadata.get(Metadata.DATE) != null)
+        else if(metadata.get(TikaCoreProperties.MODIFIED) != null)
         {
-            putRawValue(KEY_CREATED, metadata.get(Metadata.DATE), rawProperties);
+            putRawValue(KEY_CREATED, metadata.get(TikaCoreProperties.MODIFIED), rawProperties);
         }
 
         // If people created a specific instance
@@ -388,24 +394,11 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
         return values.length == 0 ? null : (values.length == 1 ? values[0] : values);
     }
 
-    private String getMetadataValue(Metadata metadata, String key)
+    private String getMetadataValue(Metadata metadata, Property key)
     {
         if (metadata.isMultiValued(key))
         {
-            String[] parts = metadata.getValues(key);
-
-            // use Set to prevent duplicates
-            Set<String> value = new LinkedHashSet<>(parts.length);
-
-            for (int i = 0; i < parts.length; i++)
-            {
-                value.add(parts[i]);
-            }
-
-            String valueStr = value.toString();
-
-            // remove leading/trailing braces []
-            return valueStr.substring(1, valueStr.length() - 1);
+            return distinct(metadata.getValues(key)).collect(Collectors.joining(", "));
         }
         else
         {
@@ -413,6 +406,15 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
         }
     }
 
+    protected static Stream<String> distinct(final String[] strings)
+    {
+        return Stream.of(strings)
+                     .filter(Objects::nonNull)
+                     .map(String::strip)
+                     .filter(s -> !s.isEmpty())
+                     .distinct();
+    }
+
     /**
      * This content handler will capture entries from within
      * the header of the Tika content XHTML, but ignore the
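The pattern in this class recurs across every extractor in the commit: the deprecated String constants on Metadata (AUTHOR, TITLE, CREATION_DATE, ...) are gone in Tika 2.x, so lookups go through typed Property keys such as TikaCoreProperties.CREATOR, and multi-valued properties are de-duplicated and joined instead of relying on Set.toString(). Below is a minimal standalone sketch of that access pattern, assuming Tika 2.x on the classpath; the class name and sample values are illustrative, not part of the commit.

// Sketch only: Tika 2.x Property-based metadata access, mirroring the diff above.
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;

public class Tika2MetadataAccessSketch
{
    public static void main(String[] args)
    {
        Metadata metadata = new Metadata();
        metadata.add(TikaCoreProperties.CREATOR, "First Author");
        metadata.add(TikaCoreProperties.CREATOR, "Second Author");
        metadata.set(TikaCoreProperties.TITLE, "Example document");

        // Tika 1.x read metadata.get(Metadata.AUTHOR); in 2.x the lookup goes through a Property key.
        System.out.println(getMetadataValue(metadata, TikaCoreProperties.CREATOR));
        System.out.println(getMetadataValue(metadata, TikaCoreProperties.TITLE));

        // Raw String names from metadata.names() can still be wrapped as Property keys.
        for (String name : metadata.names())
        {
            System.out.println(name + " = " + getMetadataValue(metadata, Property.internalText(name)));
        }
    }

    // Mirrors the joining behaviour introduced in the diff: multi-valued
    // properties are stripped, de-duplicated and joined with ", ".
    private static String getMetadataValue(Metadata metadata, Property key)
    {
        if (metadata.isMultiValued(key))
        {
            return Stream.of(metadata.getValues(key))
                         .filter(Objects::nonNull)
                         .map(String::strip)
                         .filter(s -> !s.isEmpty())
                         .distinct()
                         .collect(Collectors.joining(", "));
        }
        return metadata.get(key);
    }
}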
@@ -2,7 +2,7 @@
  * #%L
  * Alfresco Transform Core
  * %%
- * Copyright (C) 2005 - 2020 Alfresco Software Limited
+ * Copyright (C) 2005 - 2021 Alfresco Software Limited
  * %%
  * This file is part of the Alfresco software.
  * -
@@ -27,6 +27,7 @@
 package org.alfresco.transformer.metadataExtractors;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.dwg.DWGParser;
 import org.slf4j.Logger;
@@ -64,13 +65,12 @@ public class DWGMetadataExtractor extends AbstractTikaMetadataExtractor
         super(logger);
     }
 
-    @SuppressWarnings("deprecation")
     @Override
     protected Map<String, Serializable> extractSpecific(Metadata metadata,
         Map<String, Serializable> properties, Map<String,String> headers)
     {
-        putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), properties);
-        putRawValue(KEY_LAST_AUTHOR, metadata.get(Metadata.LAST_AUTHOR), properties);
+        putRawValue(KEY_KEYWORD, metadata.get(TikaCoreProperties.SUBJECT), properties);
+        putRawValue(KEY_LAST_AUTHOR, metadata.get(TikaCoreProperties.MODIFIED), properties);
         return properties;
     }
 
@@ -2,7 +2,7 @@
  * #%L
  * Alfresco Transform Core
  * %%
- * Copyright (C) 2005 - 2020 Alfresco Software Limited
+ * Copyright (C) 2005 - 2021 Alfresco Software Limited
  * %%
  * This file is part of the Alfresco software.
  * -
@@ -27,6 +27,7 @@
 package org.alfresco.transformer.metadataExtractors;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.XMPDM;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.mp3.Mp3Parser;
@@ -86,7 +87,6 @@ public class MP3MetadataExtractor extends TikaAudioMetadataExtractor
         return new Mp3Parser();
     }
 
-    @SuppressWarnings("deprecation")
     @Override
     protected Map<String, Serializable> extractSpecific(Metadata metadata,
         Map<String, Serializable> properties, Map<String,String> headers)
@@ -98,7 +98,7 @@ public class MP3MetadataExtractor extends TikaAudioMetadataExtractor
         // We only need these for people who had pre-existing mapping
         // properties from before the proper audio model was added
         putRawValue(KEY_ALBUM_TITLE, metadata.get(XMPDM.ALBUM), properties);
-        putRawValue(KEY_SONG_TITLE, metadata.get(Metadata.TITLE), properties);
+        putRawValue(KEY_SONG_TITLE, metadata.get(TikaCoreProperties.TITLE), properties);
         putRawValue(KEY_ARTIST, metadata.get(XMPDM.ARTIST), properties);
         putRawValue(KEY_COMMENT, metadata.get(XMPDM.LOG_COMMENT), properties);
         putRawValue(KEY_TRACK_NUMBER, metadata.get(XMPDM.TRACK_NUMBER), properties);
@@ -2,7 +2,7 @@
  * #%L
  * Alfresco Transform Core
  * %%
- * Copyright (C) 2005 - 2020 Alfresco Software Limited
+ * Copyright (C) 2005 - 2021 Alfresco Software Limited
  * %%
  * This file is part of the Alfresco software.
  * -
@@ -26,7 +26,9 @@
  */
 package org.alfresco.transformer.metadataExtractors;
 
+import org.apache.tika.metadata.Message;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.microsoft.OfficeParser;
 import org.slf4j.Logger;
@@ -82,26 +84,25 @@ public class MailMetadataExtractor extends AbstractTikaMetadataExtractor
         return new OfficeParser();
     }
 
-    @SuppressWarnings("deprecation")
     @Override
     protected Map<String, Serializable> extractSpecific(Metadata metadata,
         Map<String, Serializable> properties, Map<String,String> headers)
     {
-        putRawValue(KEY_ORIGINATOR, metadata.get(Metadata.AUTHOR), properties);
-        putRawValue(KEY_SUBJECT, metadata.get(Metadata.TITLE), properties);
-        putRawValue(KEY_DESCRIPTION, metadata.get(Metadata.SUBJECT), properties);
-        putRawValue(KEY_SENT_DATE, metadata.get(Metadata.LAST_SAVED), properties);
+        putRawValue(KEY_ORIGINATOR, metadata.get(TikaCoreProperties.CREATOR), properties);
+        putRawValue(KEY_SUBJECT, metadata.get(TikaCoreProperties.TITLE), properties);
+        putRawValue(KEY_DESCRIPTION, metadata.get(TikaCoreProperties.SUBJECT), properties);
+        putRawValue(KEY_SENT_DATE, metadata.get(TikaCoreProperties.MODIFIED), properties);
 
         // Store the TO, but not cc/bcc in the addressee field
-        putRawValue(KEY_ADDRESSEE, metadata.get(Metadata.MESSAGE_TO), properties);
+        putRawValue(KEY_ADDRESSEE, metadata.get(Message.MESSAGE_TO), properties);
 
         // Store each of To, CC and BCC in their own fields
-        putRawValue(KEY_TO_NAMES, metadata.getValues(Metadata.MESSAGE_TO), properties);
-        putRawValue(KEY_CC_NAMES, metadata.getValues(Metadata.MESSAGE_CC), properties);
-        putRawValue(KEY_BCC_NAMES, metadata.getValues(Metadata.MESSAGE_BCC), properties);
+        putRawValue(KEY_TO_NAMES, metadata.getValues(Message.MESSAGE_TO), properties);
+        putRawValue(KEY_CC_NAMES, metadata.getValues(Message.MESSAGE_CC), properties);
+        putRawValue(KEY_BCC_NAMES, metadata.getValues(Message.MESSAGE_BCC), properties);
 
         // But store all email addresses (to/cc/bcc) in the addresses field
-        putRawValue(KEY_ADDRESSEES, metadata.getValues(Metadata.MESSAGE_RECIPIENT_ADDRESS), properties);
+        putRawValue(KEY_ADDRESSEES, metadata.getValues(Message.MESSAGE_RECIPIENT_ADDRESS), properties);
 
         return properties;
     }
@@ -2,7 +2,7 @@
  * #%L
  * Alfresco Transform Core
  * %%
- * Copyright (C) 2005 - 2020 Alfresco Software Limited
+ * Copyright (C) 2005 - 2021 Alfresco Software Limited
  * %%
  * This file is part of the Alfresco software.
  * -
@@ -27,6 +27,8 @@
 package org.alfresco.transformer.metadataExtractors;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.microsoft.OfficeParser;
 import org.slf4j.Logger;
@@ -40,7 +42,7 @@ import java.util.Map;
  *
  * Configuration: (see OfficeMetadataExtractor_metadata_extract.properties and tika_engine_config.json)
  *
- * This extracter uses the POI library to extract the following:
+ * This extractor uses the POI library to extract the following:
  * <pre>
  *   <b>author:</b>  --  cm:author
  *   <b>title:</b>   --  cm:title
@@ -91,23 +93,20 @@ public class OfficeMetadataExtractor extends AbstractTikaMetadataExtractor
         return new OfficeParser();
     }
 
-    @SuppressWarnings("deprecation")
     @Override
     protected Map<String, Serializable> extractSpecific(Metadata metadata,
         Map<String, Serializable> properties, Map<String,String> headers)
     {
-        putRawValue(KEY_CREATE_DATETIME, metadata.get(Metadata.CREATION_DATE), properties);
-        putRawValue(KEY_LAST_SAVE_DATETIME, metadata.get(Metadata.LAST_SAVED), properties);
-        putRawValue(KEY_EDIT_TIME, metadata.get(Metadata.EDIT_TIME), properties);
-        putRawValue(KEY_FORMAT, metadata.get(Metadata.FORMAT), properties);
-        putRawValue(KEY_KEYWORDS, metadata.get(Metadata.KEYWORDS), properties);
-        putRawValue(KEY_LAST_AUTHOR, metadata.get(Metadata.LAST_AUTHOR), properties);
-        putRawValue(KEY_LAST_PRINTED, metadata.get(Metadata.LAST_PRINTED), properties);
-        // putRawValue(KEY_OS_VERSION, metadata.get(Metadata.OS_VERSION), properties);
-        // putRawValue(KEY_THUMBNAIL, metadata.get(Metadata.THUMBNAIL), properties);
-        putRawValue(KEY_PAGE_COUNT, metadata.get(Metadata.PAGE_COUNT), properties);
-        putRawValue(KEY_PARAGRAPH_COUNT, metadata.get(Metadata.PARAGRAPH_COUNT), properties);
-        putRawValue(KEY_WORD_COUNT, metadata.get(Metadata.WORD_COUNT), properties);
+        putRawValue(KEY_CREATE_DATETIME, metadata.get(TikaCoreProperties.CREATED), properties);
+        putRawValue(KEY_LAST_SAVE_DATETIME, metadata.get(TikaCoreProperties.MODIFIED), properties);
+        putRawValue(KEY_EDIT_TIME, metadata.get(TikaCoreProperties.MODIFIED), properties);
+        putRawValue(KEY_FORMAT, metadata.get(TikaCoreProperties.FORMAT), properties);
+        putRawValue(KEY_KEYWORDS, metadata.get(TikaCoreProperties.SUBJECT), properties);
+        putRawValue(KEY_LAST_AUTHOR, metadata.get(TikaCoreProperties.MODIFIER), properties);
+        putRawValue(KEY_LAST_PRINTED, metadata.get(TikaCoreProperties.PRINT_DATE), properties);
+        putRawValue(KEY_PAGE_COUNT, metadata.get(Office.PAGE_COUNT), properties);
+        putRawValue(KEY_PARAGRAPH_COUNT, metadata.get(Office.PARAGRAPH_COUNT), properties);
+        putRawValue(KEY_WORD_COUNT, metadata.get(Office.WORD_COUNT), properties);
         return properties;
     }
 }
@@ -2,7 +2,7 @@
  * #%L
  * Alfresco Transform Core
  * %%
- * Copyright (C) 2005 - 2020 Alfresco Software Limited
+ * Copyright (C) 2005 - 2021 Alfresco Software Limited
  * %%
  * This file is part of the Alfresco software.
  * -
@@ -26,18 +26,28 @@
  */
 package org.alfresco.transformer.metadataExtractors;
 
+import static org.apache.tika.metadata.DublinCore.NAMESPACE_URI_DC;
+
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.odf.OpenDocumentMetaParser;
 import org.apache.tika.parser.odf.OpenDocumentParser;
+import org.apache.tika.parser.xml.ElementMetadataHandler;
+import org.apache.tika.sax.TeeContentHandler;
 import org.joda.time.format.DateTimeFormat;
 import org.joda.time.format.DateTimeFormatter;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
 
 import java.io.Serializable;
 import java.util.Date;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.stream.Collectors;
 
 /**
  * {@code "application/vnd.oasis.opendocument..."} and {@code "applicationvnd.oasis.opendocument..."} metadata extractor.
@@ -77,6 +87,7 @@ public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
     private static final String KEY_INITIAL_CREATOR = "initialCreator";
     private static final String KEY_KEYWORD = "keyword";
     private static final String KEY_LANGUAGE = "language";
+    private static final String KEY_ALFRESCO_CREATOR = "_alfresco:creator";
 
     private static final String CUSTOM_PREFIX = "custom:";
 
@@ -90,22 +101,33 @@ public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
     @Override
     protected Parser getParser()
     {
-        return new OpenDocumentParser();
+        OpenDocumentParser parser = new OpenDocumentParser();
+        parser.setMetaParser(new OpenDocumentMetaParser() {
+            @Override
+            protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context)
+            {
+                final ContentHandler superHandler = super.getContentHandler(ch, md, context);
+                final ContentHandler creatorHandler = new ElementMetadataHandler(NAMESPACE_URI_DC, KEY_CREATOR, md, KEY_ALFRESCO_CREATOR);
+                return new TeeContentHandler(superHandler, creatorHandler);
+            }
+        });
+        return parser;
     }
 
-    @SuppressWarnings("deprecation")
     @Override
     protected Map<String, Serializable> extractSpecific(Metadata metadata,
         Map<String, Serializable> properties, Map<String, String> headers)
     {
-        putRawValue(KEY_CREATION_DATE, getDateOrNull(metadata.get(Metadata.CREATION_DATE)), properties);
-        putRawValue(KEY_CREATOR, metadata.get(Metadata.CREATOR), properties);
-        putRawValue(KEY_DATE, getDateOrNull(metadata.get(Metadata.DATE)), properties);
-        putRawValue(KEY_DESCRIPTION, metadata.get(Metadata.DESCRIPTION), properties);
+        putRawValue(KEY_CREATION_DATE, getDateOrNull(metadata.get(TikaCoreProperties.CREATED)), properties);
+        final String creator = getCreator(metadata);
+        putRawValue(KEY_CREATOR, creator, properties);
+        putRawValue(KEY_AUTHOR, creator, properties);
+        putRawValue(KEY_DATE, getDateOrNull(metadata.get(TikaCoreProperties.MODIFIED)), properties);
+        putRawValue(KEY_DESCRIPTION, metadata.get(TikaCoreProperties.DESCRIPTION), properties);
         putRawValue(KEY_GENERATOR, metadata.get("generator"), properties);
         putRawValue(KEY_INITIAL_CREATOR, metadata.get("initial-creator"), properties);
-        putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), properties);
-        putRawValue(KEY_LANGUAGE, metadata.get(Metadata.LANGUAGE), properties);
+        putRawValue(KEY_KEYWORD, metadata.get(TikaCoreProperties.SUBJECT), properties);
+        putRawValue(KEY_LANGUAGE, metadata.get(TikaCoreProperties.LANGUAGE), properties);
 
         // Handle user-defined properties dynamically
         Map<String, Set<String>> mapping = super.getExtractMapping();
@@ -120,6 +142,18 @@ public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
         return properties;
     }
 
+    private String getCreator(Metadata metadata)
+    {
+        final List<String> creators = distinct(metadata.getValues(TikaCoreProperties.CREATOR))
+            .collect(Collectors.toUnmodifiableList());
+        if (creators.size() == 1)
+        {
+            return creators.get(0);
+        }
+
+        return metadata.get(KEY_ALFRESCO_CREATOR);
+    }
+
     private Date getDateOrNull(String dateString)
     {
         if (dateString != null && dateString.length() != 0)
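The new getParser() override above composes two SAX handlers so that dc:creator elements are also captured under the extra _alfresco:creator key while the normal metadata parsing still runs. Below is a minimal standalone sketch of that composition using Tika's ElementMetadataHandler and TeeContentHandler, assuming Tika 2.x; the XML snippet, class name and the use of ToTextContentHandler as the "other" handler are illustrative assumptions, not taken from the commit.

// Sketch only: capture dc:creator into an extra Metadata key while another handler runs.
import java.io.StringReader;

import javax.xml.parsers.SAXParserFactory;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.xml.ElementMetadataHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.ToTextContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;

public class CreatorCaptureSketch
{
    private static final String DC_NS = "http://purl.org/dc/elements/1.1/";

    public static void main(String[] args) throws Exception
    {
        String xml = "<meta xmlns:dc=\"" + DC_NS + "\">"
                   + "<dc:creator>Jane Doe</dc:creator>"
                   + "<dc:creator>John Smith</dc:creator>"
                   + "</meta>";

        Metadata metadata = new Metadata();
        // Writes every dc:creator element it sees to the hypothetical "_alfresco:creator" key.
        ElementMetadataHandler creatorHandler =
                new ElementMetadataHandler(DC_NS, "creator", metadata, "_alfresco:creator");
        // Stand-in for whatever handler the parse would normally use; plain text extraction here.
        ToTextContentHandler textHandler = new ToTextContentHandler();

        SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setNamespaceAware(true);
        XMLReader reader = factory.newSAXParser().getXMLReader();
        // TeeContentHandler forwards each SAX event to both handlers.
        reader.setContentHandler(new TeeContentHandler(textHandler, creatorHandler));
        reader.parse(new InputSource(new StringReader(xml)));

        // Both creators land on the extra key; multi-value handling is then up to the extractor.
        System.out.println(String.join(" | ", metadata.getValues("_alfresco:creator")));
    }
}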
@@ -2,7 +2,7 @@
  * #%L
  * Alfresco Transform Core
  * %%
- * Copyright (C) 2005 - 2020 Alfresco Software Limited
+ * Copyright (C) 2005 - 2021 Alfresco Software Limited
  * %%
  * This file is part of the Alfresco software.
  * -
@@ -28,6 +28,7 @@ package org.alfresco.transformer.metadataExtractors;
 
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.XMPDM;
 import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.Parser;
@@ -148,13 +149,12 @@ public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractor
      * @param metadata the metadata extracted from the file
      * @return the description
      */
-    @SuppressWarnings("deprecation")
    private String generateDescription(Metadata metadata)
    {
        StringBuilder result = new StringBuilder();
-        if (metadata.get(Metadata.TITLE) != null)
+        if (metadata.get(TikaCoreProperties.TITLE) != null)
        {
-            result.append(metadata.get(Metadata.TITLE));
+            result.append(metadata.get(TikaCoreProperties.TITLE));
            if (metadata.get(XMPDM.ALBUM) != null)
            {
                result
@@ -44,9 +44,9 @@ import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.output.NullOutputStream;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.IOUtils;
-import org.apache.tika.io.NullOutputStream;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -57,7 +57,7 @@ import org.apache.tika.parser.external.ExternalParser;
 import org.apache.tika.parser.external.ExternalParsersFactory;
 import org.apache.tika.parser.image.ImageParser;
 import org.apache.tika.parser.image.TiffParser;
-import org.apache.tika.parser.jpeg.JpegParser;
+import org.apache.tika.parser.image.JpegParser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -263,7 +263,7 @@ public class ExifToolParser extends ExternalParser {
      * stream of the given process to the given XHTML content handler.
      * The standard output stream is closed once fully processed.
      *
-     * @param process process
      * @param stream stream
      * @param xhtml XHTML content handler
      * @throws SAXException if the XHTML SAX events could not be handled
      * @throws IOException if an input error occurred
@@ -315,13 +315,13 @@ public class ExifToolParser extends ExternalParser {
      * standard stream of the given process. Potential exceptions
      * are ignored, and the stream is closed once fully processed.
      *
-     * @param process process
      * @param stream stream
      */
     private void ignoreStream(final InputStream stream) {
         Thread t = new Thread() {
             public void run() {
                 try {
-                    IOUtils.copy(stream, new NullOutputStream());
+                    IOUtils.copy(stream, NullOutputStream.NULL_OUTPUT_STREAM);
                 } catch (IOException e) {
                 } finally {
                     IOUtils.closeQuietly(stream);
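Tika 2.x drops org.apache.tika.io.IOUtils and org.apache.tika.io.NullOutputStream, which is why the hunk above moves ignoreStream() to the commons-io equivalents. A minimal standalone sketch of the same drain-and-discard idiom follows, assuming commons-io on the classpath; the class name and sample input are illustrative only.

// Sketch only: discard a stream's contents with commons-io, as the updated ignoreStream() does.
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.NullOutputStream;

public class IgnoreStreamSketch
{
    public static void main(String[] args)
    {
        InputStream stream = new ByteArrayInputStream("stderr noise".getBytes(StandardCharsets.UTF_8));
        try
        {
            // Drain everything into the shared null sink; replaces copy into new NullOutputStream().
            IOUtils.copy(stream, NullOutputStream.NULL_OUTPUT_STREAM);
        }
        catch (IOException e)
        {
            // Ignored, as in the parser's ignoreStream() helper.
        }
        finally
        {
            IOUtils.closeQuietly(stream);
        }
    }
}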
pom.xml
@@ -27,7 +27,7 @@
         <dependency.jackson-databind.version>${dependency.jackson.version}</dependency.jackson-databind.version>
         <dependency.junit.version>4.13.2</dependency.junit.version>
         <dependency.cxf.version>3.5.0</dependency.cxf.version>
-        <dependency.tika.version>1.26</dependency.tika.version>
+        <dependency.tika.version>2.1.0</dependency.tika.version>
         <dependency.poi.version>4.1.2</dependency.poi.version>
         <dependency.ooxml-schemas.version>1.4</dependency.ooxml-schemas.version>
 