mirror of
https://github.com/Alfresco/alfresco-transform-core.git
synced 2025-08-14 17:58:27 +00:00
Save point: [skip ci]
* Tika test
This commit is contained in:
@@ -24,7 +24,7 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadata;
|
||||
|
||||
import org.alfresco.transform.base.TransformManager;
|
||||
import org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor;
|
||||
@@ -51,7 +51,6 @@ import org.xml.sax.Attributes;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.Locator;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.Serializable;
|
||||
@@ -81,7 +80,7 @@ import java.util.stream.Stream;
|
||||
* @author Nick Burch
|
||||
* @author adavis
|
||||
*/
|
||||
public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtractor
|
||||
public abstract class AbstractTikaMetadataExtractorEmbeddor extends AbstractMetadataExtractor
|
||||
{
|
||||
protected static final String KEY_AUTHOR = "author";
|
||||
protected static final String KEY_TITLE = "title";
|
||||
@@ -96,7 +95,7 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
|
||||
private final DateTimeFormatter tikaUTCDateFormater;
|
||||
private final DateTimeFormatter tikaDateFormater;
|
||||
|
||||
public AbstractTikaMetadataExtractor(Type type, Logger logger)
|
||||
public AbstractTikaMetadataExtractorEmbeddor(Type type, Logger logger)
|
||||
{
|
||||
super(type, logger);
|
||||
|
||||
@@ -153,11 +152,6 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
|
||||
return dateStr;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the correct Tika Parser to process the document.
|
||||
* If you don't know which you want, use {@link TikaAutoMetadataExtractor}
|
||||
* which makes use of the Tika auto-detection.
|
||||
*/
|
||||
protected abstract Parser getParser();
|
||||
|
||||
/**
|
||||
@@ -168,7 +162,6 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
|
||||
*/
|
||||
protected Embedder getEmbedder()
|
||||
{
|
||||
// TODO make this an abstract method once more extracters support embedding
|
||||
return null;
|
||||
}
|
||||
|
@@ -24,8 +24,9 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
import org.apache.tika.parser.Parser;
|
||||
@@ -57,7 +58,7 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class DWGMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
public class DWGMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(DWGMetadataExtractor.class);
|
||||
|
@@ -24,8 +24,9 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import org.alfresco.transform.tika.parsers.ExifToolParser;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
@@ -44,7 +45,7 @@ import java.util.regex.Pattern;
|
||||
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
@Component
|
||||
public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
|
||||
{
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(IPTCMetadataExtractor.class);
|
||||
@@ -118,7 +119,7 @@ public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
* @return dateStrings in Iso8601 format
|
||||
* @see #iptcToIso8601DateString
|
||||
*/
|
||||
protected String[] iptcToIso8601DateStrings(String[] dateStrings)
|
||||
public String[] iptcToIso8601DateStrings(String[] dateStrings)
|
||||
{
|
||||
for (int i = 0; i < dateStrings.length; i++)
|
||||
{
|
@@ -24,7 +24,7 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
@@ -24,8 +24,9 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import org.apache.tika.metadata.Message;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
@@ -63,7 +64,7 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class MailMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
public class MailMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(MailMetadataExtractor.class);
|
||||
|
@@ -24,8 +24,9 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.Office;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
@@ -71,7 +72,7 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class OfficeMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
public class OfficeMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(OfficeMetadataExtractor.class);
|
||||
|
@@ -24,8 +24,9 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
@@ -79,7 +80,7 @@ import static org.apache.tika.metadata.DublinCore.NAMESPACE_URI_DC;
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(OpenDocumentMetadataExtractor.class);
|
||||
|
@@ -24,8 +24,9 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import org.alfresco.transform.tika.transformers.Tika;
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
@@ -56,7 +57,7 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class PdfBoxMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
public class PdfBoxMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(PdfBoxMetadataExtractor.class);
|
||||
|
@@ -24,27 +24,15 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.apache.poi.ooxml.POIXMLProperties;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.apache.tika.embedder.Embedder;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
/**
|
||||
@@ -66,7 +54,7 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class PoiMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
public class PoiMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(PoiMetadataExtractor.class);
|
||||
|
@@ -24,8 +24,9 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
@@ -69,7 +70,7 @@ import static org.alfresco.transform.tika.transformers.Tika.readTikaConfig;
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(TikaAudioMetadataExtractor.class);
|
||||
|
@@ -24,8 +24,9 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TIFF;
|
||||
@@ -63,7 +64,7 @@ import static org.alfresco.transform.tika.transformers.Tika.readTikaConfig;
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class TikaAutoMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
public class TikaAutoMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(TikaAutoMetadataExtractor.class);
|
||||
|
@@ -24,8 +24,9 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadata.embedders;
|
||||
|
||||
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
|
||||
import org.apache.poi.ooxml.POIXMLProperties;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.apache.tika.embedder.Embedder;
|
||||
@@ -45,50 +46,20 @@ import java.util.Collections;
|
||||
import java.util.Set;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
|
||||
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EMBEDDER;
|
||||
|
||||
/**
|
||||
* Sample POI metadata embedder to demonstrate it is possible to add custom T-Engines that will add
|
||||
* metadata. This is not production code, so no supported mimetypes exist in the {@code tika_engine_config.json}.
|
||||
* Adding the following would make it available:
|
||||
*
|
||||
* <pre>
|
||||
* {
|
||||
* "transformOptions": {
|
||||
* ...
|
||||
* "metadataEmbedOptions": [
|
||||
* {"value": {"name": "metadata", "required": true}}
|
||||
* ]
|
||||
* },
|
||||
* "transformers": [
|
||||
* ...
|
||||
* {
|
||||
* "transformerName": "PoiMetadataEmbedder",
|
||||
* "supportedSourceAndTargetList": [
|
||||
* ...
|
||||
* {"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"}
|
||||
* ],
|
||||
* "transformOptions": [
|
||||
* "metadataEmbedOptions"
|
||||
* ]
|
||||
* }
|
||||
* ]
|
||||
* }
|
||||
* </pre>
|
||||
|
||||
* @author Nick Burch
|
||||
* @author Neil McErlean
|
||||
* @author Dmitry Velichkevich
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class PoiMetadataEmbedder extends AbstractTikaMetadataExtractor
|
||||
public class PoiMetadataEmbedder extends AbstractTikaMetadataExtractorEmbeddor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(PoiMetadataEmbedder.class);
|
||||
|
||||
public PoiMetadataEmbedder()
|
||||
{
|
||||
super(EXTRACTOR, logger);
|
||||
super(EMBEDDER, logger);
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -127,7 +98,7 @@ public class PoiMetadataEmbedder extends AbstractTikaMetadataExtractor
|
||||
for (String name : metadata.names())
|
||||
{
|
||||
metadata.isMultiValued("description");
|
||||
String value = null;
|
||||
String value;
|
||||
if (metadata.isMultiValued(name))
|
||||
{
|
||||
String[] values = metadata.getValues(name);
|
@@ -24,12 +24,12 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
package org.alfresco.transform.tika.metadata.extractors;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
|
||||
public class IPTCMetadataExtractorTest
|
||||
{
|
||||
IPTCMetadataExtractor extractor = new IPTCMetadataExtractor();
|
Reference in New Issue
Block a user