Save point: [skip ci]

* Tika test
This commit is contained in:
alandavis
2022-07-26 15:00:20 +01:00
parent a04a26d6f6
commit c5a8958c26
13 changed files with 41 additions and 81 deletions

View File

@@ -24,7 +24,7 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>. * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L% * #L%
*/ */
package org.alfresco.transform.tika.metadataExtractors; package org.alfresco.transform.tika.metadata;
import org.alfresco.transform.base.TransformManager; import org.alfresco.transform.base.TransformManager;
import org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor; import org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor;
@@ -51,7 +51,6 @@ import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler; import org.xml.sax.ContentHandler;
import org.xml.sax.Locator; import org.xml.sax.Locator;
import java.io.FileInputStream;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStream; import java.io.OutputStream;
import java.io.Serializable; import java.io.Serializable;
@@ -81,7 +80,7 @@ import java.util.stream.Stream;
* @author Nick Burch * @author Nick Burch
* @author adavis * @author adavis
*/ */
public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtractor public abstract class AbstractTikaMetadataExtractorEmbeddor extends AbstractMetadataExtractor
{ {
protected static final String KEY_AUTHOR = "author"; protected static final String KEY_AUTHOR = "author";
protected static final String KEY_TITLE = "title"; protected static final String KEY_TITLE = "title";
@@ -96,7 +95,7 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
private final DateTimeFormatter tikaUTCDateFormater; private final DateTimeFormatter tikaUTCDateFormater;
private final DateTimeFormatter tikaDateFormater; private final DateTimeFormatter tikaDateFormater;
public AbstractTikaMetadataExtractor(Type type, Logger logger) public AbstractTikaMetadataExtractorEmbeddor(Type type, Logger logger)
{ {
super(type, logger); super(type, logger);
@@ -153,11 +152,6 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
return dateStr; return dateStr;
} }
/**
* Returns the correct Tika Parser to process the document.
* If you don't know which you want, use {@link TikaAutoMetadataExtractor}
* which makes use of the Tika auto-detection.
*/
protected abstract Parser getParser(); protected abstract Parser getParser();
/** /**
@@ -168,7 +162,6 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
*/ */
protected Embedder getEmbedder() protected Embedder getEmbedder()
{ {
// TODO make this an abstract method once more extractors support embedding
return null; return null;
} }

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>. * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L% * #L%
*/ */
package org.alfresco.transform.tika.metadataExtractors; package org.alfresco.transform.tika.metadata.extractors;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.Parser; import org.apache.tika.parser.Parser;
@@ -57,7 +58,7 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
* @author adavis * @author adavis
*/ */
@Component @Component
public class DWGMetadataExtractor extends AbstractTikaMetadataExtractor public class DWGMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{ {
private static final Logger logger = LoggerFactory.getLogger(DWGMetadataExtractor.class); private static final Logger logger = LoggerFactory.getLogger(DWGMetadataExtractor.class);

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>. * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L% * #L%
*/ */
package org.alfresco.transform.tika.metadataExtractors; package org.alfresco.transform.tika.metadata.extractors;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.alfresco.transform.tika.parsers.ExifToolParser; import org.alfresco.transform.tika.parsers.ExifToolParser;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
@@ -44,7 +45,7 @@ import java.util.regex.Pattern;
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR; import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
@Component @Component
public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractor public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{ {
private static final Logger logger = LoggerFactory.getLogger(IPTCMetadataExtractor.class); private static final Logger logger = LoggerFactory.getLogger(IPTCMetadataExtractor.class);
@@ -118,7 +119,7 @@ public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractor
* @return dateStrings in Iso8601 format * @return dateStrings in Iso8601 format
* @see #iptcToIso8601DateString * @see #iptcToIso8601DateString
*/ */
protected String[] iptcToIso8601DateStrings(String[] dateStrings) public String[] iptcToIso8601DateStrings(String[] dateStrings)
{ {
for (int i = 0; i < dateStrings.length; i++) for (int i = 0; i < dateStrings.length; i++)
{ {

View File

@@ -24,7 +24,7 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>. * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L% * #L%
*/ */
package org.alfresco.transform.tika.metadataExtractors; package org.alfresco.transform.tika.metadata.extractors;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.TikaCoreProperties;

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>. * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L% * #L%
*/ */
package org.alfresco.transform.tika.metadataExtractors; package org.alfresco.transform.tika.metadata.extractors;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.apache.tika.metadata.Message; import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.TikaCoreProperties;
@@ -63,7 +64,7 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
* @author adavis * @author adavis
*/ */
@Component @Component
public class MailMetadataExtractor extends AbstractTikaMetadataExtractor public class MailMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{ {
private static final Logger logger = LoggerFactory.getLogger(MailMetadataExtractor.class); private static final Logger logger = LoggerFactory.getLogger(MailMetadataExtractor.class);

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>. * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L% * #L%
*/ */
package org.alfresco.transform.tika.metadataExtractors; package org.alfresco.transform.tika.metadata.extractors;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office; import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.TikaCoreProperties;
@@ -71,7 +72,7 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
* @author adavis * @author adavis
*/ */
@Component @Component
public class OfficeMetadataExtractor extends AbstractTikaMetadataExtractor public class OfficeMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{ {
private static final Logger logger = LoggerFactory.getLogger(OfficeMetadataExtractor.class); private static final Logger logger = LoggerFactory.getLogger(OfficeMetadataExtractor.class);

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>. * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L% * #L%
*/ */
package org.alfresco.transform.tika.metadataExtractors; package org.alfresco.transform.tika.metadata.extractors;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.ParseContext;
@@ -79,7 +80,7 @@ import static org.apache.tika.metadata.DublinCore.NAMESPACE_URI_DC;
* @author adavis * @author adavis
*/ */
@Component @Component
public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{ {
private static final Logger logger = LoggerFactory.getLogger(OpenDocumentMetadataExtractor.class); private static final Logger logger = LoggerFactory.getLogger(OpenDocumentMetadataExtractor.class);

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>. * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L% * #L%
*/ */
package org.alfresco.transform.tika.metadataExtractors; package org.alfresco.transform.tika.metadata.extractors;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.alfresco.transform.tika.transformers.Tika; import org.alfresco.transform.tika.transformers.Tika;
import org.apache.tika.extractor.DocumentSelector; import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
@@ -56,7 +57,7 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
* @author adavis * @author adavis
*/ */
@Component @Component
public class PdfBoxMetadataExtractor extends AbstractTikaMetadataExtractor public class PdfBoxMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{ {
private static final Logger logger = LoggerFactory.getLogger(PdfBoxMetadataExtractor.class); private static final Logger logger = LoggerFactory.getLogger(PdfBoxMetadataExtractor.class);

View File

@@ -24,27 +24,15 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>. * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L% * #L%
*/ */
package org.alfresco.transform.tika.metadataExtractors; package org.alfresco.transform.tika.metadata.extractors;
import org.apache.poi.ooxml.POIXMLProperties; import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.tika.embedder.Embedder;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser; import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Collections;
import java.util.Set;
import java.util.StringJoiner;
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR; import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
/** /**
@@ -66,7 +54,7 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
* @author adavis * @author adavis
*/ */
@Component @Component
public class PoiMetadataExtractor extends AbstractTikaMetadataExtractor public class PoiMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{ {
private static final Logger logger = LoggerFactory.getLogger(PoiMetadataExtractor.class); private static final Logger logger = LoggerFactory.getLogger(PoiMetadataExtractor.class);

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>. * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L% * #L%
*/ */
package org.alfresco.transform.tika.metadataExtractors; package org.alfresco.transform.tika.metadata.extractors;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.apache.tika.config.TikaConfig; import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.TikaCoreProperties;
@@ -69,7 +70,7 @@ import static org.alfresco.transform.tika.transformers.Tika.readTikaConfig;
* @author adavis * @author adavis
*/ */
@Component @Component
public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractor public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{ {
private static final Logger logger = LoggerFactory.getLogger(TikaAudioMetadataExtractor.class); private static final Logger logger = LoggerFactory.getLogger(TikaAudioMetadataExtractor.class);

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>. * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L% * #L%
*/ */
package org.alfresco.transform.tika.metadataExtractors; package org.alfresco.transform.tika.metadata.extractors;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.apache.tika.config.TikaConfig; import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TIFF; import org.apache.tika.metadata.TIFF;
@@ -63,7 +64,7 @@ import static org.alfresco.transform.tika.transformers.Tika.readTikaConfig;
* @author adavis * @author adavis
*/ */
@Component @Component
public class TikaAutoMetadataExtractor extends AbstractTikaMetadataExtractor public class TikaAutoMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{ {
private static final Logger logger = LoggerFactory.getLogger(TikaAutoMetadataExtractor.class); private static final Logger logger = LoggerFactory.getLogger(TikaAutoMetadataExtractor.class);

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>. * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L% * #L%
*/ */
package org.alfresco.transform.tika.metadataExtractors; package org.alfresco.transform.tika.metadata.embedders;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.apache.poi.ooxml.POIXMLProperties; import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.tika.embedder.Embedder; import org.apache.tika.embedder.Embedder;
@@ -45,50 +46,20 @@ import java.util.Collections;
import java.util.Set; import java.util.Set;
import java.util.StringJoiner; import java.util.StringJoiner;
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR; import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EMBEDDER;
/** /**
* Sample POI metadata embedder to demonstrate it is possible to add custom T-Engines that will add * Sample POI metadata embedder to demonstrate it is possible to add custom T-Engines that will add
* metadata. This is not production code, so no supported mimetypes exist in the {@code tika_engine_config.json}. * metadata. This is not production code, so no supported mimetypes exist in the {@code tika_engine_config.json}.
* Adding the following would make it available:
*
* <pre>
* {
* "transformOptions": {
* ...
* "metadataEmbedOptions": [
* {"value": {"name": "metadata", "required": true}}
* ]
* },
* "transformers": [
* ...
* {
* "transformerName": "PoiMetadataEmbedder",
* "supportedSourceAndTargetList": [
* ...
* {"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"}
* ],
* "transformOptions": [
* "metadataEmbedOptions"
* ]
* }
* ]
* }
* </pre>
* @author Nick Burch
* @author Neil McErlean
* @author Dmitry Velichkevich
* @author adavis
*/ */
@Component @Component
public class PoiMetadataEmbedder extends AbstractTikaMetadataExtractor public class PoiMetadataEmbedder extends AbstractTikaMetadataExtractorEmbeddor
{ {
private static final Logger logger = LoggerFactory.getLogger(PoiMetadataEmbedder.class); private static final Logger logger = LoggerFactory.getLogger(PoiMetadataEmbedder.class);
public PoiMetadataEmbedder() public PoiMetadataEmbedder()
{ {
super(EXTRACTOR, logger); super(EMBEDDER, logger);
} }
@Override @Override
@@ -127,7 +98,7 @@ public class PoiMetadataEmbedder extends AbstractTikaMetadataExtractor
for (String name : metadata.names()) for (String name : metadata.names())
{ {
metadata.isMultiValued("description"); metadata.isMultiValued("description");
String value = null; String value;
if (metadata.isMultiValued(name)) if (metadata.isMultiValued(name))
{ {
String[] values = metadata.getValues(name); String[] values = metadata.getValues(name);

View File

@@ -24,12 +24,12 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>. * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L% * #L%
*/ */
package org.alfresco.transform.tika.metadataExtractors; package org.alfresco.transform.tika.metadata.extractors;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
public class IPTCMetadataExtractorTest public class IPTCMetadataExtractorTest
{ {
IPTCMetadataExtractor extractor = new IPTCMetadataExtractor(); IPTCMetadataExtractor extractor = new IPTCMetadataExtractor();