Save point: [skip ci]

* Tika test
This commit is contained in:
alandavis
2022-07-26 15:00:20 +01:00
parent a04a26d6f6
commit c5a8958c26
13 changed files with 41 additions and 81 deletions

View File

@@ -24,7 +24,7 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.metadataExtractors;
package org.alfresco.transform.tika.metadata;
import org.alfresco.transform.base.TransformManager;
import org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor;
@@ -51,7 +51,6 @@ import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
@@ -81,7 +80,7 @@ import java.util.stream.Stream;
* @author Nick Burch
* @author adavis
*/
public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtractor
public abstract class AbstractTikaMetadataExtractorEmbeddor extends AbstractMetadataExtractor
{
protected static final String KEY_AUTHOR = "author";
protected static final String KEY_TITLE = "title";
@@ -96,7 +95,7 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
private final DateTimeFormatter tikaUTCDateFormater;
private final DateTimeFormatter tikaDateFormater;
public AbstractTikaMetadataExtractor(Type type, Logger logger)
public AbstractTikaMetadataExtractorEmbeddor(Type type, Logger logger)
{
super(type, logger);
@@ -153,11 +152,6 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
return dateStr;
}
/**
* Returns the correct Tika Parser to process the document.
* If you don't know which you want, use {@link TikaAutoMetadataExtractor}
* which makes use of the Tika auto-detection.
*/
protected abstract Parser getParser();
/**
@@ -168,7 +162,6 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
*/
protected Embedder getEmbedder()
{
// TODO make this an abstract method once more extractors support embedding
return null;
}

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.metadataExtractors;
package org.alfresco.transform.tika.metadata.extractors;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.Parser;
@@ -57,7 +58,7 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
* @author adavis
*/
@Component
public class DWGMetadataExtractor extends AbstractTikaMetadataExtractor
public class DWGMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{
private static final Logger logger = LoggerFactory.getLogger(DWGMetadataExtractor.class);

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.metadataExtractors;
package org.alfresco.transform.tika.metadata.extractors;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.alfresco.transform.tika.parsers.ExifToolParser;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.metadata.Metadata;
@@ -44,7 +45,7 @@ import java.util.regex.Pattern;
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
@Component
public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractor
public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{
private static final Logger logger = LoggerFactory.getLogger(IPTCMetadataExtractor.class);
@@ -118,7 +119,7 @@ public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractor
* @return dateStrings in Iso8601 format
* @see #iptcToIso8601DateString
*/
protected String[] iptcToIso8601DateStrings(String[] dateStrings)
public String[] iptcToIso8601DateStrings(String[] dateStrings)
{
for (int i = 0; i < dateStrings.length; i++)
{

View File

@@ -24,7 +24,7 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.metadataExtractors;
package org.alfresco.transform.tika.metadata.extractors;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.metadataExtractors;
package org.alfresco.transform.tika.metadata.extractors;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -63,7 +64,7 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
* @author adavis
*/
@Component
public class MailMetadataExtractor extends AbstractTikaMetadataExtractor
public class MailMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{
private static final Logger logger = LoggerFactory.getLogger(MailMetadataExtractor.class);

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.metadataExtractors;
package org.alfresco.transform.tika.metadata.extractors;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -71,7 +72,7 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
* @author adavis
*/
@Component
public class OfficeMetadataExtractor extends AbstractTikaMetadataExtractor
public class OfficeMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{
private static final Logger logger = LoggerFactory.getLogger(OfficeMetadataExtractor.class);

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.metadataExtractors;
package org.alfresco.transform.tika.metadata.extractors;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
@@ -79,7 +80,7 @@ import static org.apache.tika.metadata.DublinCore.NAMESPACE_URI_DC;
* @author adavis
*/
@Component
public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{
private static final Logger logger = LoggerFactory.getLogger(OpenDocumentMetadataExtractor.class);

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.metadataExtractors;
package org.alfresco.transform.tika.metadata.extractors;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.alfresco.transform.tika.transformers.Tika;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.metadata.Metadata;
@@ -56,7 +57,7 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
* @author adavis
*/
@Component
public class PdfBoxMetadataExtractor extends AbstractTikaMetadataExtractor
public class PdfBoxMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{
private static final Logger logger = LoggerFactory.getLogger(PdfBoxMetadataExtractor.class);

View File

@@ -24,27 +24,15 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.metadataExtractors;
package org.alfresco.transform.tika.metadata.extractors;
import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.tika.embedder.Embedder;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Collections;
import java.util.Set;
import java.util.StringJoiner;
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
/**
@@ -66,7 +54,7 @@ import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExt
* @author adavis
*/
@Component
public class PoiMetadataExtractor extends AbstractTikaMetadataExtractor
public class PoiMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{
private static final Logger logger = LoggerFactory.getLogger(PoiMetadataExtractor.class);

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.metadataExtractors;
package org.alfresco.transform.tika.metadata.extractors;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -69,7 +70,7 @@ import static org.alfresco.transform.tika.transformers.Tika.readTikaConfig;
* @author adavis
*/
@Component
public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractor
public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{
private static final Logger logger = LoggerFactory.getLogger(TikaAudioMetadataExtractor.class);

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.metadataExtractors;
package org.alfresco.transform.tika.metadata.extractors;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TIFF;
@@ -63,7 +64,7 @@ import static org.alfresco.transform.tika.transformers.Tika.readTikaConfig;
* @author adavis
*/
@Component
public class TikaAutoMetadataExtractor extends AbstractTikaMetadataExtractor
public class TikaAutoMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
{
private static final Logger logger = LoggerFactory.getLogger(TikaAutoMetadataExtractor.class);

View File

@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.metadataExtractors;
package org.alfresco.transform.tika.metadata.embedders;
import org.alfresco.transform.tika.metadata.AbstractTikaMetadataExtractorEmbeddor;
import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.tika.embedder.Embedder;
@@ -45,50 +46,20 @@ import java.util.Collections;
import java.util.Set;
import java.util.StringJoiner;
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EXTRACTOR;
import static org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor.Type.EMBEDDER;
/**
* Sample POI metadata embedder to demonstrate it is possible to add custom T-Engines that will add
* metadata. This is not production code, so no supported mimetypes exist in the {@code tika_engine_config.json}.
* Adding the following would make it available:
*
* <pre>
* {
* "transformOptions": {
* ...
* "metadataEmbedOptions": [
* {"value": {"name": "metadata", "required": true}}
* ]
* },
* "transformers": [
* ...
* {
* "transformerName": "PoiMetadataEmbedder",
* "supportedSourceAndTargetList": [
* ...
* {"sourceMediaType": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "targetMediaType": "alfresco-metadata-embed"}
* ],
* "transformOptions": [
* "metadataEmbedOptions"
* ]
* }
* ]
* }
* </pre>
* @author Nick Burch
* @author Neil McErlean
* @author Dmitry Velichkevich
* @author adavis
*/
@Component
public class PoiMetadataEmbedder extends AbstractTikaMetadataExtractor
public class PoiMetadataEmbedder extends AbstractTikaMetadataExtractorEmbeddor
{
private static final Logger logger = LoggerFactory.getLogger(PoiMetadataEmbedder.class);
public PoiMetadataEmbedder()
{
super(EXTRACTOR, logger);
super(EMBEDDER, logger);
}
@Override
@@ -127,7 +98,7 @@ public class PoiMetadataEmbedder extends AbstractTikaMetadataExtractor
for (String name : metadata.names())
{
metadata.isMultiValued("description");
String value = null;
String value;
if (metadata.isMultiValued(name))
{
String[] values = metadata.getValues(name);

View File

@@ -24,12 +24,12 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.metadataExtractors;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
package org.alfresco.transform.tika.metadata.extractors;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
public class IPTCMetadataExtractorTest
{
IPTCMetadataExtractor extractor = new IPTCMetadataExtractor();