Save point: [skip ci]

* Beginnings of new t-base (using TransformEngine and CustomTransformer, no need for a controller or Application in t-engine modules)
* Using org.alfresco.transform.<module> package
* Beginnings of new Tika t-engine
This commit is contained in:
alandavis
2022-06-30 13:39:24 +01:00
parent 78c82c9a01
commit b619f27207
157 changed files with 8740 additions and 1543 deletions

View File

@@ -20,12 +20,12 @@
<dependencies>
<dependency>
<groupId>org.alfresco</groupId>
<artifactId>alfresco-transformer-base</artifactId>
<artifactId>alfresco-t-engine-base</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.alfresco</groupId>
<artifactId>alfresco-transformer-base</artifactId>
<artifactId>alfresco-t-engine-base</artifactId>
<version>${project.version}</version>
<classifier>tests</classifier>
<type>test-jar</type>
@@ -146,6 +146,9 @@
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<mainClass>org.alfresco.transform.base.Application</mainClass>
</configuration>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>

View File

@@ -1,77 +0,0 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer;
import io.micrometer.core.instrument.MeterRegistry;
import org.alfresco.transformer.executors.TikaJavaExecutor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.actuate.autoconfigure.metrics.MeterRegistryCustomizer;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
import org.springframework.boot.context.event.ApplicationReadyEvent;
import org.springframework.context.annotation.Bean;
import org.springframework.context.event.EventListener;
import java.util.Arrays;
import static org.alfresco.transformer.logging.StandardMessages.LICENCE;
/**
 * Spring Boot entry point. Registers a common {@code containerName} metrics tag and, once the
 * application is ready, logs the licence texts between two separator lines.
 */
@SpringBootApplication
@EnableAutoConfiguration(exclude = {DataSourceAutoConfiguration.class})
public class Application
{
    private static final Logger logger = LoggerFactory.getLogger(Application.class);

    /** Value of the {@code container.name} property, used as a common metrics tag. */
    @Value("${container.name}")
    private String containerName;

    /**
     * Adds the {@code containerName} tag to every meter registered with the registry.
     *
     * @return the customizer applied to the {@link MeterRegistry}
     */
    @Bean
    MeterRegistryCustomizer<MeterRegistry> metricsCommonTags()
    {
        return meterRegistry -> meterRegistry.config().commonTags("containerName", containerName);
    }

    /**
     * Starts the Spring application.
     *
     * @param args command line arguments handed straight to {@link SpringApplication#run}
     */
    public static void main(String[] args)
    {
        SpringApplication.run(Application.class, args);
    }

    /**
     * Logs the licence messages once the {@link ApplicationReadyEvent} has been published.
     */
    @EventListener(ApplicationReadyEvent.class)
    public void startup()
    {
        final String separator = "--------------------------------------------------------------------------------------------------------------------------------------------------------------";

        logger.info(separator);
        logLines(LICENCE);
        logLines(TikaJavaExecutor.LICENCE);
        logger.info(separator);
        logger.info("Starting application components... Done");
    }

    /**
     * Writes each line of a multi-line message to the log as a separate INFO entry.
     *
     * @param text message whose lines are separated by {@code \n}
     */
    private static void logLines(String text)
    {
        for (String line : text.split("\\n"))
        {
            logger.info(line);
        }
    }
}

View File

@@ -1,111 +0,0 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer;
import org.alfresco.transformer.executors.TikaJavaExecutor;
import org.alfresco.transformer.probes.ProbeTestTransform;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Controller;
import java.io.File;
import java.util.HashMap;
import java.util.Map;
import static org.alfresco.transformer.executors.Tika.PDF_BOX;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_PDF;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
/**
* Controller for the Docker based Tika transformers.
*
* Status Codes:
*
* 200 Success
* 400 Bad Request: Invalid target mimetype <mimetype>
* 400 Bad Request: Request parameter <name> is missing (missing mandatory parameter)
* 400 Bad Request: Request parameter <name> is of the wrong type
* 400 Bad Request: Transformer exit code was not 0 (possible problem with the source file)
* 400 Bad Request: The source filename was not supplied
* 500 Internal Server Error: (no message with low level IO problems)
* 500 Internal Server Error: The target filename was not supplied (should not happen as targetExtension is checked)
* 500 Internal Server Error: Transformer version check exit code was not 0
* 500 Internal Server Error: Transformer version check failed to create any output
* 500 Internal Server Error: Could not read the target file
* 500 Internal Server Error: The target filename was malformed (should not happen because of other checks)
* 500 Internal Server Error: Transformer failed to create an output file (the exit code was 0, so there should be some content)
* 500 Internal Server Error: Filename encoding error
* 507 Insufficient Storage: Failed to store the source file
*/
@Controller
public class TikaController extends AbstractTransformerController
{
    private static final Logger logger = LoggerFactory.getLogger(TikaController.class);

    /** Executor every transform, extract or embed request is delegated to. */
    private TikaJavaExecutor javaExecutor;

    /**
     * @param notExtractBookmarksTextDefault value of the
     *        {@code transform.core.tika.pdfBox.notExtractBookmarksTextDefault} property
     *        ({@code false} when unset), handed to the {@link TikaJavaExecutor}
     */
    public TikaController(@Value("${transform.core.tika.pdfBox.notExtractBookmarksTextDefault:false}") boolean notExtractBookmarksTextDefault)
    {
        this.javaExecutor = new TikaJavaExecutor(notExtractBookmarksTextDefault);
    }

    @Override
    public String getTransformerName()
    {
        return "Tika";
    }

    @Override
    public String version()
    {
        return "Tika available";
    }

    /**
     * Builds the probe transform, which converts {@code quick.pdf} to {@code quick.txt} using the
     * {@code PDF_BOX} transform.
     */
    @Override
    public ProbeTestTransform getProbeTestTransform()
    {
        // See the Javadoc on this method and Probes.md for the choice of these values.
        // the livenessPercentage is a little large as Tika does tend to suffer from slow transforms that clash with a gc.
        ProbeTestTransform probe = new ProbeTestTransform(this, "quick.pdf", "quick.txt",
            60, 16, 400, 10240, 60 * 30 + 1, 60 * 15 + 20)
        {
            @Override
            protected void executeTransformCommand(File sourceFile, File targetFile)
            {
                // A fresh, mutable options map is created for every probe run.
                transformImpl(PDF_BOX, MIMETYPE_PDF, MIMETYPE_TEXT_PLAIN, new HashMap<>(), sourceFile, targetFile);
            }
        };
        return probe;
    }

    /**
     * Delegates the request unchanged to {@link TikaJavaExecutor#transformExtractOrEmbed}.
     */
    @Override
    public void transformImpl(String transformName, String sourceMimetype, String targetMimetype,
        Map<String, String> transformOptions, File sourceFile, File targetFile)
    {
        javaExecutor.transformExtractOrEmbed(
            transformName, sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile);
    }
}

View File

@@ -24,30 +24,30 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer;
package org.alfresco.transform.tika;
import static java.nio.file.Files.readAllBytes;
import static org.alfresco.transform.common.RequestParamMap.ENDPOINT_TRANSFORM;
import static org.alfresco.transformer.executors.Tika.ARCHIVE;
import static org.alfresco.transformer.executors.Tika.CSV;
import static org.alfresco.transformer.executors.Tika.DOC;
import static org.alfresco.transformer.executors.Tika.DOCX;
import static org.alfresco.transformer.executors.Tika.HTML;
import static org.alfresco.transformer.executors.Tika.MSG;
import static org.alfresco.transformer.executors.Tika.OUTLOOK_MSG;
import static org.alfresco.transformer.executors.Tika.PDF;
import static org.alfresco.transformer.executors.Tika.PDF_BOX;
import static org.alfresco.transformer.executors.Tika.POI;
import static org.alfresco.transformer.executors.Tika.POI_OFFICE;
import static org.alfresco.transformer.executors.Tika.POI_OO_XML;
import static org.alfresco.transformer.executors.Tika.PPTX;
import static org.alfresco.transformer.executors.Tika.TEXT_MINING;
import static org.alfresco.transformer.executors.Tika.TIKA_AUTO;
import static org.alfresco.transformer.executors.Tika.TXT;
import static org.alfresco.transformer.executors.Tika.XHTML;
import static org.alfresco.transformer.executors.Tika.XML;
import static org.alfresco.transformer.executors.Tika.XSLX;
import static org.alfresco.transformer.executors.Tika.ZIP;
import static org.alfresco.transform.tika.transformers.Tika.ARCHIVE;
import static org.alfresco.transform.tika.transformers.Tika.CSV;
import static org.alfresco.transform.tika.transformers.Tika.DOC;
import static org.alfresco.transform.tika.transformers.Tika.DOCX;
import static org.alfresco.transform.tika.transformers.Tika.HTML;
import static org.alfresco.transform.tika.transformers.Tika.MSG;
import static org.alfresco.transform.tika.transformers.Tika.OUTLOOK_MSG;
import static org.alfresco.transform.tika.transformers.Tika.PDF;
import static org.alfresco.transform.tika.transformers.Tika.PDF_BOX;
import static org.alfresco.transform.tika.transformers.Tika.POI;
import static org.alfresco.transform.tika.transformers.Tika.OFFICE;
import static org.alfresco.transform.tika.transformers.Tika.OOXML;
import static org.alfresco.transform.tika.transformers.Tika.PPTX;
import static org.alfresco.transform.tika.transformers.Tika.TEXT_MINING;
import static org.alfresco.transform.tika.transformers.Tika.TIKA_AUTO;
import static org.alfresco.transform.tika.transformers.Tika.TXT;
import static org.alfresco.transform.tika.transformers.Tika.XHTML;
import static org.alfresco.transform.tika.transformers.Tika.XML;
import static org.alfresco.transform.tika.transformers.Tika.XSLX;
import static org.alfresco.transform.tika.transformers.Tika.ZIP;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_METADATA_EMBED;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENXML_PRESENTATION;
@@ -61,8 +61,8 @@ import static org.alfresco.transform.common.Mimetype.MIMETYPE_WORD;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XHTML;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XML;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_ZIP;
import static org.alfresco.transformer.util.RequestParamMap.INCLUDE_CONTENTS;
import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT;
import static org.alfresco.transform.base.util.RequestParamMap.INCLUDE_CONTENTS;
import static org.alfresco.transform.base.util.RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
@@ -91,12 +91,14 @@ import java.util.UUID;
import javax.servlet.http.HttpServletRequest;
import org.alfresco.transform.base.AbstractTransformerControllerTest;
import org.alfresco.transform.base.TransformController;
import org.alfresco.transform.client.model.TransformReply;
import org.alfresco.transform.client.model.TransformRequest;
import org.alfresco.transformer.executors.RuntimeExec;
import org.alfresco.transformer.model.FileRefEntity;
import org.alfresco.transformer.model.FileRefResponse;
import org.alfresco.transformer.probes.ProbeTestTransform;
import org.alfresco.transform.base.executors.RuntimeExec;
import org.alfresco.transform.base.model.FileRefEntity;
import org.alfresco.transform.base.model.FileRefResponse;
import org.alfresco.transform.base.probes.ProbeTestTransform;
import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.junit.jupiter.api.BeforeEach;
@@ -142,9 +144,6 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
@Mock
private RuntimeExec mockCheckCommand;
@Autowired
protected AbstractTransformerController controller;
private String targetEncoding = "UTF-8";
private String targetMimetype = MIMETYPE_TEXT_PLAIN;
@@ -236,12 +235,6 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
when(mockExecutionResult.getStdOut()).thenReturn("STDOUT");
}
@Override
protected AbstractTransformerController getController()
{
return controller;
}
private void transform(String transform, String sourceExtension, String targetExtension,
String sourceMimetype, String targetMimetype,
Boolean includeContents, String expectedContentContains) throws Exception
@@ -284,9 +277,10 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
public void testImmutableEmptyMap()
{
// See ACS-373
ProbeTestTransform probeTestTransform = getController().getProbeTestTransform();
TransformController controller = getController();
ProbeTestTransform probeTestTransform = getProbeTestTransform();
ReflectionTestUtils.setField(probeTestTransform, "livenessTransformEnabled", true);
probeTestTransform.doTransformOrNothing(httpServletRequest, true);
probeTestTransform.doTransformOrNothing(httpServletRequest, true, controller);
}
@Test
@@ -483,14 +477,14 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
@Test
public void msgToTxtOfficeTest() throws Exception
{
transform(POI_OFFICE, MSG, TXT, MIMETYPE_OUTLOOK_MSG, MIMETYPE_TEXT_PLAIN, null,
transform(OFFICE, MSG, TXT, MIMETYPE_OUTLOOK_MSG, MIMETYPE_TEXT_PLAIN, null,
EXPECTED_MSG_CONTENT_CONTAINS);
}
@Test
public void docToTxtOfficeTest() throws Exception
{
transform(POI_OFFICE, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null,
transform(OFFICE, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null,
EXPECTED_TEXT_CONTENT_CONTAINS);
}
@@ -508,14 +502,14 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
@Test
public void docxToTxtOoXmlTest() throws Exception
{
transform(POI_OO_XML, DOCX, TXT, MIMETYPE_OPENXML_WORDPROCESSING, MIMETYPE_TEXT_PLAIN, null,
transform(OOXML, DOCX, TXT, MIMETYPE_OPENXML_WORDPROCESSING, MIMETYPE_TEXT_PLAIN, null,
EXPECTED_TEXT_CONTENT_CONTAINS);
}
@Test
public void pptxToTxtOoXmlTest() throws Exception
{
transform(POI_OO_XML, PPTX, TXT, MIMETYPE_OPENXML_PRESENTATION, MIMETYPE_TEXT_PLAIN, null,
transform(OOXML, PPTX, TXT, MIMETYPE_OPENXML_PRESENTATION, MIMETYPE_TEXT_PLAIN, null,
EXPECTED_TEXT_CONTENT_CONTAINS);
}

View File

@@ -24,15 +24,11 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer;
import static org.springframework.http.MediaType.MULTIPART_FORM_DATA;
package org.alfresco.transform.tika;
import org.alfresco.transform.base.AbstractHttpRequestTest;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.context.SpringBootTest.WebEnvironment;
import org.springframework.core.io.ClassPathResource;
import org.springframework.http.HttpEntity;
import org.springframework.http.HttpHeaders;
import org.springframework.util.LinkedMultiValueMap;
/**

View File

@@ -24,11 +24,11 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer;
package org.alfresco.transform.tika;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_APP_DWG;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OUTLOOK_MSG;
import static org.alfresco.transformer.TestFileInfo.testFile;
import static org.alfresco.transform.base.TestFileInfo.testFile;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_AUDIO_MP4;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_EXCEL;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_BMP;
@@ -74,6 +74,8 @@ import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_RAW_NEF;
import java.util.stream.Stream;
import org.alfresco.transform.base.AbstractMetadataExtractsIT;
import org.alfresco.transform.base.TestFileInfo;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;

View File

@@ -24,7 +24,7 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer;
package org.alfresco.transform.tika;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENXML_WORDPROCESSING;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
@@ -32,6 +32,7 @@ import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
import java.util.UUID;
import org.alfresco.transform.client.model.TransformRequest;
import org.alfresco.transform.base.AbstractQueueTransformServiceIT;
import org.springframework.boot.test.context.SpringBootTest;
/**

View File

@@ -24,11 +24,11 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer;
package org.alfresco.transform.tika;
import static java.text.MessageFormat.format;
import static java.util.function.Function.identity;
import static org.alfresco.transformer.EngineClient.sendTRequest;
import static org.alfresco.transform.base.EngineClient.sendTRequest;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.fail;
import static org.springframework.http.HttpStatus.OK;

View File

@@ -14,7 +14,7 @@
<dependencies>
<dependency>
<groupId>org.alfresco</groupId>
<artifactId>alfresco-transformer-base</artifactId>
<artifactId>alfresco-t-engine-base</artifactId>
<version>${project.version}</version>
</dependency>

View File

@@ -0,0 +1,78 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika;
import org.alfresco.transform.base.TransformEngine;
import org.alfresco.transform.base.probes.ProbeTestTransform;
import org.alfresco.transform.common.TransformConfigResourceReader;
import org.alfresco.transform.config.TransformConfig;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import java.util.Collections;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_PDF;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
/**
 * {@link TransformEngine} component for the Tika t-engine. Supplies the engine name, the licence
 * text used as the startup message, the transform configuration read from
 * {@code transform.core.config.location}, and the probe transform.
 */
@Component
public class TikaTransformEngine implements TransformEngine
{
    private static final String LICENCE =
        "This transformer uses Tika from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0. or in /Apache\\ 2.0.txt\n" +
        "This transformer uses ExifTool by Phil Harvey. See license at https://exiftool.org/#license. or in /Perl-Artistic-License.txt";

    @Autowired
    private TransformConfigResourceReader transformConfigResourceReader;

    /** Location of the engine configuration; defaults to {@code classpath:engine_config.json}. */
    @Value("${transform.core.config.location:classpath:engine_config.json}")
    private String engineConfigLocation;

    /**
     * @return the fixed engine name {@code "0001-Tika"}
     */
    @Override
    public String getTransformEngineName()
    {
        return "0001-Tika";
    }

    /**
     * @return the licence text for Tika and ExifTool
     */
    @Override
    public String getStartupMessage()
    {
        return LICENCE;
    }

    /**
     * @return the configuration read from {@link #engineConfigLocation}
     */
    @Override
    public TransformConfig getTransformConfig()
    {
        final TransformConfig config = transformConfigResourceReader.read(engineConfigLocation);
        return config;
    }

    /**
     * @return a probe that transforms {@code quick.pdf} (PDF) to {@code quick.txt} (plain text)
     *         with no transform options
     */
    @Override
    public ProbeTestTransform getLivenessAndReadinessProbeTestTransform()
    {
        // NOTE(review): Collections.emptyMap() is immutable - confirm that nothing downstream
        // adds entries to the probe's transform options.
        final ProbeTestTransform probe = new ProbeTestTransform("quick.pdf", "quick.txt",
            MIMETYPE_PDF, MIMETYPE_TEXT_PLAIN, Collections.emptyMap(),
            60, 16, 400, 10240, 60 * 30 + 1, 60 * 15 + 20);
        return probe;
    }
}

View File

@@ -24,13 +24,15 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.metadataExtractors;
package org.alfresco.transform.tika.metadataExtractors;
import org.alfresco.transform.base.CustomTransformer;
import org.alfresco.transform.common.TransformException;
import org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor;
import org.apache.tika.embedder.Embedder;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
@@ -67,6 +69,8 @@ import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
/**
* The parent of all Metadata Extractors which use Apache Tika under the hood. This handles all the
* common parts of processing the files, and the common mappings.
@@ -82,7 +86,7 @@ import java.util.stream.Stream;
* @author Nick Burch
* @author adavis
*/
public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtractor
public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtractor implements CustomTransformer
{
protected static final String KEY_AUTHOR = "author";
protected static final String KEY_TITLE = "title";
@@ -97,9 +101,17 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
private final DateTimeFormatter tikaUTCDateFormater;
private final DateTimeFormatter tikaDateFormater;
public AbstractTikaMetadataExtractor(Logger logger)
/**
 * Selects which operation {@code transform(...)} performs for an instance: metadata extraction
 * or metadata embedding. Nested enums are implicitly static, so no {@code static} modifier is
 * needed.
 */
public enum Type
{
    /** The instance extracts metadata from the source. */
    EXTRACTOR,

    /** The instance embeds metadata into the source. */
    EMBEDDER
}
private final Type type;
public AbstractTikaMetadataExtractor(Type type, Logger logger)
{
super(logger);
this.type = type;
// TODO Once TIKA-451 is fixed this list will get nicer
DateTimeParser[] parsersUTC = {
@@ -118,6 +130,26 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
tikaDateFormater = new DateTimeFormatterBuilder().append(null, parsers).toFormatter();
}
/**
 * Uses the simple class name of the concrete extractor as the transformer name.
 *
 * @return {@code getClass().getSimpleName()} of this instance
 */
@Override
public String getTransformerName()
{
    final Class<?> concreteType = getClass();
    return concreteType.getSimpleName();
}
/**
 * Dispatches to {@code extractMetadata} when this instance was created as an
 * {@code EXTRACTOR}, and to {@code embedMetadata} otherwise.
 * The {@code targetMimetype} argument is not passed on to either operation.
 *
 * @throws Exception whatever the selected operation throws
 */
@Override
public void transform(String sourceMimetype, String sourceEncoding, InputStream inputStream,
    String targetMimetype, String targetEncoding, OutputStream outputStream,
    Map<String, String> transformOptions) throws Exception
{
    if (type != EXTRACTOR)
    {
        embedMetadata(sourceMimetype, transformOptions, sourceEncoding, inputStream, targetEncoding, outputStream);
        return;
    }

    extractMetadata(sourceMimetype, transformOptions, sourceEncoding, inputStream, targetEncoding, outputStream);
}
/**
* Version which also tries the ISO-8601 formats (in order..),
* and similar formats, which Tika makes use of
@@ -308,6 +340,14 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
return rawProperties;
}
/**
 * Placeholder for metadata embedding: not implemented yet, so every call fails.
 *
 * @throws TransformException always, with status 500 and the message {@code "TODO embedMetadata"}
 */
public void embedMetadata(String sourceMimetype, Map<String, String> transformOptions,
    String sourceEncoding, InputStream inputStream,
    String targetEncoding, OutputStream outputStream) throws Exception
{
    // TODO implement embedding; none of the arguments are used until then.
    throw new TransformException(500, "TODO embedMetadata");
}
/**
* @deprecated The content repository's TikaPoweredMetadataExtracter provides no non test implementations.
* This code exists in case there are custom implementations, that need to be converted to T-Engines.

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -24,7 +24,7 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.metadataExtractors;
package org.alfresco.transform.tika.metadataExtractors;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -32,10 +32,13 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.dwg.DWGParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.io.Serializable;
import java.util.Map;
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
/**
* {@code "application/dwg"} and {@code "image/vnd.dwg"} metadata extractor.
*
@@ -53,6 +56,7 @@ import java.util.Map;
* @author Nick Burch
* @author adavis
*/
@Component
public class DWGMetadataExtractor extends AbstractTikaMetadataExtractor
{
private static final Logger logger = LoggerFactory.getLogger(DWGMetadataExtractor.class);
@@ -62,7 +66,7 @@ public class DWGMetadataExtractor extends AbstractTikaMetadataExtractor
public DWGMetadataExtractor()
{
super(logger);
super(EXTRACTOR, logger);
}
@Override

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -24,7 +24,7 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.metadataExtractors;
package org.alfresco.transform.tika.metadataExtractors;
import java.io.Serializable;
import java.util.Arrays;
@@ -33,13 +33,17 @@ import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.alfresco.transformer.tika.parsers.ExifToolParser;
import org.alfresco.transform.tika.parsers.ExifToolParser;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
@Component
public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractor
{
@@ -53,7 +57,7 @@ public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractor
public IPTCMetadataExtractor()
{
super(logger);
super(EXTRACTOR, logger);
}
@Override

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -24,7 +24,7 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.metadataExtractors;
package org.alfresco.transform.tika.metadataExtractors;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -33,10 +33,13 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.mp3.Mp3Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.io.Serializable;
import java.util.Map;
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
/**
* MP3 file metadata extractor.
*
@@ -63,6 +66,7 @@ import java.util.Map;
* @author Nick Burch
* @author adavis
*/
@Component
public class MP3MetadataExtractor extends TikaAudioMetadataExtractor
{
private static final Logger logger = LoggerFactory.getLogger(MP3MetadataExtractor.class);

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -24,7 +24,7 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.metadataExtractors;
package org.alfresco.transform.tika.metadataExtractors;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
@@ -33,10 +33,13 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.io.Serializable;
import java.util.Map;
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
/**
* Outlook MAPI format email metadata extractor.
*
@@ -59,6 +62,7 @@ import java.util.Map;
* @author Kevin Roast
* @author adavis
*/
@Component
public class MailMetadataExtractor extends AbstractTikaMetadataExtractor
{
private static final Logger logger = LoggerFactory.getLogger(MailMetadataExtractor.class);
@@ -74,7 +78,7 @@ public class MailMetadataExtractor extends AbstractTikaMetadataExtractor
public MailMetadataExtractor()
{
super(logger);
super(EXTRACTOR, logger);
}
@Override

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -24,7 +24,7 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.metadataExtractors;
package org.alfresco.transform.tika.metadataExtractors;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -33,10 +33,13 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.io.Serializable;
import java.util.Map;
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
/**
* Office file format metadata extractor.
*
@@ -67,6 +70,7 @@ import java.util.Map;
* @author Nick Burch
* @author adavis
*/
@Component
public class OfficeMetadataExtractor extends AbstractTikaMetadataExtractor
{
private static final Logger logger = LoggerFactory.getLogger(OfficeMetadataExtractor.class);
@@ -84,7 +88,7 @@ public class OfficeMetadataExtractor extends AbstractTikaMetadataExtractor
public OfficeMetadataExtractor()
{
super(logger);
super(EXTRACTOR, logger);
}
@Override

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -24,8 +24,9 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.metadataExtractors;
package org.alfresco.transform.tika.metadataExtractors;
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
import static org.apache.tika.metadata.DublinCore.NAMESPACE_URI_DC;
import org.apache.tika.metadata.Metadata;
@@ -40,6 +41,7 @@ import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import org.xml.sax.ContentHandler;
import java.io.Serializable;
@@ -76,6 +78,7 @@ import java.util.stream.Collectors;
* @author Derek Hulley
* @author adavis
*/
@Component
public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
{
private static final Logger logger = LoggerFactory.getLogger(OpenDocumentMetadataExtractor.class);
@@ -95,7 +98,7 @@ public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
public OpenDocumentMetadataExtractor()
{
super(logger);
super(EXTRACTOR, logger);
}
@Override

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -24,15 +24,18 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.metadataExtractors;
package org.alfresco.transform.tika.metadataExtractors;
import org.alfresco.transformer.executors.Tika;
import org.alfresco.transform.tika.transformers.Tika;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pdf.PDFParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
/**
* Metadata extractor for the PDF documents.
@@ -52,13 +55,14 @@ import org.slf4j.LoggerFactory;
* @author Derek Hulley
* @author adavis
*/
@Component
public class PdfBoxMetadataExtractor extends AbstractTikaMetadataExtractor
{
private static final Logger logger = LoggerFactory.getLogger(PdfBoxMetadataExtractor.class);
public PdfBoxMetadataExtractor()
{
super(logger);
super(EXTRACTOR, logger);
}
@Override

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -24,7 +24,7 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.metadataExtractors;
package org.alfresco.transform.tika.metadataExtractors;
import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
@@ -36,6 +36,7 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.io.InputStream;
@@ -44,6 +45,8 @@ import java.util.Collections;
import java.util.Set;
import java.util.StringJoiner;
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
/**
* POI-based metadata extractor for Office 07 documents. See http://poi.apache.org/ for information on POI.
*
@@ -92,13 +95,14 @@ import java.util.StringJoiner;
* @author Dmitry Velichkevich
* @author adavis
*/
@Component
public class PoiMetadataExtractor extends AbstractTikaMetadataExtractor
{
private static final Logger logger = LoggerFactory.getLogger(PoiMetadataExtractor.class);
public PoiMetadataExtractor()
{
super(logger);
super(EXTRACTOR, logger);
}
@Override

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -24,7 +24,7 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.metadataExtractors;
package org.alfresco.transform.tika.metadataExtractors;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
@@ -37,12 +37,14 @@ import org.gagravarr.tika.FlacParser;
import org.gagravarr.tika.VorbisParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.io.Serializable;
import java.util.Calendar;
import java.util.Map;
import static org.alfresco.transformer.executors.Tika.readTikaConfig;
import static org.alfresco.transform.tika.transformers.Tika.readTikaConfig;
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
/**
* A Metadata Extractor which makes use of the Apache Tika Audio Parsers to extract metadata from media files.
@@ -66,6 +68,7 @@ import static org.alfresco.transformer.executors.Tika.readTikaConfig;
* @author Nick Burch
* @author adavis
*/
@Component
public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractor
{
private static final Logger logger = LoggerFactory.getLogger(TikaAudioMetadataExtractor.class);
@@ -86,7 +89,7 @@ public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractor
public TikaAudioMetadataExtractor(Logger logger)
{
super(logger);
super(EXTRACTOR, logger);
tikaConfig = readTikaConfig(logger);
}

View File

@@ -24,9 +24,8 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.metadataExtractors;
package org.alfresco.transform.tika.metadataExtractors;
import org.alfresco.transform.common.Mimetype;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TIFF;
@@ -34,12 +33,14 @@ import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.io.Serializable;
import java.util.Map;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
import static org.alfresco.transformer.executors.Tika.readTikaConfig;
import static org.alfresco.transform.tika.transformers.Tika.readTikaConfig;
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
/**
* A Metadata Extractor which makes use of the Apache Tika auto-detection to select the best parser to extract the
@@ -61,6 +62,7 @@ import static org.alfresco.transformer.executors.Tika.readTikaConfig;
* @author Nick Burch
* @author adavis
*/
@Component
public class TikaAutoMetadataExtractor extends AbstractTikaMetadataExtractor
{
private static final Logger logger = LoggerFactory.getLogger(TikaAutoMetadataExtractor.class);
@@ -75,7 +77,7 @@ public class TikaAutoMetadataExtractor extends AbstractTikaMetadataExtractor
public TikaAutoMetadataExtractor()
{
super(logger);
super(EXTRACTOR, logger);
tikaConfig = readTikaConfig(logger);
}

View File

@@ -24,7 +24,7 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.tika.parsers;
package org.alfresco.transform.tika.parsers;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -24,7 +24,7 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.executors;
package org.alfresco.transform.tika.parsers;
import java.io.IOException;
import java.io.InputStream;

View File

@@ -0,0 +1,42 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.transformers;
import org.apache.tika.parser.Parser;
import org.springframework.stereotype.Component;
import javax.annotation.PostConstruct;
/**
 * Spring component that plugs the package parser held by {@link Tika} into the generic
 * Tika transform flow implemented by {@link GenericTikaTransformer}.
 */
@Component
public class ArchiveTransformer extends GenericTikaTransformer
{
    /**
     * @return the parser exposed as {@code packageParser} on the injected {@link Tika} bean
     */
    @Override
    protected Parser getParser()
    {
        return this.tika.packageParser;
    }
}

View File

@@ -0,0 +1,146 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.transformers;
import org.alfresco.transform.base.CustomTransformer;
import org.alfresco.transform.base.logging.LogEntry;
import org.alfresco.transform.base.util.RequestParamMap;
import org.alfresco.transform.common.TransformException;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.parser.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import java.io.File;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Map;
import java.util.StringJoiner;
import static java.lang.Boolean.parseBoolean;
/**
 * Base class of the Tika based {@link CustomTransformer}s. A subclass only supplies the
 * {@link Parser} (and optionally a {@link DocumentSelector}); this class turns the transform
 * options into the argument array understood by {@link Tika#transform(Parser, DocumentSelector, String[])}.
 */
public abstract class GenericTikaTransformer implements CustomTransformer
{
    private static final Logger logger = LoggerFactory.getLogger(GenericTikaTransformer.class);

    /** Suffix removed from the simple class name to derive the transformer name. */
    private static final String TRANSFORMER_SUFFIX = "Transformer";

    @Value("${transform.core.tika.pdfBox.notExtractBookmarksTextDefault:false}")
    boolean notExtractBookmarksTextDefault;

    @Autowired
    protected Tika tika;

    /**
     * @return the Tika parser this transformer passes to {@link Tika}
     */
    protected abstract Parser getParser();

    /**
     * @return the selector passed to {@link Tika} along with the parser; {@code null} unless a subclass overrides this
     */
    protected DocumentSelector getDocumentSelector()
    {
        return null;
    }

    /**
     * Derives the transformer name from the simple class name by dropping a trailing
     * {@code "Transformer"}, e.g. {@code PdfBoxTransformer} becomes {@code PdfBox}.
     *
     * @return the simple class name without the suffix, or the unchanged simple class name if
     *         it does not end with the suffix
     */
    @Override
    public String getTransformerName()
    {
        String simpleClassName = getClass().getSimpleName();
        // Only strip when the suffix is really there; blindly removing 11 characters would
        // mangle (or fail on) a class that does not follow the naming convention.
        return simpleClassName.endsWith(TRANSFORMER_SUFFIX)
            ? simpleClassName.substring(0, simpleClassName.length() - TRANSFORMER_SUFFIX.length())
            : simpleClassName;
    }

    /**
     * Stream based transform. Not implemented yet.
     *
     * @throws TransformException always, with status 500
     */
    @Override
    public void transform(String sourceMimetype, String sourceEncoding, InputStream inputStream,
        String targetMimetype, String targetEncoding, OutputStream outputStream,
        Map<String, String> transformOptions) throws Exception
    {
        // TODO
        throw new TransformException(500, "TODO GenericTikaTransformer transform with InputStreams");
    }

    /**
     * File based transform: reads the supported options and delegates to {@link #call(File, File, String...)}.
     *
     * @param transformName    passed on as the first positional argument
     * @param sourceMimetype   not used by this method
     * @param targetMimetype   appended to {@link Tika#TARGET_MIMETYPE}
     * @param transformOptions options; {@code includeContents} defaults to {@code false},
     *                         {@code notExtractBookmarksText} to the configured default and
     *                         {@code targetEncoding} to {@code UTF-8}
     * @param sourceFile       file to read
     * @param targetFile       file to write
     */
    public void transform(String transformName, String sourceMimetype, String targetMimetype,
        Map<String, String> transformOptions, File sourceFile, File targetFile)
        throws Exception
    {
        final boolean includeContents = parseBoolean(
            transformOptions.getOrDefault(RequestParamMap.INCLUDE_CONTENTS, "false"));
        final boolean notExtractBookmarksText = parseBoolean(
            transformOptions.getOrDefault(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT, String.valueOf(notExtractBookmarksTextDefault)));
        final String targetEncoding = transformOptions.getOrDefault("targetEncoding", "UTF-8");

        // Trace when the request left the option out and the configured default (true) kicks in.
        if (transformOptions.get(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT) == null && notExtractBookmarksTextDefault)
        {
            logger.trace("notExtractBookmarksText default value has been overridden to {}", notExtractBookmarksTextDefault);
        }

        // Flags that are switched off are passed as null and dropped by buildArgs.
        call(sourceFile, targetFile, transformName,
            includeContents ? Tika.INCLUDE_CONTENTS : null,
            notExtractBookmarksText ? Tika.NOT_EXTRACT_BOOKMARKS_TEXT : null,
            Tika.TARGET_MIMETYPE + targetMimetype, Tika.TARGET_ENCODING + targetEncoding);
    }

    /**
     * Runs the Tika transform with this transformer's parser and document selector.
     *
     * @param sourceFile appended to the arguments as the source path
     * @param targetFile appended to the arguments as the target path
     * @param args       transform name and option arguments; {@code null} entries are ignored
     */
    void call(File sourceFile, File targetFile, String... args)
    {
        Parser parser = getParser();
        DocumentSelector documentSelector = getDocumentSelector();
        args = buildArgs(sourceFile, targetFile, args);
        tika.transform(parser, documentSelector, args);
    }

    /**
     * Builds the final argument array (non-null {@code args} followed by the absolute source and
     * target paths) and records a space separated summary of it via {@link LogEntry#setOptions}.
     */
    private static String[] buildArgs(File sourceFile, File targetFile, String[] args)
    {
        ArrayList<String> methodArgs = new ArrayList<>(args.length + 2);
        StringJoiner sj = new StringJoiner(" ");
        for (String arg : args)
        {
            addArg(methodArgs, sj, arg);
        }

        addFileArg(methodArgs, sj, sourceFile);
        addFileArg(methodArgs, sj, targetFile);

        LogEntry.setOptions(sj.toString());

        return methodArgs.toArray(new String[0]);
    }

    /** Adds a non-null argument to both the argument list and the log summary. */
    private static void addArg(ArrayList<String> methodArgs, StringJoiner sj, String arg)
    {
        if (arg != null)
        {
            sj.add(arg);
            methodArgs.add(arg);
        }
    }

    /**
     * Adds the absolute path of a non-null file to the argument list; the log summary only gets
     * the file extension ({@code "???"} if the file name has none).
     */
    private static void addFileArg(ArrayList<String> methodArgs, StringJoiner sj, File arg)
    {
        if (arg != null)
        {
            // Look for the extension in the file name only: a dot in a parent directory
            // (e.g. /tmp/a.b/file) must not be mistaken for an extension separator.
            String fileName = arg.getName();
            int i = fileName.lastIndexOf('.');
            String ext = i == -1 ? "???" : fileName.substring(i + 1);
            sj.add(ext);
            methodArgs.add(arg.getAbsolutePath());
        }
    }
}

View File

@@ -0,0 +1,40 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.transformers;
import org.apache.tika.parser.Parser;
import org.springframework.stereotype.Component;
/**
 * Spring component that plugs the OOXML parser held by {@link Tika} into the generic
 * Tika transform flow implemented by {@link GenericTikaTransformer}.
 */
@Component
public class OOXMLTransformer extends GenericTikaTransformer
{
    /**
     * @return the parser exposed as {@code ooXmlParser} on the injected {@link Tika} bean
     */
    @Override
    protected Parser getParser()
    {
        return this.tika.ooXmlParser;
    }
}

View File

@@ -0,0 +1,40 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.transformers;
import org.apache.tika.parser.Parser;
import org.springframework.stereotype.Component;
/**
 * Spring component that plugs the office parser held by {@link Tika} into the generic
 * Tika transform flow implemented by {@link GenericTikaTransformer}.
 */
@Component
public class OfficeTransformer extends GenericTikaTransformer
{
    /**
     * @return the parser exposed as {@code officeParser} on the injected {@link Tika} bean
     */
    @Override
    protected Parser getParser()
    {
        return this.tika.officeParser;
    }
}

View File

@@ -0,0 +1,40 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.transformers;
import org.apache.tika.parser.Parser;
import org.springframework.stereotype.Component;
/**
 * Spring component registered under its own transformer name that reuses the office parser
 * held by {@link Tika} for the generic Tika transform flow of {@link GenericTikaTransformer}.
 */
@Component
public class OutlookMsgTransformer extends GenericTikaTransformer
{
    /**
     * @return the parser exposed as {@code officeParser} on the injected {@link Tika} bean
     */
    @Override
    protected Parser getParser()
    {
        return this.tika.officeParser;
    }
}

View File

@@ -0,0 +1,47 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.transformers;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.parser.Parser;
import org.springframework.stereotype.Component;
/**
 * Spring component that plugs the PDF parser held by {@link Tika} into the generic Tika
 * transform flow implemented by {@link GenericTikaTransformer}, together with the PDF specific
 * selector for embedded documents.
 */
@Component
public class PdfBoxTransformer extends GenericTikaTransformer
{
    /**
     * @return the parser exposed as {@code pdfParser} on the injected {@link Tika} bean
     */
    @Override
    protected Parser getParser()
    {
        return this.tika.pdfParser;
    }

    /**
     * @return the selector exposed as {@code pdfBoxEmbededDocumentSelector} on the injected
     *         {@link Tika} bean, replacing the {@code null} default of the base class
     */
    @Override
    protected DocumentSelector getDocumentSelector()
    {
        return this.tika.pdfBoxEmbededDocumentSelector;
    }
}

View File

@@ -0,0 +1,40 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.transformers;
import org.apache.tika.parser.Parser;
import org.springframework.stereotype.Component;
/**
 * Spring component that plugs the office detecting parser held by {@link Tika} into the
 * generic Tika transform flow implemented by {@link GenericTikaTransformer}.
 */
@Component
public class PoiTransformer extends GenericTikaTransformer
{
    /**
     * @return the parser exposed as {@code tikaOfficeDetectParser} on the injected {@link Tika} bean
     */
    @Override
    protected Parser getParser()
    {
        return this.tika.tikaOfficeDetectParser;
    }
}

View File

@@ -0,0 +1,40 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.transformers;
import org.apache.tika.parser.Parser;
import org.springframework.stereotype.Component;
/**
 * Spring component registered under its own transformer name that reuses the office parser
 * held by {@link Tika} for the generic Tika transform flow of {@link GenericTikaTransformer}.
 */
@Component
public class TextMiningTransformer extends GenericTikaTransformer
{
    /**
     * @return the parser exposed as {@code officeParser} on the injected {@link Tika} bean
     */
    @Override
    protected Parser getParser()
    {
        return this.tika.officeParser;
    }
}

View File

@@ -0,0 +1,446 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.transformers;
import com.google.common.collect.ImmutableList;
import org.alfresco.transform.tika.parsers.TikaOfficeDetectParser;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.parser.pkg.PackageParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.slf4j.Logger;
import org.springframework.stereotype.Component;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URL;
import java.util.List;
import java.util.regex.Pattern;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_PNG;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_TIFF;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_CSV;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XHTML;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XML;
@Component
public class Tika
{
// Names that match what GenericTikaTransformer.getTransformerName() derives from the
// *Transformer class names (class name minus the "Transformer" suffix).
public static final String ARCHIVE = "Archive";
public static final String OUTLOOK_MSG = "OutlookMsg";
public static final String PDF_BOX = "PdfBox";
public static final String OFFICE = "Office";
public static final String POI = "Poi";
public static final String OOXML = "OOXML";
public static final String TIKA_AUTO = "TikaAuto";
public static final String TEXT_MINING = "TextMining";
// Option prefixes/flags recognised by transform(Parser, DocumentSelector, String[]).
// The two ending in '=' carry a value directly after the prefix; the other two are bare flags.
public static final String TARGET_MIMETYPE = "--targetMimetype=";
public static final String TARGET_ENCODING = "--targetEncoding=";
public static final String INCLUDE_CONTENTS = "--includeContents";
public static final String NOT_EXTRACT_BOOKMARKS_TEXT = "--notExtractBookmarksText";
// Short type names, presumably file extensions. HTML and XML are also used by
// getContentHandler as the javax.xml.transform output method.
public static final String CSV = "csv";
public static final String DOC = "doc";
public static final String DOCX = "docx";
public static final String HTML = "html";
public static final String MSG = "msg";
public static final String PDF = "pdf";
public static final String PPTX = "pptx";
public static final String TXT = "txt";
public static final String XHTML = "xhtml";
// NOTE(review): "xslx" looks like a misspelling of "xlsx" - confirm against callers before changing, the constant is public.
public static final String XSLX = "xslx";
public static final String XML = "xml";
public static final String ZIP = "zip";
// Parser instances handed out to the transformers. All are static and therefore shared,
// except autoDetectParser which the constructor builds from tika-config.xml.
public static final Parser packageParser = new PackageParser();
public static final Parser pdfParser = new PDFParser();
public static final Parser officeParser = new OfficeParser();
public final Parser autoDetectParser;
public static final Parser ooXmlParser = new OOXMLParser();
public static final Parser tikaOfficeDetectParser = new TikaOfficeDetectParser();
public final PDFParserConfig pdfParserConfig = new PDFParserConfig();
/**
 * Selector that rejects embedded JPEG, TIFF and PNG documents. Anything else, including
 * entries without a content type, is selected.
 */
public static final DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector()
{
    private final List<String> disabledMediaTypes = ImmutableList.of(MIMETYPE_IMAGE_JPEG,
        MIMETYPE_IMAGE_TIFF, MIMETYPE_IMAGE_PNG);

    @Override
    public boolean select(Metadata metadata)
    {
        final String mediaType = metadata.get(Metadata.CONTENT_TYPE);
        // A missing or empty content type cannot be matched against the disabled list.
        final boolean unknownType = mediaType == null || mediaType.isEmpty();
        return unknownType || !disabledMediaTypes.contains(mediaType);
    }
};
/**
 * Creates the auto-detect parser from the {@code tika-config.xml} classpath resource.
 *
 * @throws TikaException if reading the configuration fails
 * @throws IOException   if reading the configuration fails
 * @throws SAXException  if reading the configuration fails
 */
public Tika() throws TikaException, IOException, SAXException
{
    this.autoDetectParser = new AutoDetectParser(readTikaConfig());
}
/**
 * Reads {@code tika-config.xml}, logging rather than propagating any failure.
 *
 * @param logger receives the error if the configuration cannot be read
 * @return the configuration, or {@code null} if reading it failed
 */
public static TikaConfig readTikaConfig(Logger logger)
{
    TikaConfig config = null;
    try
    {
        config = readTikaConfig();
    }
    catch (Exception e)
    {
        // Deliberately broad: callers get null instead of an exception.
        logger.error("Failed to read tika-config.xml", e);
    }
    return config;
}
/**
 * Loads {@code tika-config.xml} through the class loader of this class.
 *
 * @return the parsed Tika configuration
 * @throws IOException   if the resource is not on the classpath or cannot be read
 * @throws TikaException if Tika rejects the configuration
 * @throws SAXException  if the XML cannot be parsed
 */
private static TikaConfig readTikaConfig() throws TikaException, IOException, SAXException
{
    ClassLoader classLoader = Tika.class.getClassLoader();
    URL tikaConfigXml = classLoader.getResource("tika-config.xml");
    // getResource returns null when the resource is missing; fail with a clear message
    // instead of passing a null URL on to TikaConfig.
    if (tikaConfigXml == null)
    {
        throw new IOException("tika-config.xml was not found on the classpath");
    }
    return new TikaConfig(tikaConfigXml);
}
/**
 * Extracts the parameters from {@code args} and runs the transform.
 * <p>
 * Arguments starting with {@code --} are options ({@link #INCLUDE_CONTENTS},
 * {@link #NOT_EXTRACT_BOOKMARKS_TEXT}, {@link #TARGET_MIMETYPE}, {@link #TARGET_ENCODING}).
 * The remaining arguments are positional: transform name, source filename, target filename.
 * The transform name is only consumed so the filenames end up in the right position.
 *
 * @param parser           the Tika parser to use
 * @param documentSelector passed on when building the parse context; may be {@code null}
 * @param args             option and positional arguments as described above
 * @throws IllegalArgumentException on an unknown, duplicate or malformed option, on surplus
 *                                  positional arguments, or if the target filename is missing
 */
public void transform(Parser parser, DocumentSelector documentSelector, String[] args)
{
    String transform = null;
    String targetMimetype = null;
    String targetEncoding = null;
    String sourceFilename = null;
    String targetFilename = null;
    Boolean includeContents = null;
    Boolean notExtractBookmarksText = null;

    for (String arg : args)
    {
        if (arg.startsWith("--"))
        {
            // The argument has to start with the option name, as in the other branches.
            // The operands used to be the other way round, so a short prefix such as "--inc"
            // matched and then failed in getValue with a StringIndexOutOfBoundsException.
            if (arg.startsWith(INCLUDE_CONTENTS))
            {
                getValue(arg, false, includeContents, INCLUDE_CONTENTS);
                includeContents = true;
            }
            else if (arg.startsWith(TARGET_ENCODING))
            {
                targetEncoding = getValue(arg, true, targetEncoding, TARGET_ENCODING);
            }
            else if (arg.startsWith(TARGET_MIMETYPE))
            {
                targetMimetype = getValue(arg, true, targetMimetype, TARGET_MIMETYPE);
            }
            else if (arg.startsWith(NOT_EXTRACT_BOOKMARKS_TEXT))
            {
                getValue(arg, false, notExtractBookmarksText, NOT_EXTRACT_BOOKMARKS_TEXT);
                notExtractBookmarksText = true;
            }
            else
            {
                throw new IllegalArgumentException("Unexpected argument " + arg);
            }
        }
        else
        {
            if (transform == null)
            {
                transform = arg;
            }
            else if (sourceFilename == null)
            {
                sourceFilename = arg;
            }
            else if (targetFilename == null)
            {
                targetFilename = arg;
            }
            else
            {
                throw new IllegalArgumentException("Unexpected argument " + arg);
            }
        }
    }
    // The target filename is the last positional argument, so it is only set if all are present.
    if (targetFilename == null)
    {
        throw new IllegalArgumentException("Missing arguments");
    }
    // Flags that were not supplied default to false.
    includeContents = Boolean.TRUE.equals(includeContents);
    notExtractBookmarksText = Boolean.TRUE.equals(notExtractBookmarksText);

    transform(parser, documentSelector, includeContents, notExtractBookmarksText, sourceFilename,
        targetFilename, targetMimetype, targetEncoding);
}
/**
 * Validates an option argument and returns what follows the option name.
 *
 * @param arg           the complete argument, which starts with {@code optionName}
 * @param valueExpected {@code true} if the option must carry a value, {@code false} if it must not
 * @param value         what has been parsed for this option so far; non-null means a duplicate
 * @param optionName    the option prefix to strip from {@code arg}
 * @return the trimmed text after {@code optionName}; empty for options without a value
 * @throws IllegalArgumentException if the option is a duplicate, or a value is present or
 *                                  absent contrary to {@code valueExpected}
 */
private String getValue(String arg, boolean valueExpected, Object value, String optionName)
{
    if (value != null)
    {
        throw new IllegalArgumentException("Duplicate " + optionName);
    }

    final String remainder = arg.substring(optionName.length()).trim();
    final boolean hasValue = !remainder.isEmpty();
    if (hasValue && !valueExpected)
    {
        throw new IllegalArgumentException("Unexpected value with " + optionName);
    }
    if (!hasValue && valueExpected)
    {
        throw new IllegalArgumentException("Expected value with " + optionName);
    }
    return remainder;
}
/**
 * Parses the source file with the given parser and writes the result to the target file,
 * encoded with {@code targetEncoding} and shaped by the content handler chosen for
 * {@code targetMimetype}.
 *
 * @throws IllegalStateException wrapping any {@link SAXException}, {@link TikaException} or
 *                               {@link IOException} raised while reading, parsing or writing
 */
private void transform(Parser parser, DocumentSelector documentSelector,
    Boolean includeContents,
    Boolean notExtractBookmarksText,
    String sourceFilename,
    String targetFilename, String targetMimetype, String targetEncoding)
{
    // Resources are closed in reverse order, so the writer is flushed before the streams go away.
    try (InputStream source = new BufferedInputStream(new FileInputStream(sourceFilename));
         OutputStream target = new FileOutputStream(targetFilename);
         Writer targetWriter = new BufferedWriter(new OutputStreamWriter(target, targetEncoding)))
    {
        Metadata metadata = new Metadata();
        ParseContext parseContext = buildParseContext(documentSelector, includeContents,
            notExtractBookmarksText);
        ContentHandler contentHandler = getContentHandler(targetMimetype, targetWriter);

        parser.parse(source, contentHandler, metadata, parseContext);
    }
    catch (SAXException | TikaException | IOException e)
    {
        throw new IllegalStateException(e.getMessage(), e);
    }
}
/**
 * Creates the SAX content handler that writes the parser output in the requested format.
 *
 * @param targetMimetype the mimetype to produce: plain text, HTML, XHTML, XML or CSV
 * @param output the writer that receives the generated content
 * @return the handler to pass to the parser
 * @throws IllegalArgumentException if the target mimetype is not one of the supported ones
 * @throws IllegalStateException if the XML transformer handler cannot be created
 */
private ContentHandler getContentHandler(String targetMimetype, Writer output)
{
    if (MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
    {
        return new BodyContentHandler(output);
    }

    try
    {
        // The transformer handler is set up before the mimetype is inspected any further.
        SAXTransformerFactory saxFactory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
        TransformerHandler xmlHandler = saxFactory.newTransformerHandler();
        xmlHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        xmlHandler.setResult(new StreamResult(output));

        if (MIMETYPE_HTML.equals(targetMimetype))
        {
            xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, HTML);
            return new ExpandedTitleContentHandler(xmlHandler);
        }
        if (MIMETYPE_XHTML.equals(targetMimetype) || MIMETYPE_XML.equals(targetMimetype))
        {
            xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, XML);
            return xmlHandler;
        }
        if (MIMETYPE_TEXT_CSV.equals(targetMimetype))
        {
            return new CsvContentHandler(output);
        }
        throw new IllegalArgumentException("Invalid target mimetype " + targetMimetype);
    }
    catch (TransformerConfigurationException failure)
    {
        throw new IllegalStateException(failure.getMessage(), failure);
    }
}
/**
 * A wrapper around the normal Tika BodyContentHandler that writes table cells as comma
 * separated values rather than tab separated text.
 * <p>
 * Text inside a {@code td} element is wrapped in double quotes, with embedded double quotes
 * doubled, unless it consists only of digits, '.', '-' and '+'. A comma is written before
 * every cell of a row except the first.
 */
protected static class CsvContentHandler extends BodyContentHandler
{
    private static final char[] comma = new char[]{','};
    private static final Pattern all_nums = Pattern.compile("[\\d\\.\\-\\+]+");

    /** True while between the start and end of a {@code td} element. */
    private boolean inCell = false;
    /** True once a cell has ended in the current row, so the next cell needs a leading comma. */
    private boolean needsComma = false;

    protected CsvContentHandler(Writer output)
    {
        super(output);
    }

    /**
     * Passes ignorable whitespace through, except for a lone tab which is dropped.
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
        throws SAXException
    {
        // Ignore tabs, as they mess up the CSV output.
        // The character to test is at 'start', not at index 0 of the backing array.
        if (length == 1 && ch[start] == '\t')
        {
            return;
        }
        super.ignorableWhitespace(ch, start, length);
    }

    /**
     * Writes character data, quoting it when inside a cell and not purely numeric.
     * Each call is quoted independently of any other call.
     */
    @Override
    public void characters(char[] ch, int start, int length)
        throws SAXException
    {
        if (!inCell)
        {
            super.characters(ch, start, length);
            return;
        }

        String text = new String(ch, start, length);
        if (all_nums.matcher(text).matches())
        {
            // Numeric looking values are written without quotes
            super.characters(ch, start, length);
            return;
        }

        // Double up every double quote (including adjacent ones) and wrap the value in quotes
        char[] quoted = ("\"" + text.replace("\"", "\"\"") + "\"").toCharArray();
        super.characters(quoted, 0, quoted.length);
    }

    /**
     * Tracks the start of a cell, writing the separating comma if a cell has already been
     * written in this row. {@code td} elements are not forwarded to the wrapped handler.
     */
    @Override
    public void startElement(String uri, String localName, String name,
                             Attributes atts) throws SAXException
    {
        if (localName.equals("td"))
        {
            inCell = true;
            if (needsComma)
            {
                super.characters(comma, 0, 1);
            }
        }
        else
        {
            super.startElement(uri, localName, name, atts);
        }
    }

    /**
     * Tracks the end of a cell or row. The end of a {@code tr} resets the comma state so the
     * first cell of the next row has no leading comma.
     */
    @Override
    public void endElement(String uri, String localName, String name)
        throws SAXException
    {
        if (localName.equals("td"))
        {
            needsComma = true;
            inCell = false;
        }
        else
        {
            if (localName.equals("tr"))
            {
                needsComma = false;
            }
            super.endElement(uri, localName, name);
        }
    }
}
/**
 * Builds the Tika parse context for a transform.
 *
 * @param documentSelector selector to register in the context, or {@code null} for none
 * @param includeContents if not {@code null}, registers the parser used for {@code Parser.class}:
 *        the auto detect parser when {@code true}, an {@code EmptyParser} when {@code false}
 * @param notExtractBookmarksText if {@code true}, bookmark text extraction is switched off on the
 *        PDF parser config, which is then registered in the context; {@code null} is treated as
 *        {@code false}
 * @return the populated parse context
 */
private ParseContext buildParseContext(DocumentSelector documentSelector,
                                       Boolean includeContents, Boolean notExtractBookmarksText)
{
    ParseContext context = new ParseContext();
    if (documentSelector != null)
    {
        context.set(DocumentSelector.class, documentSelector);
    }
    // Boolean.TRUE.equals is null-safe, so a missing flag no longer causes a NullPointerException
    if (Boolean.TRUE.equals(notExtractBookmarksText))
    {
        pdfParserConfig.setExtractBookmarksText(false);
        // pdfParserConfig is set to override default settings
        context.set(PDFParserConfig.class, pdfParserConfig);
    }
    // If Archive transform
    if (includeContents != null)
    {
        context.set(Parser.class, includeContents ? autoDetectParser : new EmptyParser());
    }
    return context;
}
}

View File

@@ -0,0 +1,40 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transform.tika.transformers;
import org.apache.tika.parser.Parser;
import org.springframework.stereotype.Component;
/**
 * Spring component providing the Tika transformer that parses with the auto detect parser.
 * All behaviour other than the choice of parser comes from {@link GenericTikaTransformer}.
 */
@Component
public class TikaAutoTransformer extends GenericTikaTransformer
{
/**
 * @return the {@code autoDetectParser} held by {@code tika}
 */
@Override
protected Parser getParser()
{
// NOTE(review): tika is not declared in this class; presumably inherited from GenericTikaTransformer - confirm.
return tika.autoDetectParser;
}
}

View File

@@ -1,876 +0,0 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.executors;
import com.google.common.collect.ImmutableList;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.parser.pkg.PackageParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.slf4j.Logger;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URL;
import java.util.List;
import java.util.regex.Pattern;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_PNG;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_TIFF;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_CSV;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XHTML;
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XML;
/**
* Stripped down command line Tika transformers. Not actually run as a separate process, but the code fits the pattern
* used by transformers that do.
* <pre>
*
* Archive 0 ms
* 1) cpio html [100] unlimited
* 2) cpio txt [50] unlimited
* 3) cpio xhtml [100] unlimited
* 4) cpio xml [100] unlimited
* 5) jar html [100] unlimited
* 6) jar txt [50] unlimited
* 7) jar xhtml [100] unlimited
* 8) jar xml [100] unlimited
* 9) tar html [100] unlimited
* 10) tar txt [50] unlimited
* 11) tar xhtml [100] unlimited
* 12) tar xml [100] unlimited
* 13) zip html [100] unlimited
* 14) zip txt [50] unlimited
* 15) zip xhtml [100] unlimited
* 16) zip xml [100] unlimited
* PdfBox 0 ms
* 1) pdf html [110] unlimited
* 2) pdf txt [50] 25 MB
* 3) pdf xhtml [110] unlimited
* 4) pdf xml [110] unlimited
* OutlookMsg 0 ms
* 1) msg html [125] unlimited
* 2) msg txt [125] unlimited
* 3) msg xhtml [125] unlimited
* 4) msg xml [125] unlimited
* PdfBox 0 ms
* 1) pdf html [110] unlimited
* 2) pdf txt [50] 25 MB
* 3) pdf xhtml [110] unlimited
* 4) pdf xml [110] unlimited
* Office 0 ms
* 1) doc html [130] unlimited
* 2) doc txt [130] unlimited
* 3) doc xhtml [130] unlimited
* 4) doc xml [130] unlimited
* 5) mpp html [130] unlimited
* 6) mpp txt [130] unlimited
* 7) mpp xhtml [130] unlimited
* 8) mpp xml [130] unlimited
* 9) msg html [130] unlimited
* 10) msg txt [130] unlimited
* 11) msg xhtml [130] unlimited
* 12) msg xml [130] unlimited
* 13) ppt html [130] unlimited
* 14) ppt txt [130] unlimited
* 15) ppt xhtml [130] unlimited
* 16) ppt xml [130] unlimited
* 17) vsd html [130] unlimited
* 18) vsd txt [130] unlimited
* 19) vsd xhtml [130] unlimited
* 20) vsd xml [130] unlimited
* Poi 0 ms
* 1) xls csv [130] unlimited
* 2) xls html [130] unlimited
* 3) xls txt [130] unlimited
* 4) xls xhtml [130] unlimited
* 5) xls xml [130] unlimited
* 6) xlsx csv [130] unlimited
* 7) xlsx html [130] unlimited
* 8) xlsx txt [130] unlimited
* 9) xlsx xhtml [130] unlimited
* 10) xlsx xml [130] unlimited
* OOXML 0 ms
* 1) docm html [130] unlimited
* 2) docm txt [130] unlimited
* 3) docm xhtml [130] unlimited
* 4) docm xml [130] unlimited
* 5) docx html [130] unlimited
* 6) docx txt [130] unlimited
* 7) docx xhtml [130] unlimited
* 8) docx xml [130] unlimited
* 9) dotm html [130] unlimited
* 10) dotm txt [130] unlimited
* 11) dotm xhtml [130] unlimited
* 12) dotm xml [130] unlimited
* 13) dotx html [130] unlimited
* 14) dotx txt [130] unlimited
* 15) dotx xhtml [130] unlimited
* 16) dotx xml [130] unlimited
* 17) potm html [130] unlimited
* 18) potm txt [130] unlimited
* 19) potm xhtml [130] unlimited
* 20) potm xml [130] unlimited
* 21) potx html [130] unlimited
* 22) potx txt [130] unlimited
* 23) potx xhtml [130] unlimited
* 24) potx xml [130] unlimited
* 25) ppam html [130] unlimited
* 26) ppam txt [130] unlimited
* 27) ppam xhtml [130] unlimited
* 28) ppam xml [130] unlimited
* 29) ppsm html [130] unlimited
* 30) ppsm txt [130] unlimited
* 31) ppsm xhtml [130] unlimited
* 32) ppsm xml [130] unlimited
* 33) ppsx html [130] unlimited
* 34) ppsx txt [130] unlimited
* 35) ppsx xhtml [130] unlimited
* 36) ppsx xml [130] unlimited
* 37) pptm html [130] unlimited
* 38) pptm txt [130] unlimited
* 39) pptm xhtml [130] unlimited
* 40) pptm xml [130] unlimited
* 41) pptx html [130] unlimited
* 42) pptx txt [130] unlimited
* 43) pptx xhtml [130] unlimited
* 44) pptx xml [130] unlimited
* 45) sldm html [130] unlimited
* 46) sldm txt [130] unlimited
* 47) sldm xhtml [130] unlimited
* 48) sldm xml [130] unlimited
* 49) sldx html [130] unlimited
* 50) sldx txt [130] unlimited
* 51) sldx xhtml [130] unlimited
* 52) sldx xml [130] unlimited
* 53) xlam html [130] unlimited
* 54) xlam txt [130] unlimited
* 55) xlam xhtml [130] unlimited
* 56) xlam xml [130] unlimited
* 57) xlsb html [130] unlimited
* 58) xlsb txt [130] unlimited
* 59) xlsb xhtml [130] unlimited
* 60) xlsb xml [130] unlimited
* 61) xlsm html [130] unlimited
* 62) xlsm txt [130] unlimited
* 63) xlsm xhtml [130] unlimited
* 64) xlsm xml [130] unlimited
* 65) xlsx html [130] unlimited
* 66) xlsx txt [130] unlimited
* 67) xlsx xhtml [130] unlimited
* 68) xlsx xml [130] unlimited
* 69) xltm html [130] unlimited
* 70) xltm txt [130] unlimited
* 71) xltm xhtml [130] unlimited
* 72) xltm xml [130] unlimited
* 73) xltx html [130] unlimited
* 74) xltx txt [130] unlimited
* 75) xltx xhtml [130] unlimited
* 76) xltx xml [130] unlimited
* TikaAuto 0 ms
* 1) cdf html [120] unlimited
* 2) cdf txt [120] unlimited
* 3) cdf xhtml [120] unlimited
* 4) cdf xml [120] unlimited
* 5) cpio html [120] unlimited
* 6) cpio txt [120] unlimited
* 7) cpio xhtml [120] unlimited
* 8) cpio xml [120] unlimited
* 9) doc html [120] unlimited
* 10) doc txt [120] unlimited
* 11) doc xhtml [120] unlimited
* 12) doc xml [120] unlimited
* 13) docm html [120] unlimited
* 14) docm txt [120] unlimited
* 15) docm xhtml [120] unlimited
* 16) docm xml [120] unlimited
* 17) docx html [120] unlimited
* 18) docx txt [120] unlimited
* 19) docx xhtml [120] unlimited
* 20) docx xml [120] unlimited
* 21) dotm html [120] unlimited
* 22) dotm txt [120] unlimited
* 23) dotm xhtml [120] unlimited
* 24) dotm xml [120] unlimited
* 25) dotx html [120] unlimited
* 26) dotx txt [120] unlimited
* 27) dotx xhtml [120] unlimited
* 28) dotx xml [120] unlimited
* 29) gzip html [120] unlimited
* 30) gzip txt [120] unlimited
* 31) gzip xhtml [120] unlimited
* 32) gzip xml [120] unlimited
* 33) hdf html [120] unlimited
* 34) hdf txt [120] unlimited
* 35) hdf xhtml [120] unlimited
* 36) hdf xml [120] unlimited
* 37) html html [120] unlimited
* 38) html txt [120] unlimited
* 39) html xhtml [120] unlimited
* 40) html xml [120] unlimited
* 41) jar html [120] unlimited
* 42) jar txt [120] unlimited
* 43) jar xhtml [120] unlimited
* 44) jar xml [120] unlimited
* 45) java html [120] unlimited
* 46) java txt [120] unlimited
* 47) java xhtml [120] unlimited
* 48) java xml [120] unlimited
* 49) key html [120] unlimited
* 50) key txt [120] unlimited
* 51) key xhtml [120] unlimited
* 52) key xml [120] unlimited
* 53) mpp html [120] unlimited
* 54) mpp txt [120] unlimited
* 55) mpp xhtml [120] unlimited
* 56) mpp xml [120] unlimited
* 57) numbers html [120] unlimited
* 58) numbers txt [120] unlimited
* 59) numbers xhtml [120] unlimited
* 60) numbers xml [120] unlimited
* 61) odc html [120] unlimited
* 62) odc txt [120] unlimited
* 63) odc xhtml [120] unlimited
* 64) odc xml [120] unlimited
* 65) odi html [120] unlimited
* 66) odi txt [120] unlimited
* 67) odi xhtml [120] unlimited
* 68) odi xml [120] unlimited
* 69) odm html [120] unlimited
* 70) odm txt [120] unlimited
* 71) odm xhtml [120] unlimited
* 72) odm xml [120] unlimited
* 73) odp html [120] unlimited
* 74) odp txt [120] unlimited
* 75) odp xhtml [120] unlimited
* 76) odp xml [120] unlimited
* 77) ods html [120] unlimited
* 78) ods txt [120] unlimited
* 79) ods xhtml [120] unlimited
* 80) ods xml [120] unlimited
* 81) odt html [120] unlimited
* 82) odt txt [120] unlimited
* 83) odt xhtml [120] unlimited
* 84) odt xml [120] unlimited
* 85) ogx html [120] unlimited
* 86) ogx txt [120] unlimited
* 87) ogx xhtml [120] unlimited
* 88) ogx xml [120] unlimited
* 89) oth html [120] unlimited
* 90) oth txt [120] unlimited
* 91) oth xhtml [120] unlimited
* 92) oth xml [120] unlimited
* 93) otp html [120] unlimited
* 94) otp txt [120] unlimited
* 95) otp xhtml [120] unlimited
* 96) otp xml [120] unlimited
* 97) ots html [120] unlimited
* 98) ots txt [120] unlimited
* 99) ots xhtml [120] unlimited
* 100) ots xml [120] unlimited
* 101) ott html [120] unlimited
* 102) ott txt [120] unlimited
* 103) ott xhtml [120] unlimited
* 104) ott xml [120] unlimited
* 105) pages html [120] unlimited
* 106) pages txt [120] unlimited
* 107) pages xhtml [120] unlimited
* 108) pages xml [120] unlimited
* 109) pdf html [120] unlimited
* 110) pdf txt [120] 25 MB
* 111) pdf xhtml [120] unlimited
* 112) pdf xml [120] unlimited
* 113) potm html [120] unlimited
* 114) potm txt [120] unlimited
* 115) potm xhtml [120] unlimited
* 116) potm xml [120] unlimited
* 117) potx html [120] unlimited
* 118) potx txt [120] unlimited
* 119) potx xhtml [120] unlimited
* 120) potx xml [120] unlimited
* 121) ppam html [120] unlimited
* 122) ppam txt [120] unlimited
* 123) ppam xhtml [120] unlimited
* 124) ppam xml [120] unlimited
* 125) ppsm html [120] unlimited
* 126) ppsm txt [120] unlimited
* 127) ppsm xhtml [120] unlimited
* 128) ppsm xml [120] unlimited
* 129) ppsx html [120] unlimited
* 130) ppsx txt [120] unlimited
* 131) ppsx xhtml [120] unlimited
* 132) ppsx xml [120] unlimited
* 133) ppt html [120] unlimited
* 134) ppt txt [120] unlimited
* 135) ppt xhtml [120] unlimited
* 136) ppt xml [120] unlimited
* 137) pptm html [120] unlimited
* 138) pptm txt [120] unlimited
* 139) pptm xhtml [120] unlimited
* 140) pptm xml [120] unlimited
* 141) pptx html [120] unlimited
* 142) pptx txt [120] unlimited
* 143) pptx xhtml [120] unlimited
* 144) pptx xml [120] unlimited
* 145) rar html [120] unlimited
* 146) rar txt [120] unlimited
* 147) rar xhtml [120] unlimited
* 148) rar xml [120] unlimited
* 149) rss html [120] unlimited
* 150) rss txt [120] unlimited
* 151) rss xhtml [120] unlimited
* 152) rss xml [120] unlimited
* 153) rtf html [120] unlimited
* 154) rtf txt [120] unlimited
* 155) rtf xhtml [120] unlimited
* 156) rtf xml [120] unlimited
* 157) sldm html [120] unlimited
* 158) sldm txt [120] unlimited
* 159) sldm xhtml [120] unlimited
* 160) sldm xml [120] unlimited
* 161) sldx html [120] unlimited
* 162) sldx txt [120] unlimited
* 163) sldx xhtml [120] unlimited
* 164) sldx xml [120] unlimited
* 165) sxw html [120] unlimited
* 166) sxw txt [120] unlimited
* 167) sxw xhtml [120] unlimited
* 168) sxw xml [120] unlimited
* 169) txt html [120] unlimited
* 170) txt txt [120] unlimited
* 171) txt xhtml [120] unlimited
* 172) txt xml [120] unlimited
* 173) vsd html [120] unlimited
* 174) vsd txt [120] unlimited
* 175) vsd xhtml [120] unlimited
* 176) vsd xml [120] unlimited
* 177) xhtml html [120] unlimited
* 178) xhtml txt [120] unlimited
* 179) xhtml xhtml [120] unlimited
* 180) xhtml xml [120] unlimited
* 181) xlam html [120] unlimited
* 182) xlam txt [120] unlimited
* 183) xlam xhtml [120] unlimited
* 184) xlam xml [120] unlimited
* 185) xls html [120] unlimited
* 186) xls txt [120] unlimited
* 187) xls xhtml [120] unlimited
* 188) xls xml [120] unlimited
* 189) xlsb html [120] unlimited
* 190) xlsb txt [120] unlimited
* 191) xlsb xhtml [120] unlimited
* 192) xlsb xml [120] unlimited
* 193) xlsm html [120] unlimited
* 194) xlsm txt [120] unlimited
* 195) xlsm xhtml [120] unlimited
* 196) xlsm xml [120] unlimited
* 197) xlsx html [120] unlimited
* 198) xlsx txt [120] unlimited
* 199) xlsx xhtml [120] unlimited
* 200) xlsx xml [120] unlimited
* 201) xltm html [120] unlimited
* 202) xltm txt [120] unlimited
* 203) xltm xhtml [120] unlimited
* 204) xltm xml [120] unlimited
* 205) xltx html [120] unlimited
* 206) xltx txt [120] unlimited
* 207) xltx xhtml [120] unlimited
* 208) xltx xml [120] unlimited
* 209) xml html [120] unlimited
* 210) xml txt [120] unlimited
* 211) xml xhtml [120] unlimited
* 212) xml xml [120] unlimited
* 213) z html [120] unlimited
* 214) z txt [120] unlimited
* 215) z xhtml [120] unlimited
* 216) z xml [120] unlimited
* TextMining 0 ms
* 1) doc html [130] unlimited
* 2) doc txt [50] unlimited
* 3) doc xhtml [130] unlimited
* 4) doc xml [130] unlimited
* </pre>
*/
public class Tika
{
// Transform names accepted as the first positional command line argument.
public static final String ARCHIVE = "Archive";
public static final String OUTLOOK_MSG = "OutlookMsg";
public static final String PDF_BOX = "PdfBox";
public static final String POI_OFFICE = "Office";
public static final String POI = "Poi";
public static final String POI_OO_XML = "OOXML";
public static final String TIKA_AUTO = "TikaAuto";
public static final String TEXT_MINING = "TextMining";
// All of the transform names above.
public static final List<String> TRANSFORM_NAMES = ImmutableList.of(
ARCHIVE, OUTLOOK_MSG, PDF_BOX, POI_OFFICE, POI, POI_OO_XML, TIKA_AUTO, TEXT_MINING);
// Command line options. The two ending in '=' are parsed with a value, the other two as flags without one.
public static final String TARGET_MIMETYPE = "--targetMimetype=";
public static final String TARGET_ENCODING = "--targetEncoding=";
public static final String INCLUDE_CONTENTS = "--includeContents";
public static final String NOT_EXTRACT_BOOKMARKS_TEXT = "--notExtractBookmarksText";
// Short type names. HTML and XML are also used as the output method in getContentHandler.
public static final String CSV = "csv";
public static final String DOC = "doc";
public static final String DOCX = "docx";
public static final String HTML = "html";
public static final String MSG = "msg";
public static final String PDF = "pdf";
public static final String PPTX = "pptx";
public static final String TXT = "txt";
public static final String XHTML = "xhtml";
// NOTE(review): name and value are spelt "xslx" rather than "xlsx" - confirm whether callers rely on this spelling.
public static final String XSLX = "xslx";
public static final String XML = "xml";
public static final String ZIP = "zip";
// Parsers selected by transform name in transform(String, ...).
private final Parser packageParser = new PackageParser();
private final Parser pdfParser = new PDFParser();
private final Parser officeParser = new OfficeParser();
// Created in the constructor from tika-config.xml.
private final Parser autoDetectParser;
private final Parser ooXmlParser = new OOXMLParser();
private final Parser tikaOfficeDetectParser = new TikaOfficeDetectParser();
// Registered in the parse context by buildParseContext when bookmark text should not be extracted.
private final PDFParserConfig pdfParserConfig = new PDFParserConfig();
/**
 * Document selector used with the PdfBox transform. It rejects documents whose content type
 * is JPEG, TIFF or PNG and selects everything else, including documents with no content type.
 */
public static final DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector()
{
    private final List<String> disabledMediaTypes = ImmutableList.of(MIMETYPE_IMAGE_JPEG,
            MIMETYPE_IMAGE_TIFF, MIMETYPE_IMAGE_PNG);

    @Override
    public boolean select(Metadata metadata)
    {
        String mediaType = metadata.get(Metadata.CONTENT_TYPE);
        boolean typeUnknown = mediaType == null || mediaType.isEmpty();
        // disabledMediaTypes is a final, always initialised list, so no null check is needed
        return typeUnknown || !disabledMediaTypes.contains(mediaType);
    }
};
/**
 * Creates the transformer, building the auto detect parser from the tika-config.xml resource.
 *
 * @throws TikaException if thrown while reading the Tika configuration
 * @throws IOException if thrown while reading the Tika configuration
 * @throws SAXException if thrown while reading the Tika configuration
 */
public Tika() throws TikaException, IOException, SAXException
{
    autoDetectParser = new AutoDetectParser(readTikaConfig());
}
/**
 * Reads the Tika configuration, logging rather than propagating any failure.
 *
 * @param logger receives an error entry if the configuration cannot be read
 * @return the configuration, or {@code null} if reading it failed
 */
public static TikaConfig readTikaConfig(Logger logger)
{
    TikaConfig config = null;
    try
    {
        config = readTikaConfig();
    }
    catch (Exception failure)
    {
        logger.error("Failed to read tika-config.xml", failure);
    }
    return config;
}
/**
 * Loads the Tika configuration from the {@code tika-config.xml} resource found by this
 * class's class loader.
 *
 * @return the parsed configuration
 * @throws IOException if the resource is not on the classpath or cannot be read
 * @throws TikaException if thrown by the {@code TikaConfig} constructor
 * @throws SAXException if thrown by the {@code TikaConfig} constructor
 */
private static TikaConfig readTikaConfig() throws TikaException, IOException, SAXException
{
    URL tikaConfigXml = Tika.class.getClassLoader().getResource("tika-config.xml");
    // getResource returns null when the resource is missing; report that clearly here instead
    // of passing a null URL on to TikaConfig.
    if (tikaConfigXml == null)
    {
        throw new IOException("tika-config.xml was not found on the classpath");
    }
    return new TikaConfig(tikaConfigXml);
}
/**
 * Command line entry point, included for developer testing. Runs a single transform described
 * by the arguments and prints the elapsed time. Exits with -1 on an argument error and with
 * -2 (after printing the stack trace) on any other handled failure.
 *
 * @param args the arguments passed on to {@code transform(String[])}
 */
public static void main(String[] args)
{
    final long startedAt = System.currentTimeMillis();
    try
    {
        Tika tika = new Tika();
        tika.transform(args);
    }
    catch (IllegalArgumentException badArguments)
    {
        System.err.println("ERROR " + badArguments.getMessage());
        System.exit(-1);
    }
    catch (IllegalStateException | TikaException | IOException | SAXException failure)
    {
        System.err.println("ERROR " + failure.getMessage());
        failure.printStackTrace();
        System.exit(-2);
    }
    long elapsedMs = System.currentTimeMillis() - startedAt;
    System.out.println("Finished in " + elapsedMs + "ms");
}
/**
 * Extracts the parameters from the command line arguments and runs the transform.
 * <p>
 * Arguments that do not start with {@code --} are taken, in order, as the transform name, the
 * source filename and the target filename. Arguments that start with {@code --} must be one
 * of the {@code TARGET_MIMETYPE}, {@code TARGET_ENCODING}, {@code INCLUDE_CONTENTS} or
 * {@code NOT_EXTRACT_BOOKMARKS_TEXT} options and may appear in any position.
 *
 * @param args the command line arguments
 * @throws IllegalArgumentException if an argument is unknown, duplicated, has an unexpected or
 *         missing value, or if fewer than three positional arguments are supplied
 */
public void transform(String[] args)
{
    String transform = null;
    String targetMimetype = null;
    String targetEncoding = null;
    String sourceFilename = null;
    String targetFilename = null;
    Boolean includeContents = null;
    Boolean notExtractBookmarksText = null;

    for (String arg : args)
    {
        if (arg.startsWith("--"))
        {
            // The argument must start with the option name. Testing it the other way round
            // would accept any prefix of the option (even "--") and then fail in getValue.
            if (arg.startsWith(INCLUDE_CONTENTS))
            {
                getValue(arg, false, includeContents, INCLUDE_CONTENTS);
                includeContents = true;
            }
            else if (arg.startsWith(TARGET_ENCODING))
            {
                targetEncoding = getValue(arg, true, targetEncoding, TARGET_ENCODING);
            }
            else if (arg.startsWith(TARGET_MIMETYPE))
            {
                targetMimetype = getValue(arg, true, targetMimetype, TARGET_MIMETYPE);
            }
            else if (arg.startsWith(NOT_EXTRACT_BOOKMARKS_TEXT))
            {
                getValue(arg, false, notExtractBookmarksText, NOT_EXTRACT_BOOKMARKS_TEXT);
                notExtractBookmarksText = true;
            }
            else
            {
                throw new IllegalArgumentException("Unexpected argument " + arg);
            }
        }
        else
        {
            if (transform == null)
            {
                transform = arg;
            }
            else if (sourceFilename == null)
            {
                sourceFilename = arg;
            }
            else if (targetFilename == null)
            {
                targetFilename = arg;
            }
            else
            {
                throw new IllegalArgumentException("Unexpected argument " + arg);
            }
        }
    }

    // The target filename is the last positional argument, so if it is set the others are too
    if (targetFilename == null)
    {
        throw new IllegalArgumentException("Missing arguments");
    }

    // Flags that were not supplied default to false
    includeContents = includeContents == null ? false : includeContents;
    notExtractBookmarksText = notExtractBookmarksText == null ? false : notExtractBookmarksText;

    transform(transform, includeContents, notExtractBookmarksText, sourceFilename,
            targetFilename, targetMimetype, targetEncoding);
}
/**
 * Extracts the text that follows an option name in a command line argument, after checking
 * that the option has not already been supplied.
 *
 * @param arg the raw command line argument, which must start with {@code optionName}
 * @param valueExpected {@code true} if text must follow the option name, {@code false} if none may
 * @param value the value already held for this option, or {@code null} if it has not been seen yet
 * @param optionName the option prefix to strip from {@code arg}
 * @return the trimmed text after the option name (empty when {@code valueExpected} is {@code false})
 * @throws IllegalArgumentException if the option is a duplicate, if {@code arg} does not start with
 *         {@code optionName}, or if a value is present or missing contrary to {@code valueExpected}
 */
private String getValue(String arg, boolean valueExpected, Object value, String optionName)
{
    if (value != null)
    {
        throw new IllegalArgumentException("Duplicate " + optionName);
    }
    // Without this check an argument shorter than the option name makes substring() throw a
    // StringIndexOutOfBoundsException rather than the IllegalArgumentException callers expect.
    if (!arg.startsWith(optionName))
    {
        throw new IllegalArgumentException("Unexpected argument " + arg);
    }
    String stringValue = arg.substring(optionName.length()).trim();
    if (!valueExpected && !stringValue.isEmpty())
    {
        throw new IllegalArgumentException("Unexpected value with " + optionName);
    }
    if (valueExpected && stringValue.isEmpty())
    {
        throw new IllegalArgumentException("Expected value with " + optionName);
    }
    return stringValue;
}
/**
 * Adds the transform specific values, such as the parser and document selector, and then runs
 * the transform.
 *
 * @param transform the transform name, one of {@code TRANSFORM_NAMES}
 * @param includeContents passed through to the parser based transform
 * @param notExtractBookmarksText passed through to the parser based transform
 * @param sourceFilename path of the file to read
 * @param targetFilename path of the file to write
 * @param targetMimetype the mimetype to produce
 * @param targetEncoding name of the charset used to write the target file
 * @throws IllegalArgumentException if the transform name is not recognised
 */
private void transform(String transform, Boolean includeContents,
                       Boolean notExtractBookmarksText,
                       String sourceFilename,
                       String targetFilename, String targetMimetype, String targetEncoding)
{
    Parser parser;
    DocumentSelector documentSelector = null;
    switch (transform)
    {
        case ARCHIVE:
            parser = packageParser;
            break;
        case OUTLOOK_MSG:
        case POI_OFFICE:
        case TEXT_MINING:
            parser = officeParser;
            break;
        case PDF_BOX:
            parser = pdfParser;
            documentSelector = pdfBoxEmbededDocumentSelector;
            break;
        case POI:
            parser = tikaOfficeDetectParser;
            break;
        case POI_OO_XML:
            parser = ooXmlParser;
            break;
        case TIKA_AUTO:
            parser = autoDetectParser;
            break;
        default:
            // Fail before any file is opened, rather than with a NullPointerException on a
            // null parser once the target file has already been created.
            throw new IllegalArgumentException("Unknown transform " + transform);
    }
    transform(parser, documentSelector, includeContents, notExtractBookmarksText,
            sourceFilename, targetFilename, targetMimetype, targetEncoding);
}
/**
 * Parses the source file with the given parser and writes the output to the target file.
 * Both files are opened here and closed again when parsing finishes or fails.
 *
 * @param parser the Tika parser to run
 * @param documentSelector optional selector handed to {@code buildParseContext}
 * @param includeContents handed to {@code buildParseContext}
 * @param notExtractBookmarksText handed to {@code buildParseContext}
 * @param sourceFilename path of the file to read
 * @param targetFilename path of the file to write
 * @param targetMimetype selects the content handler, see {@code getContentHandler}
 * @param targetEncoding name of the charset used to write the target file
 * @throws IllegalStateException wrapping any SAXException, TikaException or IOException
 */
private void transform(Parser parser, DocumentSelector documentSelector,
                       Boolean includeContents,
                       Boolean notExtractBookmarksText,
                       String sourceFilename,
                       String targetFilename, String targetMimetype, String targetEncoding)
{
    try (InputStream sourceStream = new BufferedInputStream(new FileInputStream(sourceFilename));
         OutputStream targetStream = new FileOutputStream(targetFilename);
         Writer targetWriter = new BufferedWriter(new OutputStreamWriter(targetStream, targetEncoding)))
    {
        ParseContext parseContext = buildParseContext(documentSelector, includeContents,
                notExtractBookmarksText);
        ContentHandler contentHandler = getContentHandler(targetMimetype, targetWriter);
        parser.parse(sourceStream, contentHandler, new Metadata(), parseContext);
    }
    catch (SAXException | TikaException | IOException failure)
    {
        throw new IllegalStateException(failure.getMessage(), failure);
    }
}
/**
 * Creates the SAX content handler that writes the parser output in the requested format.
 *
 * @param targetMimetype the mimetype to produce: plain text, HTML, XHTML, XML or CSV
 * @param output the writer that receives the generated content
 * @return the handler to pass to the parser
 * @throws IllegalArgumentException if the target mimetype is not one of the supported ones
 * @throws IllegalStateException if the XML transformer handler cannot be created
 */
private ContentHandler getContentHandler(String targetMimetype, Writer output)
{
    if (MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
    {
        return new BodyContentHandler(output);
    }

    try
    {
        // The transformer handler is set up before the mimetype is inspected any further.
        SAXTransformerFactory saxFactory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
        TransformerHandler xmlHandler = saxFactory.newTransformerHandler();
        xmlHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        xmlHandler.setResult(new StreamResult(output));

        if (MIMETYPE_HTML.equals(targetMimetype))
        {
            xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, HTML);
            return new ExpandedTitleContentHandler(xmlHandler);
        }
        if (MIMETYPE_XHTML.equals(targetMimetype) || MIMETYPE_XML.equals(targetMimetype))
        {
            xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, XML);
            return xmlHandler;
        }
        if (MIMETYPE_TEXT_CSV.equals(targetMimetype))
        {
            return new CsvContentHandler(output);
        }
        throw new IllegalArgumentException("Invalid target mimetype " + targetMimetype);
    }
    catch (TransformerConfigurationException failure)
    {
        throw new IllegalStateException(failure.getMessage(), failure);
    }
}
/**
 * A wrapper around the normal Tika BodyContentHandler that writes table cells as comma
 * separated values rather than tab separated text.
 * <p>
 * Text inside a {@code td} element is wrapped in double quotes, with embedded double quotes
 * doubled, unless it consists only of digits, '.', '-' and '+'. A comma is written before
 * every cell of a row except the first.
 */
protected static class CsvContentHandler extends BodyContentHandler
{
    private static final char[] comma = new char[]{','};
    private static final Pattern all_nums = Pattern.compile("[\\d\\.\\-\\+]+");

    /** True while between the start and end of a {@code td} element. */
    private boolean inCell = false;
    /** True once a cell has ended in the current row, so the next cell needs a leading comma. */
    private boolean needsComma = false;

    protected CsvContentHandler(Writer output)
    {
        super(output);
    }

    /**
     * Passes ignorable whitespace through, except for a lone tab which is dropped.
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
        throws SAXException
    {
        // Ignore tabs, as they mess up the CSV output.
        // The character to test is at 'start', not at index 0 of the backing array.
        if (length == 1 && ch[start] == '\t')
        {
            return;
        }
        super.ignorableWhitespace(ch, start, length);
    }

    /**
     * Writes character data, quoting it when inside a cell and not purely numeric.
     * Each call is quoted independently of any other call.
     */
    @Override
    public void characters(char[] ch, int start, int length)
        throws SAXException
    {
        if (!inCell)
        {
            super.characters(ch, start, length);
            return;
        }

        String text = new String(ch, start, length);
        if (all_nums.matcher(text).matches())
        {
            // Numeric looking values are written without quotes
            super.characters(ch, start, length);
            return;
        }

        // Double up every double quote (including adjacent ones) and wrap the value in quotes
        char[] quoted = ("\"" + text.replace("\"", "\"\"") + "\"").toCharArray();
        super.characters(quoted, 0, quoted.length);
    }

    /**
     * Tracks the start of a cell, writing the separating comma if a cell has already been
     * written in this row. {@code td} elements are not forwarded to the wrapped handler.
     */
    @Override
    public void startElement(String uri, String localName, String name,
                             Attributes atts) throws SAXException
    {
        if (localName.equals("td"))
        {
            inCell = true;
            if (needsComma)
            {
                super.characters(comma, 0, 1);
            }
        }
        else
        {
            super.startElement(uri, localName, name, atts);
        }
    }

    /**
     * Tracks the end of a cell or row. The end of a {@code tr} resets the comma state so the
     * first cell of the next row has no leading comma.
     */
    @Override
    public void endElement(String uri, String localName, String name)
        throws SAXException
    {
        if (localName.equals("td"))
        {
            needsComma = true;
            inCell = false;
        }
        else
        {
            if (localName.equals("tr"))
            {
                needsComma = false;
            }
            super.endElement(uri, localName, name);
        }
    }
}
/**
 * Builds the Tika parse context for a transform.
 *
 * @param documentSelector selector to register in the context, or {@code null} for none
 * @param includeContents if not {@code null}, registers the parser used for {@code Parser.class}:
 *        the auto detect parser when {@code true}, an {@code EmptyParser} when {@code false}
 * @param notExtractBookmarksText if {@code true}, bookmark text extraction is switched off on the
 *        PDF parser config, which is then registered in the context; {@code null} is treated as
 *        {@code false}
 * @return the populated parse context
 */
private ParseContext buildParseContext(DocumentSelector documentSelector,
                                       Boolean includeContents, Boolean notExtractBookmarksText)
{
    ParseContext context = new ParseContext();
    if (documentSelector != null)
    {
        context.set(DocumentSelector.class, documentSelector);
    }
    // Boolean.TRUE.equals is null-safe, so a missing flag no longer causes a NullPointerException
    if (Boolean.TRUE.equals(notExtractBookmarksText))
    {
        pdfParserConfig.setExtractBookmarksText(false);
        // pdfParserConfig is set to override default settings
        context.set(PDFParserConfig.class, pdfParserConfig);
    }
    // If Archive transform
    if (includeContents != null)
    {
        context.set(Parser.class, includeContents ? autoDetectParser : new EmptyParser());
    }
    return context;
}
}

View File

@@ -1,204 +0,0 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.executors;
import com.google.common.collect.ImmutableMap;
import org.alfresco.transformer.logging.LogEntry;
import org.alfresco.transformer.metadataExtractors.AbstractTikaMetadataExtractor;
import org.alfresco.transformer.metadataExtractors.DWGMetadataExtractor;
import org.alfresco.transformer.metadataExtractors.MP3MetadataExtractor;
import org.alfresco.transformer.metadataExtractors.MailMetadataExtractor;
import org.alfresco.transformer.metadataExtractors.OfficeMetadataExtractor;
import org.alfresco.transformer.metadataExtractors.OpenDocumentMetadataExtractor;
import org.alfresco.transformer.metadataExtractors.PdfBoxMetadataExtractor;
import org.alfresco.transformer.metadataExtractors.PoiMetadataExtractor;
import org.alfresco.transformer.metadataExtractors.TikaAudioMetadataExtractor;
import org.alfresco.transformer.metadataExtractors.TikaAutoMetadataExtractor;
import org.alfresco.transformer.metadataExtractors.IPTCMetadataExtractor;
import org.alfresco.transformer.util.RequestParamMap;
import org.apache.tika.exception.TikaException;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Map;
import java.util.StringJoiner;
import static java.lang.Boolean.parseBoolean;
import static org.alfresco.transformer.executors.Tika.INCLUDE_CONTENTS;
import static org.alfresco.transformer.executors.Tika.TARGET_ENCODING;
import static org.alfresco.transformer.executors.Tika.TARGET_MIMETYPE;
/**
 * JavaExecutor implementation for running TIKA transformations. It loads the
 * transformation logic in the same JVM (check {@link Tika}).
 *
 * <p>Besides plain transforms, this executor dispatches metadata extraction and
 * metadata embedding to the {@link AbstractTikaMetadataExtractor} registered under
 * the requested transform name.
 */
public class TikaJavaExecutor implements JavaExecutor
{
    private static final String ID = "tika";

    public static final String LICENCE =
        "This transformer uses Tika from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0. or in /Apache\\ 2.0.txt\n" +
        "This transformer uses ExifTool by Phil Harvey. See license at https://exiftool.org/#license. or in /Perl-Artistic-License.txt";

    /** Value used for the "not extract bookmarks text" option when the request does not supply one. */
    private final boolean notExtractBookmarksTextDefault;

    private final Tika tika;

    /** Metadata extractors, keyed by the transform name that selects them. */
    private final Map<String, AbstractTikaMetadataExtractor> metadataExtractor = ImmutableMap
        .<String, AbstractTikaMetadataExtractor>builder()
        .put("DWGMetadataExtractor", new DWGMetadataExtractor())
        .put("MailMetadataExtractor", new MailMetadataExtractor())
        .put("MP3MetadataExtractor", new MP3MetadataExtractor())
        .put("OfficeMetadataExtractor", new OfficeMetadataExtractor())
        .put("OpenDocumentMetadataExtractor", new OpenDocumentMetadataExtractor())
        .put("PdfBoxMetadataExtractor", new PdfBoxMetadataExtractor())
        .put("PoiMetadataExtractor", new PoiMetadataExtractor())
        .put("TikaAudioMetadataExtractor", new TikaAudioMetadataExtractor())
        .put("TikaAutoMetadataExtractor", new TikaAutoMetadataExtractor())
        .put("IPTCMetadataExtractor", new IPTCMetadataExtractor())
        .build();

    /** Metadata embedders, keyed by the transform name that selects them. */
    private final Map<String, AbstractTikaMetadataExtractor> metadataEmbedder = ImmutableMap
        .<String, AbstractTikaMetadataExtractor>builder()
        .put("SamplePoiMetadataEmbedder", new PoiMetadataExtractor())
        .build();

    /**
     * Creates an executor with an explicit default for the "not extract bookmarks text" option.
     *
     * @param notExtractBookmarksTextDefault value applied when a transform request does not
     *                                       contain {@code RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT}
     * @throws RuntimeException if the underlying {@link Tika} instance cannot be created;
     *                          the original exception is attached as the cause
     */
    public TikaJavaExecutor(boolean notExtractBookmarksTextDefault)
    {
        this.notExtractBookmarksTextDefault = notExtractBookmarksTextDefault;
        try
        {
            tika = new Tika();
        }
        catch (SAXException | IOException | TikaException e)
        {
            // Keep the cause so the stack trace of the real failure is not lost.
            throw new RuntimeException("Unable to instantiate Tika: " + e.getMessage(), e);
        }
    }

    /**
     * Creates an executor whose "not extract bookmarks text" default is {@code false}.
     */
    public TikaJavaExecutor()
    {
        this(false);
    }

    /**
     * @return the fixed identifier of this transformer ({@code "tika"})
     */
    @Override
    public String getTransformerId()
    {
        return ID;
    }

    /**
     * Reads the supported transform options, turns them into Tika arguments and runs the transform.
     *
     * @param transformName    name of the transform, passed through to Tika as the first argument
     * @param sourceMimetype   source mimetype (not used by this method)
     * @param targetMimetype   target mimetype, appended to {@code Tika.TARGET_MIMETYPE}
     * @param transformOptions request options; absent options fall back to their defaults
     * @param sourceFile       file to read
     * @param targetFile       file to write
     * @throws Exception if the transform fails
     */
    @Override
    public void transform(String transformName, String sourceMimetype, String targetMimetype,
                          Map<String, String> transformOptions, File sourceFile, File targetFile)
        throws Exception
    {
        final boolean includeContents = parseBoolean(
            transformOptions.getOrDefault(RequestParamMap.INCLUDE_CONTENTS, "false"));
        final boolean notExtractBookmarksText = parseBoolean(
            transformOptions.getOrDefault(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT,
                String.valueOf(notExtractBookmarksTextDefault)));
        final String targetEncoding = transformOptions.getOrDefault("targetEncoding", "UTF-8");

        // Only worth tracing when the request is silent and the configured default is true.
        if (transformOptions.get(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT) == null && notExtractBookmarksTextDefault)
        {
            LoggerFactory.getLogger(TikaJavaExecutor.class).trace(
                "notExtractBookmarksText default value has been overridden to {}",
                notExtractBookmarksTextDefault);
        }

        // Flags that are off are passed as null; buildArgs drops null entries.
        call(sourceFile, targetFile, transformName,
            includeContents ? INCLUDE_CONTENTS : null,
            notExtractBookmarksText ? Tika.NOT_EXTRACT_BOOKMARKS_TEXT : null,
            TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + targetEncoding);
    }

    /**
     * Builds the final argument array (options followed by source and target paths) and
     * hands it to {@link Tika}.
     *
     * @param sourceFile file to read
     * @param targetFile file to write
     * @param args       option arguments; {@code null} entries are skipped
     */
    @Override
    public void call(File sourceFile, File targetFile, String... args)
    {
        args = buildArgs(sourceFile, targetFile, args);
        tika.transform(args);
    }

    /**
     * Assembles the Tika argument array and records a space separated summary of the
     * options via {@code LogEntry.setOptions}. In the summary the two files are
     * represented by their extensions rather than their full paths.
     */
    private static String[] buildArgs(File sourceFile, File targetFile, String[] args)
    {
        ArrayList<String> methodArgs = new ArrayList<>(args.length + 2);
        StringJoiner sj = new StringJoiner(" ");
        for (String arg : args)
        {
            addArg(methodArgs, sj, arg);
        }
        addFileArg(methodArgs, sj, sourceFile);
        addFileArg(methodArgs, sj, targetFile);
        LogEntry.setOptions(sj.toString());
        return methodArgs.toArray(new String[0]);
    }

    /** Adds a non-null option to both the argument list and the logged summary. */
    private static void addArg(ArrayList<String> methodArgs, StringJoiner sj, String arg)
    {
        if (arg != null)
        {
            sj.add(arg);
            methodArgs.add(arg);
        }
    }

    /**
     * Adds a non-null file's absolute path to the argument list and its extension
     * (or {@code "???"} when it has none) to the logged summary.
     */
    private static void addFileArg(ArrayList<String> methodArgs, StringJoiner sj, File arg)
    {
        if (arg != null)
        {
            String path = arg.getAbsolutePath();
            // Look for the extension in the file name only: a '.' in a parent directory
            // name must not be mistaken for the start of the extension.
            String fileName = arg.getName();
            int i = fileName.lastIndexOf('.');
            String ext = i == -1 ? "???" : fileName.substring(i + 1);
            sj.add(ext);
            methodArgs.add(path);
        }
    }

    /**
     * Extracts metadata from the source file using the extractor registered under
     * {@code transformName}.
     *
     * @param transformName    key of the metadata extractor to use
     * @param sourceMimetype   mimetype of the source file
     * @param targetMimetype   target mimetype (not used by this method)
     * @param transformOptions request options, passed through to the extractor
     * @param sourceFile       file to read
     * @param targetFile       file the extractor writes to
     * @throws IllegalArgumentException if no extractor is registered under {@code transformName}
     * @throws Exception                if the extraction fails
     */
    public void extractMetadata(String transformName, String sourceMimetype, String targetMimetype,
                                Map<String, String> transformOptions, File sourceFile, File targetFile)
        throws Exception
    {
        AbstractTikaMetadataExtractor metadataExtractor =
            requireRegistered(this.metadataExtractor, transformName, "extractor");
        metadataExtractor.extractMetadata(sourceMimetype, transformOptions, sourceFile, targetFile);
    }

    /**
     * Embeds metadata using the embedder registered under {@code transformName}.
     *
     * @throws IllegalArgumentException if no embedder is registered under {@code transformName}
     * @throws Exception                if the embedding fails
     * @deprecated The content repository's TikaPoweredMetadataExtracter provides no non test implementations.
     *             This code exists in case there are custom implementations, that need to be converted to T-Engines.
     *             It is simply a copy and paste from the content repository and has received limited testing.
     */
    @Override
    @SuppressWarnings("deprecation")
    public void embedMetadata(String transformName, String sourceMimetype, String targetMimetype,
                              Map<String, String> transformOptions, File sourceFile, File targetFile)
        throws Exception
    {
        AbstractTikaMetadataExtractor metadataExtractor =
            requireRegistered(this.metadataEmbedder, transformName, "embedder");
        metadataExtractor.embedMetadata(sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile);
    }

    /**
     * Looks up {@code transformName} in {@code registry}, failing with a descriptive
     * message instead of a later NullPointerException when nothing is registered.
     */
    private static AbstractTikaMetadataExtractor requireRegistered(
        Map<String, AbstractTikaMetadataExtractor> registry, String transformName, String kind)
    {
        AbstractTikaMetadataExtractor registered = registry.get(transformName);
        if (registered == null)
        {
            throw new IllegalArgumentException(
                "No Tika metadata " + kind + " is registered for transform: " + transformName);
        }
        return registered;
    }
}

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -24,14 +24,14 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.metadataExtractors;
package org.alfresco.transform.tika.metadataExtractors;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import org.junit.jupiter.api.Test;
public class IPTCMetadataExtractorTest {
public class IPTCMetadataExtractorTest
{
IPTCMetadataExtractor extractor = new IPTCMetadataExtractor();
@Test
@@ -42,7 +42,5 @@ public class IPTCMetadataExtractorTest {
"1901-02-01T00:00:00.000Z", "1901-02-01T00:00+00:00", "1901-02-01T00:00+00:00" };
assertArrayEquals(expected, extractor.iptcToIso8601DateStrings(testStrings));
}
}

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -24,7 +24,7 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.tika.parsers;
package org.alfresco.transform.tika.parsers;
import static org.junit.jupiter.api.Assertions.assertEquals;

View File

@@ -2,7 +2,7 @@
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2021 Alfresco Software Limited
* Copyright (C) 2005 - 2022 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
@@ -24,11 +24,11 @@
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package org.alfresco.transformer.executors;
package org.alfresco.transform.tika.transformers;
import static org.alfresco.transformer.executors.Tika.NOT_EXTRACT_BOOKMARKS_TEXT;
import static org.alfresco.transformer.executors.Tika.TARGET_ENCODING;
import static org.alfresco.transformer.executors.Tika.TARGET_MIMETYPE;
import static org.alfresco.transform.tika.transformers.Tika.NOT_EXTRACT_BOOKMARKS_TEXT;
import static org.alfresco.transform.tika.transformers.Tika.TARGET_ENCODING;
import static org.alfresco.transform.tika.transformers.Tika.TARGET_MIMETYPE;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.clearInvocations;
import static org.mockito.Mockito.lenient;
@@ -41,19 +41,33 @@ import java.io.File;
import java.util.HashMap;
import java.util.Map;
import org.apache.tika.parser.Parser;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.junit.jupiter.MockitoExtension;
@ExtendWith(MockitoExtension.class)
public class TikaJavaExecutorTest {
public class GenericTikaTransformerTest
{
private class TikaTestTransformer extends GenericTikaTransformer
{
@Override
protected Parser getParser()
{
return null;
}
TikaTestTransformer(boolean notExtractBookmarksTextDefault)
{
this.notExtractBookmarksTextDefault = notExtractBookmarksTextDefault;
}
};
@Test
public void testNotExtractBookmarkTextDefault() throws Exception
{
TikaJavaExecutor executorSpyDefaultTrue = spy(new TikaJavaExecutor(true));
TikaJavaExecutor executorSpyDefaultFalse = spy(new TikaJavaExecutor(false));
GenericTikaTransformer executorSpyDefaultTrue = spy(new TikaTestTransformer(true));
GenericTikaTransformer executorSpyDefaultFalse = spy(new TikaTestTransformer(false));
File mockSourceFile = mock(File.class);
File mockTargetFile = mock(File.class);
@@ -66,7 +80,7 @@ public class TikaJavaExecutorTest {
lenient().doNothing().when(executorSpyDefaultTrue).call(any(), any(), any(), any(), any(), any(), any());
lenient().doNothing().when(executorSpyDefaultFalse).call(any(), any(), any(), any(), any(), any(), any());
Map<String, String> transformOptions = new HashMap<String,String>();
Map<String, String> transformOptions = new HashMap<>();
// use empty transformOptions to test defaults
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions,