mirror of
https://github.com/Alfresco/alfresco-transform-core.git
synced 2025-08-14 17:58:27 +00:00
Save point: [skip ci]
* Beginnings of new t-base (using TransformEngine and CustomTransformer, no need for a controller or Application in t-engine modules) * Using org.alfresco.transform.<module> package * Beginnings of new Tika t-engine
This commit is contained in:
@@ -20,12 +20,12 @@
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.alfresco</groupId>
|
||||
<artifactId>alfresco-transformer-base</artifactId>
|
||||
<artifactId>alfresco-t-engine-base</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.alfresco</groupId>
|
||||
<artifactId>alfresco-transformer-base</artifactId>
|
||||
<artifactId>alfresco-t-engine-base</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<classifier>tests</classifier>
|
||||
<type>test-jar</type>
|
||||
@@ -146,6 +146,9 @@
|
||||
<plugin>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<mainClass>org.alfresco.transform.base.Application</mainClass>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
|
@@ -1,77 +0,0 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer;
|
||||
|
||||
import io.micrometer.core.instrument.MeterRegistry;
|
||||
import org.alfresco.transformer.executors.TikaJavaExecutor;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.actuate.autoconfigure.metrics.MeterRegistryCustomizer;
|
||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
|
||||
import org.springframework.boot.context.event.ApplicationReadyEvent;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.event.EventListener;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
import static org.alfresco.transformer.logging.StandardMessages.LICENCE;
|
||||
|
||||
/**
 * Spring Boot entry point for the Tika transformer.
 *
 * Registers a Micrometer customizer that tags every metric with the container name and,
 * once the application is ready, writes the licence notices to the log.
 * {@link DataSourceAutoConfiguration} is explicitly excluded from auto-configuration.
 */
@SpringBootApplication
|
||||
@EnableAutoConfiguration(exclude = {DataSourceAutoConfiguration.class})
|
||||
public class Application
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(Application.class);
|
||||
|
||||
// Injected from the "container.name" property; no default is given in the placeholder.
@Value("${container.name}")
|
||||
private String containerName;
|
||||
|
||||
/**
 * Adds a common "containerName" tag, set to the injected container name, to the meter registry.
 *
 * @return the customizer that applies the common tag
 */
@Bean
|
||||
MeterRegistryCustomizer<MeterRegistry> metricsCommonTags()
|
||||
{
|
||||
return registry -> registry.config().commonTags("containerName", containerName);
|
||||
}
|
||||
|
||||
/**
 * Starts the Spring application with this class as the primary source.
 *
 * @param args command line arguments passed through to {@link SpringApplication#run}
 */
public static void main(String[] args)
|
||||
{
|
||||
SpringApplication.run(Application.class, args);
|
||||
}
|
||||
|
||||
/**
 * Runs when an {@link ApplicationReadyEvent} is published: logs the general licence text and
 * the Tika executor's licence text, one log entry per line, between two separator lines.
 */
@EventListener(ApplicationReadyEvent.class)
|
||||
public void startup()
|
||||
{
|
||||
logger.info("--------------------------------------------------------------------------------------------------------------------------------------------------------------");
|
||||
// Split on newlines so that each licence line becomes its own log entry.
Arrays.stream(LICENCE.split("\\n")).forEach(logger::info);
|
||||
Arrays.stream(TikaJavaExecutor.LICENCE.split("\\n")).forEach(logger::info);
|
||||
logger.info("--------------------------------------------------------------------------------------------------------------------------------------------------------------");
|
||||
|
||||
logger.info("Starting application components... Done");
|
||||
}
|
||||
}
|
@@ -1,111 +0,0 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer;
|
||||
|
||||
import org.alfresco.transformer.executors.TikaJavaExecutor;
|
||||
import org.alfresco.transformer.probes.ProbeTestTransform;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Controller;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transformer.executors.Tika.PDF_BOX;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_PDF;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
|
||||
/**
|
||||
* Controller for the Docker based Tika transformers.
|
||||
*
|
||||
* Status Codes:
|
||||
*
|
||||
* 200 Success
|
||||
* 400 Bad Request: Invalid target mimetype <mimetype>
|
||||
* 400 Bad Request: Request parameter <name> is missing (missing mandatory parameter)
|
||||
* 400 Bad Request: Request parameter <name> is of the wrong type
|
||||
* 400 Bad Request: Transformer exit code was not 0 (possible problem with the source file)
|
||||
* 400 Bad Request: The source filename was not supplied
|
||||
* 500 Internal Server Error: (no message with low level IO problems)
|
||||
* 500 Internal Server Error: The target filename was not supplied (should not happen as targetExtension is checked)
|
||||
* 500 Internal Server Error: Transformer version check exit code was not 0
|
||||
* 500 Internal Server Error: Transformer version check failed to create any output
|
||||
* 500 Internal Server Error: Could not read the target file
|
||||
* 500 Internal Server Error: The target filename was malformed (should not happen because of other checks)
|
||||
* 500 Internal Server Error: Transformer failed to create an output file (the exit code was 0, so there should be some content)
|
||||
* 500 Internal Server Error: Filename encoding error
|
||||
* 507 Insufficient Storage: Failed to store the source file
|
||||
*/
|
||||
@Controller
|
||||
public class TikaController extends AbstractTransformerController
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(TikaController.class);
|
||||
|
||||
// Performs the actual work; every transform request is delegated to this executor.
private TikaJavaExecutor javaExecutor;
|
||||
|
||||
/**
 * Creates the controller and its {@link TikaJavaExecutor}.
 *
 * @param notExtractBookmarksTextDefault value of the
 *        "transform.core.tika.pdfBox.notExtractBookmarksTextDefault" property
 *        (false when the property is not set); passed straight to the executor
 */
public TikaController(@Value("${transform.core.tika.pdfBox.notExtractBookmarksTextDefault:false}") boolean notExtractBookmarksTextDefault)
|
||||
{
|
||||
javaExecutor= new TikaJavaExecutor(notExtractBookmarksTextDefault);
|
||||
}
|
||||
|
||||
/**
 * @return the fixed transformer name "Tika"
 */
@Override
|
||||
public String getTransformerName()
|
||||
{
|
||||
return "Tika";
|
||||
}
|
||||
|
||||
/**
 * @return a fixed availability message; no version lookup is performed here
 */
@Override
|
||||
public String version()
|
||||
{
|
||||
return "Tika available";
|
||||
}
|
||||
|
||||
/**
 * Builds the probe transform, which converts "quick.pdf" to "quick.txt" using the
 * PDF_BOX transform with no transform options.
 *
 * @return a new probe test transform bound to this controller
 */
@Override
|
||||
public ProbeTestTransform getProbeTestTransform()
|
||||
{
|
||||
// See the Javadoc on this method and Probes.md for the choice of these values.
|
||||
// the livenessPercentage is a little large as Tika does tend to suffer from slow transforms that clash with a gc.
|
||||
return new ProbeTestTransform(this, "quick.pdf", "quick.txt",
|
||||
60, 16, 400, 10240, 60 * 30 + 1, 60 * 15 + 20)
|
||||
{
|
||||
@Override
|
||||
protected void executeTransformCommand(File sourceFile, File targetFile)
|
||||
{
|
||||
// The probe always runs a PDF to plain text transform with an empty options map.
transformImpl(PDF_BOX, MIMETYPE_PDF, MIMETYPE_TEXT_PLAIN, new HashMap<>(), sourceFile, targetFile);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Delegates the transform, unchanged, to the Tika executor.
 *
 * @param transformName    name of the transform to run
 * @param sourceMimetype   mimetype of the source file
 * @param targetMimetype   mimetype requested for the target file
 * @param transformOptions options passed through to the executor
 * @param sourceFile       file to read from
 * @param targetFile       file to write to
 */
@Override
|
||||
public void transformImpl(String transformName, String sourceMimetype, String targetMimetype,
|
||||
Map<String, String> transformOptions, File sourceFile, File targetFile)
|
||||
{
|
||||
javaExecutor.transformExtractOrEmbed(transformName, sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile);
|
||||
}
|
||||
}
|
@@ -24,30 +24,30 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer;
|
||||
package org.alfresco.transform.tika;
|
||||
|
||||
import static java.nio.file.Files.readAllBytes;
|
||||
import static org.alfresco.transform.common.RequestParamMap.ENDPOINT_TRANSFORM;
|
||||
import static org.alfresco.transformer.executors.Tika.ARCHIVE;
|
||||
import static org.alfresco.transformer.executors.Tika.CSV;
|
||||
import static org.alfresco.transformer.executors.Tika.DOC;
|
||||
import static org.alfresco.transformer.executors.Tika.DOCX;
|
||||
import static org.alfresco.transformer.executors.Tika.HTML;
|
||||
import static org.alfresco.transformer.executors.Tika.MSG;
|
||||
import static org.alfresco.transformer.executors.Tika.OUTLOOK_MSG;
|
||||
import static org.alfresco.transformer.executors.Tika.PDF;
|
||||
import static org.alfresco.transformer.executors.Tika.PDF_BOX;
|
||||
import static org.alfresco.transformer.executors.Tika.POI;
|
||||
import static org.alfresco.transformer.executors.Tika.POI_OFFICE;
|
||||
import static org.alfresco.transformer.executors.Tika.POI_OO_XML;
|
||||
import static org.alfresco.transformer.executors.Tika.PPTX;
|
||||
import static org.alfresco.transformer.executors.Tika.TEXT_MINING;
|
||||
import static org.alfresco.transformer.executors.Tika.TIKA_AUTO;
|
||||
import static org.alfresco.transformer.executors.Tika.TXT;
|
||||
import static org.alfresco.transformer.executors.Tika.XHTML;
|
||||
import static org.alfresco.transformer.executors.Tika.XML;
|
||||
import static org.alfresco.transformer.executors.Tika.XSLX;
|
||||
import static org.alfresco.transformer.executors.Tika.ZIP;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.ARCHIVE;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.CSV;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.DOC;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.DOCX;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.HTML;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.MSG;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.OUTLOOK_MSG;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.PDF;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.PDF_BOX;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.POI;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.OFFICE;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.OOXML;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.PPTX;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.TEXT_MINING;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.TIKA_AUTO;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.TXT;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.XHTML;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.XML;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.XSLX;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.ZIP;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_METADATA_EMBED;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENXML_PRESENTATION;
|
||||
@@ -61,8 +61,8 @@ import static org.alfresco.transform.common.Mimetype.MIMETYPE_WORD;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XHTML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_ZIP;
|
||||
import static org.alfresco.transformer.util.RequestParamMap.INCLUDE_CONTENTS;
|
||||
import static org.alfresco.transformer.util.RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT;
|
||||
import static org.alfresco.transform.base.util.RequestParamMap.INCLUDE_CONTENTS;
|
||||
import static org.alfresco.transform.base.util.RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT;
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
@@ -91,12 +91,14 @@ import java.util.UUID;
|
||||
|
||||
import javax.servlet.http.HttpServletRequest;
|
||||
|
||||
import org.alfresco.transform.base.AbstractTransformerControllerTest;
|
||||
import org.alfresco.transform.base.TransformController;
|
||||
import org.alfresco.transform.client.model.TransformReply;
|
||||
import org.alfresco.transform.client.model.TransformRequest;
|
||||
import org.alfresco.transformer.executors.RuntimeExec;
|
||||
import org.alfresco.transformer.model.FileRefEntity;
|
||||
import org.alfresco.transformer.model.FileRefResponse;
|
||||
import org.alfresco.transformer.probes.ProbeTestTransform;
|
||||
import org.alfresco.transform.base.executors.RuntimeExec;
|
||||
import org.alfresco.transform.base.model.FileRefEntity;
|
||||
import org.alfresco.transform.base.model.FileRefResponse;
|
||||
import org.alfresco.transform.base.probes.ProbeTestTransform;
|
||||
import org.apache.poi.ooxml.POIXMLProperties;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
@@ -142,9 +144,6 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
|
||||
@Mock
|
||||
private RuntimeExec mockCheckCommand;
|
||||
|
||||
@Autowired
|
||||
protected AbstractTransformerController controller;
|
||||
|
||||
private String targetEncoding = "UTF-8";
|
||||
private String targetMimetype = MIMETYPE_TEXT_PLAIN;
|
||||
|
||||
@@ -236,12 +235,6 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
|
||||
when(mockExecutionResult.getStdOut()).thenReturn("STDOUT");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected AbstractTransformerController getController()
|
||||
{
|
||||
return controller;
|
||||
}
|
||||
|
||||
private void transform(String transform, String sourceExtension, String targetExtension,
|
||||
String sourceMimetype, String targetMimetype,
|
||||
Boolean includeContents, String expectedContentContains) throws Exception
|
||||
@@ -284,9 +277,10 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
|
||||
public void testImmutableEmptyMap()
|
||||
{
|
||||
// See ACS-373
|
||||
ProbeTestTransform probeTestTransform = getController().getProbeTestTransform();
|
||||
TransformController controller = getController();
|
||||
ProbeTestTransform probeTestTransform = getProbeTestTransform();
|
||||
ReflectionTestUtils.setField(probeTestTransform, "livenessTransformEnabled", true);
|
||||
probeTestTransform.doTransformOrNothing(httpServletRequest, true);
|
||||
probeTestTransform.doTransformOrNothing(httpServletRequest, true, controller);
|
||||
}
|
||||
|
||||
@Test
|
||||
@@ -483,14 +477,14 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
|
||||
@Test
|
||||
public void msgToTxtOfficeTest() throws Exception
|
||||
{
|
||||
transform(POI_OFFICE, MSG, TXT, MIMETYPE_OUTLOOK_MSG, MIMETYPE_TEXT_PLAIN, null,
|
||||
transform(OFFICE, MSG, TXT, MIMETYPE_OUTLOOK_MSG, MIMETYPE_TEXT_PLAIN, null,
|
||||
EXPECTED_MSG_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void docToTxtOfficeTest() throws Exception
|
||||
{
|
||||
transform(POI_OFFICE, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null,
|
||||
transform(OFFICE, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null,
|
||||
EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
@@ -508,14 +502,14 @@ public class TikaControllerTest extends AbstractTransformerControllerTest
|
||||
@Test
|
||||
public void docxToTxtOoXmlTest() throws Exception
|
||||
{
|
||||
transform(POI_OO_XML, DOCX, TXT, MIMETYPE_OPENXML_WORDPROCESSING, MIMETYPE_TEXT_PLAIN, null,
|
||||
transform(OOXML, DOCX, TXT, MIMETYPE_OPENXML_WORDPROCESSING, MIMETYPE_TEXT_PLAIN, null,
|
||||
EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void pptxToTxtOoXmlTest() throws Exception
|
||||
{
|
||||
transform(POI_OO_XML, PPTX, TXT, MIMETYPE_OPENXML_PRESENTATION, MIMETYPE_TEXT_PLAIN, null,
|
||||
transform(OOXML, PPTX, TXT, MIMETYPE_OPENXML_PRESENTATION, MIMETYPE_TEXT_PLAIN, null,
|
||||
EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
@@ -24,15 +24,11 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer;
|
||||
|
||||
import static org.springframework.http.MediaType.MULTIPART_FORM_DATA;
|
||||
package org.alfresco.transform.tika;
|
||||
|
||||
import org.alfresco.transform.base.AbstractHttpRequestTest;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.boot.test.context.SpringBootTest.WebEnvironment;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.http.HttpEntity;
|
||||
import org.springframework.http.HttpHeaders;
|
||||
import org.springframework.util.LinkedMultiValueMap;
|
||||
|
||||
/**
|
@@ -24,11 +24,11 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer;
|
||||
package org.alfresco.transform.tika;
|
||||
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_APP_DWG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OUTLOOK_MSG;
|
||||
import static org.alfresco.transformer.TestFileInfo.testFile;
|
||||
import static org.alfresco.transform.base.TestFileInfo.testFile;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_AUDIO_MP4;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_EXCEL;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_BMP;
|
||||
@@ -74,6 +74,8 @@ import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_RAW_NEF;
|
||||
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.alfresco.transform.base.AbstractMetadataExtractsIT;
|
||||
import org.alfresco.transform.base.TestFileInfo;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
|
@@ -24,7 +24,7 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer;
|
||||
package org.alfresco.transform.tika;
|
||||
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_OPENXML_WORDPROCESSING;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
@@ -32,6 +32,7 @@ import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
import java.util.UUID;
|
||||
|
||||
import org.alfresco.transform.client.model.TransformRequest;
|
||||
import org.alfresco.transform.base.AbstractQueueTransformServiceIT;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
|
||||
/**
|
@@ -24,11 +24,11 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer;
|
||||
package org.alfresco.transform.tika;
|
||||
|
||||
import static java.text.MessageFormat.format;
|
||||
import static java.util.function.Function.identity;
|
||||
import static org.alfresco.transformer.EngineClient.sendTRequest;
|
||||
import static org.alfresco.transform.base.EngineClient.sendTRequest;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.fail;
|
||||
import static org.springframework.http.HttpStatus.OK;
|
@@ -14,7 +14,7 @@
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.alfresco</groupId>
|
||||
<artifactId>alfresco-transformer-base</artifactId>
|
||||
<artifactId>alfresco-t-engine-base</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
|
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika;
|
||||
|
||||
import org.alfresco.transform.base.TransformEngine;
|
||||
import org.alfresco.transform.base.probes.ProbeTestTransform;
|
||||
import org.alfresco.transform.common.TransformConfigResourceReader;
|
||||
import org.alfresco.transform.config.TransformConfig;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.Collections;
|
||||
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_PDF;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
|
||||
/**
 * {@link TransformEngine} implementation for Tika, registered as a Spring component.
 *
 * Supplies the engine name, the licence text used as the startup message, the transform
 * configuration read from a configurable location, and the probe transform definition.
 */
@Component
|
||||
public class TikaTransformEngine implements TransformEngine
|
||||
{
|
||||
// Two-line licence notice returned by getStartupMessage().
private static final String LICENCE =
|
||||
"This transformer uses Tika from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0. or in /Apache\\ 2.0.txt\n" +
|
||||
"This transformer uses ExifTool by Phil Harvey. See license at https://exiftool.org/#license. or in /Perl-Artistic-License.txt";
|
||||
|
||||
@Autowired
|
||||
private TransformConfigResourceReader transformConfigResourceReader;
|
||||
// Taken from "transform.core.config.location"; defaults to classpath:engine_config.json.
@Value("${transform.core.config.location:classpath:engine_config.json}")
|
||||
private String engineConfigLocation;
|
||||
|
||||
/**
 * @return the fixed engine name "0001-Tika"
 *         (NOTE(review): the significance of the numeric prefix is not visible here - confirm)
 */
@Override
|
||||
public String getTransformEngineName()
|
||||
{
|
||||
return "0001-Tika";
|
||||
}
|
||||
|
||||
/**
 * @return the licence notice for Tika and ExifTool
 */
@Override
|
||||
public String getStartupMessage() {
|
||||
return LICENCE;
|
||||
}
|
||||
|
||||
/**
 * Reads the transform configuration from the configured location on every call.
 *
 * @return the configuration returned by the resource reader
 */
@Override
|
||||
public TransformConfig getTransformConfig()
|
||||
{
|
||||
return transformConfigResourceReader.read(engineConfigLocation);
|
||||
}
|
||||
|
||||
/**
 * Defines the probe transform: "quick.pdf" to "quick.txt", PDF to plain text, with an
 * empty options map.
 *
 * NOTE(review): the meaning of the six numeric arguments is defined by the
 * ProbeTestTransform constructor, which is not visible here - confirm before changing them.
 *
 * @return a new probe test transform
 */
@Override
|
||||
public ProbeTestTransform getLivenessAndReadinessProbeTestTransform()
|
||||
{
|
||||
return new ProbeTestTransform("quick.pdf", "quick.txt",
|
||||
MIMETYPE_PDF, MIMETYPE_TEXT_PLAIN, Collections.emptyMap(),
|
||||
60, 16, 400, 10240, 60 * 30 + 1, 60 * 15 + 20);
|
||||
}
|
||||
}
|
@@ -24,13 +24,15 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.alfresco.transform.base.CustomTransformer;
|
||||
import org.alfresco.transform.common.TransformException;
|
||||
import org.alfresco.transform.base.metadataExtractors.AbstractMetadataExtractor;
|
||||
import org.apache.tika.embedder.Embedder;
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.metadata.DublinCore;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.OfficeOpenXMLCore;
|
||||
import org.apache.tika.metadata.Property;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
@@ -67,6 +69,8 @@ import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
/**
|
||||
* The parent of all Metadata Extractors which use Apache Tika under the hood. This handles all the
|
||||
* common parts of processing the files, and the common mappings.
|
||||
@@ -82,7 +86,7 @@ import java.util.stream.Stream;
|
||||
* @author Nick Burch
|
||||
* @author adavis
|
||||
*/
|
||||
public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtractor
|
||||
public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtractor implements CustomTransformer
|
||||
{
|
||||
protected static final String KEY_AUTHOR = "author";
|
||||
protected static final String KEY_TITLE = "title";
|
||||
@@ -97,9 +101,17 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
|
||||
private final DateTimeFormatter tikaUTCDateFormater;
|
||||
private final DateTimeFormatter tikaDateFormater;
|
||||
|
||||
public AbstractTikaMetadataExtractor(Logger logger)
|
||||
/**
 * Selects which operation an instance performs when it is asked to transform:
 * metadata extraction or metadata embedding.
 */
public static enum Type
|
||||
{
|
||||
EXTRACTOR, EMBEDDER
|
||||
}
|
||||
|
||||
private final Type type;
|
||||
|
||||
public AbstractTikaMetadataExtractor(Type type, Logger logger)
|
||||
{
|
||||
super(logger);
|
||||
this.type = type;
|
||||
|
||||
// TODO Once TIKA-451 is fixed this list will get nicer
|
||||
DateTimeParser[] parsersUTC = {
|
||||
@@ -118,6 +130,26 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
|
||||
tikaDateFormater = new DateTimeFormatterBuilder().append(null, parsers).toFormatter();
|
||||
}
|
||||
|
||||
/**
 * @return the simple class name of the concrete subclass, used as the transformer name
 */
@Override
|
||||
public String getTransformerName() {
|
||||
return getClass().getSimpleName();
|
||||
}
|
||||
|
||||
/**
 * Dispatches to metadata extraction or metadata embedding depending on the type this
 * instance was constructed with.
 *
 * @param sourceMimetype   mimetype of the source content
 * @param sourceEncoding   encoding of the source content
 * @param inputStream      stream to read the source from
 * @param targetMimetype   requested target mimetype; not forwarded to either operation
 * @param targetEncoding   encoding for the output
 * @param outputStream     stream to write the result to
 * @param transformOptions options passed through to the chosen operation
 * @throws Exception if the chosen operation fails
 */
@Override
|
||||
public void transform(String sourceMimetype, String sourceEncoding, InputStream inputStream,
|
||||
String targetMimetype, String targetEncoding, OutputStream outputStream,
|
||||
Map<String, String> transformOptions) throws Exception
|
||||
{
|
||||
if (type == EXTRACTOR)
|
||||
{
|
||||
extractMetadata(sourceMimetype, transformOptions, sourceEncoding, inputStream, targetEncoding, outputStream);
|
||||
}
|
||||
// Any type other than EXTRACTOR is treated as an embedder.
else
|
||||
{
|
||||
embedMetadata(sourceMimetype, transformOptions, sourceEncoding, inputStream, targetEncoding, outputStream);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Version which also tries the ISO-8601 formats (in order..),
|
||||
* and similar formats, which Tika makes use of
|
||||
@@ -308,6 +340,14 @@ public abstract class AbstractTikaMetadataExtractor extends AbstractMetadataExtr
|
||||
return rawProperties;
|
||||
}
|
||||
|
||||
/**
 * Placeholder for metadata embedding: not implemented yet, so every call fails.
 *
 * @param sourceMimetype   mimetype of the source content (currently unused)
 * @param transformOptions transform options (currently unused)
 * @param sourceEncoding   encoding of the source content (currently unused)
 * @param inputStream      stream holding the source (currently unused)
 * @param targetEncoding   encoding for the output (currently unused)
 * @param outputStream     stream for the result (currently unused)
 * @throws TransformException always, with status 500
 */
public void embedMetadata(String sourceMimetype, Map<String, String> transformOptions,
|
||||
String sourceEncoding, InputStream inputStream,
|
||||
String targetEncoding, OutputStream outputStream) throws Exception
|
||||
{
|
||||
// TODO
|
||||
throw new TransformException(500, "TODO embedMetadata");
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated The content repository's TikaPoweredMetadataExtracter provides no non test implementations.
|
||||
* This code exists in case there are custom implementations, that need to be converted to T-Engines.
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -24,7 +24,7 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
@@ -32,10 +32,13 @@ import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.dwg.DWGParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
/**
|
||||
* {@code "application/dwg"} and {@code "image/vnd.dwg"} metadata extractor.
|
||||
*
|
||||
@@ -53,6 +56,7 @@ import java.util.Map;
|
||||
* @author Nick Burch
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class DWGMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(DWGMetadataExtractor.class);
|
||||
@@ -62,7 +66,7 @@ public class DWGMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
|
||||
public DWGMetadataExtractor()
|
||||
{
|
||||
super(logger);
|
||||
super(EXTRACTOR, logger);
|
||||
}
|
||||
|
||||
@Override
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -24,7 +24,7 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
@@ -33,13 +33,17 @@ import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.alfresco.transformer.tika.parsers.ExifToolParser;
|
||||
import org.alfresco.transform.tika.parsers.ExifToolParser;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
@Component
|
||||
public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
|
||||
@@ -53,7 +57,7 @@ public class IPTCMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
|
||||
public IPTCMetadataExtractor()
|
||||
{
|
||||
super(logger);
|
||||
super(EXTRACTOR, logger);
|
||||
}
|
||||
|
||||
@Override
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -24,7 +24,7 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaCoreProperties;
|
||||
@@ -33,10 +33,13 @@ import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.mp3.Mp3Parser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
/**
|
||||
* MP3 file metadata extractor.
|
||||
*
|
||||
@@ -63,6 +66,7 @@ import java.util.Map;
|
||||
* @author Nick Burch
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class MP3MetadataExtractor extends TikaAudioMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(MP3MetadataExtractor.class);
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -24,7 +24,7 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.apache.tika.metadata.Message;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
@@ -33,10 +33,13 @@ import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
/**
|
||||
* Outlook MAPI format email metadata extractor.
|
||||
*
|
||||
@@ -59,6 +62,7 @@ import java.util.Map;
|
||||
* @author Kevin Roast
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class MailMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(MailMetadataExtractor.class);
|
||||
@@ -74,7 +78,7 @@ public class MailMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
|
||||
public MailMetadataExtractor()
|
||||
{
|
||||
super(logger);
|
||||
super(EXTRACTOR, logger);
|
||||
}
|
||||
|
||||
@Override
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -24,7 +24,7 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.Office;
|
||||
@@ -33,10 +33,13 @@ import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
/**
|
||||
* Office file format metadata extractor.
|
||||
*
|
||||
@@ -67,6 +70,7 @@ import java.util.Map;
|
||||
* @author Nick Burch
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class OfficeMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(OfficeMetadataExtractor.class);
|
||||
@@ -84,7 +88,7 @@ public class OfficeMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
|
||||
public OfficeMetadataExtractor()
|
||||
{
|
||||
super(logger);
|
||||
super(EXTRACTOR, logger);
|
||||
}
|
||||
|
||||
@Override
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -24,8 +24,9 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
|
||||
import static org.apache.tika.metadata.DublinCore.NAMESPACE_URI_DC;
|
||||
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
@@ -40,6 +41,7 @@ import org.joda.time.format.DateTimeFormat;
|
||||
import org.joda.time.format.DateTimeFormatter;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.xml.sax.ContentHandler;
|
||||
|
||||
import java.io.Serializable;
|
||||
@@ -76,6 +78,7 @@ import java.util.stream.Collectors;
|
||||
* @author Derek Hulley
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(OpenDocumentMetadataExtractor.class);
|
||||
@@ -95,7 +98,7 @@ public class OpenDocumentMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
|
||||
public OpenDocumentMetadataExtractor()
|
||||
{
|
||||
super(logger);
|
||||
super(EXTRACTOR, logger);
|
||||
}
|
||||
|
||||
@Override
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -24,15 +24,18 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.alfresco.transformer.executors.Tika;
|
||||
import org.alfresco.transform.tika.transformers.Tika;
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.pdf.PDFParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
/**
|
||||
* Metadata extractor for the PDF documents.
|
||||
@@ -52,13 +55,14 @@ import org.slf4j.LoggerFactory;
|
||||
* @author Derek Hulley
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class PdfBoxMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(PdfBoxMetadataExtractor.class);
|
||||
|
||||
public PdfBoxMetadataExtractor()
|
||||
{
|
||||
super(logger);
|
||||
super(EXTRACTOR, logger);
|
||||
}
|
||||
|
||||
@Override
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -24,7 +24,7 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.apache.poi.ooxml.POIXMLProperties;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
@@ -36,6 +36,7 @@ import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
@@ -44,6 +45,8 @@ import java.util.Collections;
|
||||
import java.util.Set;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
/**
|
||||
* POI-based metadata extractor for Office 07 documents. See http://poi.apache.org/ for information on POI.
|
||||
*
|
||||
@@ -92,13 +95,14 @@ import java.util.StringJoiner;
|
||||
* @author Dmitry Velichkevich
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class PoiMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(PoiMetadataExtractor.class);
|
||||
|
||||
public PoiMetadataExtractor()
|
||||
{
|
||||
super(logger);
|
||||
super(EXTRACTOR, logger);
|
||||
}
|
||||
|
||||
@Override
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -24,7 +24,7 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
@@ -37,12 +37,14 @@ import org.gagravarr.tika.FlacParser;
|
||||
import org.gagravarr.tika.VorbisParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Calendar;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transformer.executors.Tika.readTikaConfig;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.readTikaConfig;
|
||||
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
/**
|
||||
* A Metadata Extractor which makes use of the Apache Tika Audio Parsers to extract metadata from media files.
|
||||
@@ -66,6 +68,7 @@ import static org.alfresco.transformer.executors.Tika.readTikaConfig;
|
||||
* @author Nick Burch
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(TikaAudioMetadataExtractor.class);
|
||||
@@ -86,7 +89,7 @@ public class TikaAudioMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
|
||||
public TikaAudioMetadataExtractor(Logger logger)
|
||||
{
|
||||
super(logger);
|
||||
super(EXTRACTOR, logger);
|
||||
tikaConfig = readTikaConfig(logger);
|
||||
}
|
||||
|
@@ -24,9 +24,8 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import org.alfresco.transform.common.Mimetype;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TIFF;
|
||||
@@ -34,12 +33,14 @@ import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
|
||||
import static org.alfresco.transformer.executors.Tika.readTikaConfig;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.readTikaConfig;
|
||||
import static org.alfresco.transform.tika.metadataExtractors.AbstractTikaMetadataExtractor.Type.EXTRACTOR;
|
||||
|
||||
/**
|
||||
* A Metadata Extractor which makes use of the Apache Tika auto-detection to select the best parser to extract the
|
||||
@@ -61,6 +62,7 @@ import static org.alfresco.transformer.executors.Tika.readTikaConfig;
|
||||
* @author Nick Burch
|
||||
* @author adavis
|
||||
*/
|
||||
@Component
|
||||
public class TikaAutoMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(TikaAutoMetadataExtractor.class);
|
||||
@@ -75,7 +77,7 @@ public class TikaAutoMetadataExtractor extends AbstractTikaMetadataExtractor
|
||||
|
||||
public TikaAutoMetadataExtractor()
|
||||
{
|
||||
super(logger);
|
||||
super(EXTRACTOR, logger);
|
||||
tikaConfig = readTikaConfig(logger);
|
||||
}
|
||||
|
@@ -24,7 +24,7 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.tika.parsers;
|
||||
package org.alfresco.transform.tika.parsers;
|
||||
|
||||
import static java.nio.charset.StandardCharsets.UTF_8;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -24,7 +24,7 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.executors;
|
||||
package org.alfresco.transform.tika.parsers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
@@ -0,0 +1,42 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import javax.annotation.PostConstruct;
|
||||
|
||||
@Component
|
||||
public class ArchiveTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return tika.packageParser;
|
||||
}
|
||||
}
|
@@ -0,0 +1,146 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.alfresco.transform.base.CustomTransformer;
|
||||
import org.alfresco.transform.base.logging.LogEntry;
|
||||
import org.alfresco.transform.base.util.RequestParamMap;
|
||||
import org.alfresco.transform.common.TransformException;
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Map;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
import static java.lang.Boolean.parseBoolean;
|
||||
|
||||
public abstract class GenericTikaTransformer implements CustomTransformer
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(GenericTikaTransformer.class);
|
||||
|
||||
@Value("${transform.core.tika.pdfBox.notExtractBookmarksTextDefault:false}")
|
||||
boolean notExtractBookmarksTextDefault;
|
||||
|
||||
@Autowired
|
||||
protected Tika tika;
|
||||
|
||||
protected abstract Parser getParser();
|
||||
|
||||
protected DocumentSelector getDocumentSelector()
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getTransformerName()
|
||||
{
|
||||
String simpleClassName = getClass().getSimpleName();
|
||||
return simpleClassName.substring(0, simpleClassName.length()-"Transformer".length());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transform(String sourceMimetype, String sourceEncoding, InputStream inputStream,
|
||||
String targetMimetype, String targetEncoding, OutputStream outputStream,
|
||||
Map<String, String> transformOptions) throws Exception
|
||||
{
|
||||
// TODO
|
||||
throw new TransformException(500, "TODO GenericTikaTransformer transform with InputStreams");
|
||||
}
|
||||
|
||||
public void transform(String transformName, String sourceMimetype, String targetMimetype,
|
||||
Map<String, String> transformOptions, File sourceFile, File targetFile)
|
||||
throws Exception
|
||||
{
|
||||
final boolean includeContents = parseBoolean(
|
||||
transformOptions.getOrDefault(RequestParamMap.INCLUDE_CONTENTS, "false"));
|
||||
final boolean notExtractBookmarksText = parseBoolean(
|
||||
transformOptions.getOrDefault(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT, String.valueOf(notExtractBookmarksTextDefault)));
|
||||
final String targetEncoding = transformOptions.getOrDefault("targetEncoding", "UTF-8");
|
||||
if (transformOptions.get(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT) == null && notExtractBookmarksTextDefault)
|
||||
{
|
||||
logger.trace("notExtractBookmarksText default value has been overridden to {}", notExtractBookmarksTextDefault);
|
||||
}
|
||||
call(sourceFile, targetFile, transformName,
|
||||
includeContents ? Tika.INCLUDE_CONTENTS : null,
|
||||
notExtractBookmarksText ? Tika.NOT_EXTRACT_BOOKMARKS_TEXT : null,
|
||||
Tika.TARGET_MIMETYPE + targetMimetype, Tika.TARGET_ENCODING + targetEncoding);
|
||||
}
|
||||
|
||||
void call(File sourceFile, File targetFile, String... args)
|
||||
{
|
||||
Parser parser = getParser();
|
||||
DocumentSelector documentSelector = getDocumentSelector();
|
||||
args = buildArgs(sourceFile, targetFile, args);
|
||||
tika.transform(parser, documentSelector, args);
|
||||
}
|
||||
|
||||
private static String[] buildArgs(File sourceFile, File targetFile, String[] args)
|
||||
{
|
||||
ArrayList<String> methodArgs = new ArrayList<>(args.length + 2);
|
||||
StringJoiner sj = new StringJoiner(" ");
|
||||
for (String arg : args)
|
||||
{
|
||||
addArg(methodArgs, sj, arg);
|
||||
}
|
||||
|
||||
addFileArg(methodArgs, sj, sourceFile);
|
||||
addFileArg(methodArgs, sj, targetFile);
|
||||
|
||||
LogEntry.setOptions(sj.toString());
|
||||
|
||||
return methodArgs.toArray(new String[0]);
|
||||
}
|
||||
|
||||
private static void addArg(ArrayList<String> methodArgs, StringJoiner sj, String arg)
|
||||
{
|
||||
if (arg != null)
|
||||
{
|
||||
sj.add(arg);
|
||||
methodArgs.add(arg);
|
||||
}
|
||||
}
|
||||
|
||||
private static void addFileArg(ArrayList<String> methodArgs, StringJoiner sj, File arg)
|
||||
{
|
||||
if (arg != null)
|
||||
{
|
||||
String path = arg.getAbsolutePath();
|
||||
int i = path.lastIndexOf('.');
|
||||
String ext = i == -1 ? "???" : path.substring(i + 1);
|
||||
sj.add(ext);
|
||||
methodArgs.add(path);
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class OOXMLTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return tika.ooXmlParser;
|
||||
}
|
||||
}
|
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class OfficeTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return tika.officeParser;
|
||||
}
|
||||
}
|
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class OutlookMsgTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return tika.officeParser;
|
||||
}
|
||||
}
|
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class PdfBoxTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return tika.pdfParser;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected DocumentSelector getDocumentSelector()
|
||||
{
|
||||
return tika.pdfBoxEmbededDocumentSelector;
|
||||
}
|
||||
}
|
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class PoiTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return tika.tikaOfficeDetectParser;
|
||||
}
|
||||
}
|
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class TextMiningTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return tika.officeParser;
|
||||
}
|
||||
}
|
@@ -0,0 +1,446 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import org.alfresco.transform.tika.parsers.TikaOfficeDetectParser;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.EmptyParser;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||
import org.apache.tika.parser.pdf.PDFParser;
|
||||
import org.apache.tika.parser.pdf.PDFParserConfig;
|
||||
import org.apache.tika.parser.pkg.PackageParser;
|
||||
import org.apache.tika.sax.BodyContentHandler;
|
||||
import org.apache.tika.sax.ExpandedTitleContentHandler;
|
||||
import org.slf4j.Logger;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.TransformerConfigurationException;
|
||||
import javax.xml.transform.sax.SAXTransformerFactory;
|
||||
import javax.xml.transform.sax.TransformerHandler;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Writer;
|
||||
import java.net.URL;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_PNG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_TIFF;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_CSV;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XHTML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XML;
|
||||
|
||||
@Component
|
||||
public class Tika
|
||||
{
|
||||
// Names of the Tika transforms.
public static final String ARCHIVE = "Archive";
|
||||
public static final String OUTLOOK_MSG = "OutlookMsg";
|
||||
public static final String PDF_BOX = "PdfBox";
|
||||
public static final String OFFICE = "Office";
|
||||
public static final String POI = "Poi";
|
||||
public static final String OOXML = "OOXML";
|
||||
public static final String TIKA_AUTO = "TikaAuto";
|
||||
public static final String TEXT_MINING = "TextMining";
|
||||
|
||||
// Options recognised by transform(Parser, DocumentSelector, String[]).
// The two ending in '=' carry a value; the other two are flags without a value.
public static final String TARGET_MIMETYPE = "--targetMimetype=";
|
||||
public static final String TARGET_ENCODING = "--targetEncoding=";
|
||||
public static final String INCLUDE_CONTENTS = "--includeContents";
|
||||
public static final String NOT_EXTRACT_BOOKMARKS_TEXT = "--notExtractBookmarksText";
|
||||
|
||||
// Short type names, presumably file extensions - TODO confirm.
// HTML and XML are also used by getContentHandler as the serializer's output METHOD.
public static final String CSV = "csv";
|
||||
public static final String DOC = "doc";
|
||||
public static final String DOCX = "docx";
|
||||
public static final String HTML = "html";
|
||||
public static final String MSG = "msg";
|
||||
public static final String PDF = "pdf";
|
||||
public static final String PPTX = "pptx";
|
||||
public static final String TXT = "txt";
|
||||
public static final String XHTML = "xhtml";
|
||||
// NOTE(review): both the name and the value read "xslx"; possibly a typo for "xlsx".
// Confirm against the users of this constant before changing either.
public static final String XSLX = "xslx";
|
||||
public static final String XML = "xml";
|
||||
public static final String ZIP = "zip";
|
||||
|
||||
// Parser instances. All are static and shared except autoDetectParser, which is built per
// instance in the constructor from tika-config.xml.
public static final Parser packageParser = new PackageParser();
|
||||
public static final Parser pdfParser = new PDFParser();
|
||||
public static final Parser officeParser = new OfficeParser();
|
||||
public final Parser autoDetectParser;
|
||||
public static final Parser ooXmlParser = new OOXMLParser();
|
||||
public static final Parser tikaOfficeDetectParser = new TikaOfficeDetectParser();
|
||||
// Modified by buildParseContext (bookmark text extraction switched off) and then placed in
// the parse context.
public final PDFParserConfig pdfParserConfig = new PDFParserConfig();
|
||||
|
||||
public static final DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector()
|
||||
{
|
||||
private final List<String> disabledMediaTypes = ImmutableList.of(MIMETYPE_IMAGE_JPEG,
|
||||
MIMETYPE_IMAGE_TIFF, MIMETYPE_IMAGE_PNG);
|
||||
|
||||
@Override
|
||||
public boolean select(Metadata metadata)
|
||||
{
|
||||
String contentType = metadata.get(Metadata.CONTENT_TYPE);
|
||||
if (contentType == null || contentType.equals("") || disabledMediaTypes == null)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return !disabledMediaTypes.contains(contentType);
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Builds {@link #autoDetectParser} from the configuration loaded by {@code readTikaConfig()}.
 *
 * @throws TikaException if reading the configuration fails with it.
 * @throws IOException if reading the configuration fails with it.
 * @throws SAXException if reading the configuration fails with it.
 */
public Tika() throws TikaException, IOException, SAXException
{
    this.autoDetectParser = new AutoDetectParser(readTikaConfig());
}
|
||||
|
||||
public static TikaConfig readTikaConfig(Logger logger)
|
||||
{
|
||||
try
|
||||
{
|
||||
return readTikaConfig();
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
logger.error("Failed to read tika-config.xml", e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static TikaConfig readTikaConfig() throws TikaException, IOException, SAXException
|
||||
{
|
||||
ClassLoader classLoader = Tika.class.getClassLoader();
|
||||
URL tikaConfigXml = classLoader.getResource("tika-config.xml");
|
||||
return new TikaConfig(tikaConfigXml);
|
||||
}
|
||||
|
||||
// Extracts parameters form args
|
||||
public void transform(Parser parser, DocumentSelector documentSelector, String[] args)
|
||||
{
|
||||
String transform = null;
|
||||
String targetMimetype = null;
|
||||
String targetEncoding = null;
|
||||
String sourceFilename = null;
|
||||
String targetFilename = null;
|
||||
Boolean includeContents = null;
|
||||
Boolean notExtractBookmarksText = null;
|
||||
|
||||
for (String arg : args)
|
||||
{
|
||||
if (arg.startsWith("--"))
|
||||
{
|
||||
if (INCLUDE_CONTENTS.startsWith(arg))
|
||||
{
|
||||
getValue(arg, false, includeContents, INCLUDE_CONTENTS);
|
||||
includeContents = true;
|
||||
}
|
||||
else if (arg.startsWith(TARGET_ENCODING))
|
||||
{
|
||||
targetEncoding = getValue(arg, true, targetEncoding, TARGET_ENCODING);
|
||||
}
|
||||
else if (arg.startsWith(TARGET_MIMETYPE))
|
||||
{
|
||||
targetMimetype = getValue(arg, true, targetMimetype, TARGET_MIMETYPE);
|
||||
}
|
||||
else if (arg.startsWith(NOT_EXTRACT_BOOKMARKS_TEXT))
|
||||
{
|
||||
getValue(arg, false, notExtractBookmarksText, NOT_EXTRACT_BOOKMARKS_TEXT);
|
||||
notExtractBookmarksText = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new IllegalArgumentException("Unexpected argument " + arg);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (transform == null)
|
||||
{
|
||||
transform = arg;
|
||||
}
|
||||
else if (sourceFilename == null)
|
||||
{
|
||||
sourceFilename = arg;
|
||||
}
|
||||
else if (targetFilename == null)
|
||||
{
|
||||
targetFilename = arg;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new IllegalArgumentException("Unexpected argument " + arg);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (targetFilename == null)
|
||||
{
|
||||
throw new IllegalArgumentException("Missing arguments");
|
||||
}
|
||||
includeContents = includeContents == null ? false : includeContents;
|
||||
notExtractBookmarksText = notExtractBookmarksText == null ? false : notExtractBookmarksText;
|
||||
|
||||
transform(parser, documentSelector, includeContents, notExtractBookmarksText, sourceFilename,
|
||||
targetFilename, targetMimetype, targetEncoding);
|
||||
}
|
||||
|
||||
private String getValue(String arg, boolean valueExpected, Object value, String optionName)
|
||||
{
|
||||
if (value != null)
|
||||
{
|
||||
throw new IllegalArgumentException("Duplicate " + optionName);
|
||||
}
|
||||
String stringValue = arg.substring(optionName.length()).trim();
|
||||
if (!valueExpected && stringValue.length() > 0)
|
||||
{
|
||||
throw new IllegalArgumentException("Unexpected value with " + optionName);
|
||||
}
|
||||
if (valueExpected && stringValue.length() == 0)
|
||||
{
|
||||
throw new IllegalArgumentException("Expected value with " + optionName);
|
||||
}
|
||||
return stringValue;
|
||||
}
|
||||
|
||||
private void transform(Parser parser, DocumentSelector documentSelector,
|
||||
Boolean includeContents,
|
||||
Boolean notExtractBookmarksText,
|
||||
String sourceFilename,
|
||||
String targetFilename, String targetMimetype, String targetEncoding)
|
||||
{
|
||||
|
||||
try (InputStream is = new BufferedInputStream(new FileInputStream(sourceFilename));
|
||||
OutputStream os = new FileOutputStream(targetFilename);
|
||||
Writer ow = new BufferedWriter(new OutputStreamWriter(os, targetEncoding)))
|
||||
{
|
||||
Metadata metadata = new Metadata();
|
||||
ParseContext context = buildParseContext(documentSelector, includeContents,
|
||||
notExtractBookmarksText);
|
||||
ContentHandler handler = getContentHandler(targetMimetype, ow);
|
||||
|
||||
parser.parse(is, handler, metadata, context);
|
||||
}
|
||||
catch (SAXException | TikaException | IOException e)
|
||||
{
|
||||
throw new IllegalStateException(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
private ContentHandler getContentHandler(String targetMimetype, Writer output)
|
||||
{
|
||||
try
|
||||
{
|
||||
ContentHandler handler;
|
||||
if (MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
|
||||
{
|
||||
handler = new BodyContentHandler(output);
|
||||
}
|
||||
else
|
||||
{
|
||||
SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
|
||||
TransformerHandler transformerHandler;
|
||||
transformerHandler = factory.newTransformerHandler();
|
||||
transformerHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
|
||||
transformerHandler.setResult(new StreamResult(output));
|
||||
handler = transformerHandler;
|
||||
|
||||
if (MIMETYPE_HTML.equals(targetMimetype))
|
||||
{
|
||||
transformerHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, HTML);
|
||||
return new ExpandedTitleContentHandler(transformerHandler);
|
||||
}
|
||||
else if (MIMETYPE_XHTML.equals(targetMimetype) ||
|
||||
MIMETYPE_XML.equals(targetMimetype))
|
||||
{
|
||||
transformerHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, XML);
|
||||
}
|
||||
else if (MIMETYPE_TEXT_CSV.equals(targetMimetype))
|
||||
{
|
||||
handler = new CsvContentHandler(output);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new IllegalArgumentException("Invalid target mimetype " + targetMimetype);
|
||||
}
|
||||
}
|
||||
return handler;
|
||||
}
|
||||
catch (TransformerConfigurationException e)
|
||||
{
|
||||
throw new IllegalStateException(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A wrapper around the normal Tika BodyContentHandler for CSV rather encoding than tab separated.
|
||||
*/
|
||||
protected static class CsvContentHandler extends BodyContentHandler
|
||||
{
|
||||
private static final char[] comma = new char[]{','};
|
||||
private static final Pattern all_nums = Pattern.compile("[\\d\\.\\-\\+]+");
|
||||
|
||||
private boolean inCell = false;
|
||||
private boolean needsComma = false;
|
||||
|
||||
protected CsvContentHandler(Writer output)
|
||||
{
|
||||
super(output);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void ignorableWhitespace(char[] ch, int start, int length)
|
||||
throws SAXException
|
||||
{
|
||||
if (length == 1 && ch[0] == '\t')
|
||||
{
|
||||
// Ignore tabs, as they mess up the CSV output
|
||||
}
|
||||
else
|
||||
{
|
||||
super.ignorableWhitespace(ch, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void characters(char[] ch, int start, int length)
|
||||
throws SAXException
|
||||
{
|
||||
if (inCell)
|
||||
{
|
||||
StringBuffer t = new StringBuffer(new String(ch, start, length));
|
||||
|
||||
// Quote if not all numbers
|
||||
if (all_nums.matcher(t).matches())
|
||||
{
|
||||
super.characters(ch, start, length);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = t.length() - 1; i >= 0; i--)
|
||||
{
|
||||
if (t.charAt(i) == '\"')
|
||||
{
|
||||
// Double up double quotes
|
||||
t.insert(i, '\"');
|
||||
i--;
|
||||
}
|
||||
}
|
||||
t.insert(0, '\"');
|
||||
t.append('\"');
|
||||
char[] c = t.toString().toCharArray();
|
||||
super.characters(c, 0, c.length);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
super.characters(ch, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startElement(String uri, String localName, String name,
|
||||
Attributes atts) throws SAXException
|
||||
{
|
||||
if (localName.equals("td"))
|
||||
{
|
||||
inCell = true;
|
||||
if (needsComma)
|
||||
{
|
||||
super.characters(comma, 0, 1);
|
||||
needsComma = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
super.startElement(uri, localName, name, atts);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void endElement(String uri, String localName, String name)
|
||||
throws SAXException
|
||||
{
|
||||
if (localName.equals("td"))
|
||||
{
|
||||
needsComma = true;
|
||||
inCell = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (localName.equals("tr"))
|
||||
{
|
||||
needsComma = false;
|
||||
}
|
||||
super.endElement(uri, localName, name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private ParseContext buildParseContext(DocumentSelector documentSelector,
|
||||
Boolean includeContents, Boolean notExtractBookmarksText)
|
||||
{
|
||||
ParseContext context = new ParseContext();
|
||||
|
||||
if (documentSelector != null)
|
||||
{
|
||||
context.set(DocumentSelector.class, documentSelector);
|
||||
}
|
||||
|
||||
if (notExtractBookmarksText.equals(true))
|
||||
{
|
||||
pdfParserConfig.setExtractBookmarksText(false);
|
||||
// pdfParserConfig is set to override default settings
|
||||
context.set(PDFParserConfig.class, pdfParserConfig);
|
||||
}
|
||||
|
||||
// If Archive transform
|
||||
if (includeContents != null)
|
||||
{
|
||||
context.set(Parser.class, includeContents ? autoDetectParser : new EmptyParser());
|
||||
}
|
||||
|
||||
return context;
|
||||
}
|
||||
}
|
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
public class TikaAutoTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return tika.autoDetectParser;
|
||||
}
|
||||
}
|
@@ -1,876 +0,0 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.executors;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.EmptyParser;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||
import org.apache.tika.parser.pdf.PDFParser;
|
||||
import org.apache.tika.parser.pdf.PDFParserConfig;
|
||||
import org.apache.tika.parser.pkg.PackageParser;
|
||||
import org.apache.tika.sax.BodyContentHandler;
|
||||
import org.apache.tika.sax.ExpandedTitleContentHandler;
|
||||
import org.slf4j.Logger;
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.TransformerConfigurationException;
|
||||
import javax.xml.transform.sax.SAXTransformerFactory;
|
||||
import javax.xml.transform.sax.TransformerHandler;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Writer;
|
||||
import java.net.URL;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_HTML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_JPEG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_PNG;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_IMAGE_TIFF;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_CSV;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XHTML;
|
||||
import static org.alfresco.transform.common.Mimetype.MIMETYPE_XML;
|
||||
|
||||
/**
|
||||
* Stripped down command line Tika transformers. Not actually run as a separate process, but the code fits the pattern
|
||||
* used by transformers that do.
|
||||
* <pre>
|
||||
*
|
||||
* Archive 0 ms
|
||||
* 1) cpio html [100] unlimited
|
||||
* 2) cpio txt [50] unlimited
|
||||
* 3) cpio xhtml [100] unlimited
|
||||
* 4) cpio xml [100] unlimited
|
||||
* 5) jar html [100] unlimited
|
||||
* 6) jar txt [50] unlimited
|
||||
* 7) jar xhtml [100] unlimited
|
||||
* 8) jar xml [100] unlimited
|
||||
* 9) tar html [100] unlimited
|
||||
* 10) tar txt [50] unlimited
|
||||
* 11) tar xhtml [100] unlimited
|
||||
* 12) tar xml [100] unlimited
|
||||
* 13) zip html [100] unlimited
|
||||
* 14) zip txt [50] unlimited
|
||||
* 15) zip xhtml [100] unlimited
|
||||
* 16) zip xml [100] unlimited
|
||||
* PdfBox 0 ms
|
||||
* 1) pdf html [110] unlimited
|
||||
* 2) pdf txt [50] 25 MB
|
||||
* 3) pdf xhtml [110] unlimited
|
||||
* 4) pdf xml [110] unlimited
|
||||
* OutlookMsg 0 ms
|
||||
* 1) msg html [125] unlimited
|
||||
* 2) msg txt [125] unlimited
|
||||
* 3) msg xhtml [125] unlimited
|
||||
* 4) msg xml [125] unlimited
|
||||
* PdfBox 0 ms
|
||||
* 1) pdf html [110] unlimited
|
||||
* 2) pdf txt [50] 25 MB
|
||||
* 3) pdf xhtml [110] unlimited
|
||||
* 4) pdf xml [110] unlimited
|
||||
* Office 0 ms
|
||||
* 1) doc html [130] unlimited
|
||||
* 2) doc txt [130] unlimited
|
||||
* 3) doc xhtml [130] unlimited
|
||||
* 4) doc xml [130] unlimited
|
||||
* 5) mpp html [130] unlimited
|
||||
* 6) mpp txt [130] unlimited
|
||||
* 7) mpp xhtml [130] unlimited
|
||||
* 8) mpp xml [130] unlimited
|
||||
* 9) msg html [130] unlimited
|
||||
* 10) msg txt [130] unlimited
|
||||
* 11) msg xhtml [130] unlimited
|
||||
* 12) msg xml [130] unlimited
|
||||
* 13) ppt html [130] unlimited
|
||||
* 14) ppt txt [130] unlimited
|
||||
* 15) ppt xhtml [130] unlimited
|
||||
* 16) ppt xml [130] unlimited
|
||||
* 17) vsd html [130] unlimited
|
||||
* 18) vsd txt [130] unlimited
|
||||
* 19) vsd xhtml [130] unlimited
|
||||
* 20) vsd xml [130] unlimited
|
||||
* Poi 0 ms
|
||||
* 1) xls csv [130] unlimited
|
||||
* 2) xls html [130] unlimited
|
||||
* 3) xls txt [130] unlimited
|
||||
* 4) xls xhtml [130] unlimited
|
||||
* 5) xls xml [130] unlimited
|
||||
* 6) xlsx csv [130] unlimited
|
||||
* 7) xlsx html [130] unlimited
|
||||
* 8) xlsx txt [130] unlimited
|
||||
* 9) xlsx xhtml [130] unlimited
|
||||
* 10) xlsx xml [130] unlimited
|
||||
* OOXML 0 ms
|
||||
* 1) docm html [130] unlimited
|
||||
* 2) docm txt [130] unlimited
|
||||
* 3) docm xhtml [130] unlimited
|
||||
* 4) docm xml [130] unlimited
|
||||
* 5) docx html [130] unlimited
|
||||
* 6) docx txt [130] unlimited
|
||||
* 7) docx xhtml [130] unlimited
|
||||
* 8) docx xml [130] unlimited
|
||||
* 9) dotm html [130] unlimited
|
||||
* 10) dotm txt [130] unlimited
|
||||
* 11) dotm xhtml [130] unlimited
|
||||
* 12) dotm xml [130] unlimited
|
||||
* 13) dotx html [130] unlimited
|
||||
* 14) dotx txt [130] unlimited
|
||||
* 15) dotx xhtml [130] unlimited
|
||||
* 16) dotx xml [130] unlimited
|
||||
* 17) potm html [130] unlimited
|
||||
* 18) potm txt [130] unlimited
|
||||
* 19) potm xhtml [130] unlimited
|
||||
* 20) potm xml [130] unlimited
|
||||
* 21) potx html [130] unlimited
|
||||
* 22) potx txt [130] unlimited
|
||||
* 23) potx xhtml [130] unlimited
|
||||
* 24) potx xml [130] unlimited
|
||||
* 25) ppam html [130] unlimited
|
||||
* 26) ppam txt [130] unlimited
|
||||
* 27) ppam xhtml [130] unlimited
|
||||
* 28) ppam xml [130] unlimited
|
||||
* 29) ppsm html [130] unlimited
|
||||
* 30) ppsm txt [130] unlimited
|
||||
* 31) ppsm xhtml [130] unlimited
|
||||
* 32) ppsm xml [130] unlimited
|
||||
* 33) ppsx html [130] unlimited
|
||||
* 34) ppsx txt [130] unlimited
|
||||
* 35) ppsx xhtml [130] unlimited
|
||||
* 36) ppsx xml [130] unlimited
|
||||
* 37) pptm html [130] unlimited
|
||||
* 38) pptm txt [130] unlimited
|
||||
* 39) pptm xhtml [130] unlimited
|
||||
* 40) pptm xml [130] unlimited
|
||||
* 41) pptx html [130] unlimited
|
||||
* 42) pptx txt [130] unlimited
|
||||
* 43) pptx xhtml [130] unlimited
|
||||
* 44) pptx xml [130] unlimited
|
||||
* 45) sldm html [130] unlimited
|
||||
* 46) sldm txt [130] unlimited
|
||||
* 47) sldm xhtml [130] unlimited
|
||||
* 48) sldm xml [130] unlimited
|
||||
* 49) sldx html [130] unlimited
|
||||
* 50) sldx txt [130] unlimited
|
||||
* 51) sldx xhtml [130] unlimited
|
||||
* 52) sldx xml [130] unlimited
|
||||
* 53) xlam html [130] unlimited
|
||||
* 54) xlam txt [130] unlimited
|
||||
* 55) xlam xhtml [130] unlimited
|
||||
* 56) xlam xml [130] unlimited
|
||||
* 57) xlsb html [130] unlimited
|
||||
* 58) xlsb txt [130] unlimited
|
||||
* 59) xlsb xhtml [130] unlimited
|
||||
* 60) xlsb xml [130] unlimited
|
||||
* 61) xlsm html [130] unlimited
|
||||
* 62) xlsm txt [130] unlimited
|
||||
* 63) xlsm xhtml [130] unlimited
|
||||
* 64) xlsm xml [130] unlimited
|
||||
* 65) xlsx html [130] unlimited
|
||||
* 66) xlsx txt [130] unlimited
|
||||
* 67) xlsx xhtml [130] unlimited
|
||||
* 68) xlsx xml [130] unlimited
|
||||
* 69) xltm html [130] unlimited
|
||||
* 70) xltm txt [130] unlimited
|
||||
* 71) xltm xhtml [130] unlimited
|
||||
* 72) xltm xml [130] unlimited
|
||||
* 73) xltx html [130] unlimited
|
||||
* 74) xltx txt [130] unlimited
|
||||
* 75) xltx xhtml [130] unlimited
|
||||
* 76) xltx xml [130] unlimited
|
||||
* TikaAuto 0 ms
|
||||
* 1) cdf html [120] unlimited
|
||||
* 2) cdf txt [120] unlimited
|
||||
* 3) cdf xhtml [120] unlimited
|
||||
* 4) cdf xml [120] unlimited
|
||||
* 5) cpio html [120] unlimited
|
||||
* 6) cpio txt [120] unlimited
|
||||
* 7) cpio xhtml [120] unlimited
|
||||
* 8) cpio xml [120] unlimited
|
||||
* 9) doc html [120] unlimited
|
||||
* 10) doc txt [120] unlimited
|
||||
* 11) doc xhtml [120] unlimited
|
||||
* 12) doc xml [120] unlimited
|
||||
* 13) docm html [120] unlimited
|
||||
* 14) docm txt [120] unlimited
|
||||
* 15) docm xhtml [120] unlimited
|
||||
* 16) docm xml [120] unlimited
|
||||
* 17) docx html [120] unlimited
|
||||
* 18) docx txt [120] unlimited
|
||||
* 19) docx xhtml [120] unlimited
|
||||
* 20) docx xml [120] unlimited
|
||||
* 21) dotm html [120] unlimited
|
||||
* 22) dotm txt [120] unlimited
|
||||
* 23) dotm xhtml [120] unlimited
|
||||
* 24) dotm xml [120] unlimited
|
||||
* 25) dotx html [120] unlimited
|
||||
* 26) dotx txt [120] unlimited
|
||||
* 27) dotx xhtml [120] unlimited
|
||||
* 28) dotx xml [120] unlimited
|
||||
* 29) gzip html [120] unlimited
|
||||
* 30) gzip txt [120] unlimited
|
||||
* 31) gzip xhtml [120] unlimited
|
||||
* 32) gzip xml [120] unlimited
|
||||
* 33) hdf html [120] unlimited
|
||||
* 34) hdf txt [120] unlimited
|
||||
* 35) hdf xhtml [120] unlimited
|
||||
* 36) hdf xml [120] unlimited
|
||||
* 37) html html [120] unlimited
|
||||
* 38) html txt [120] unlimited
|
||||
* 39) html xhtml [120] unlimited
|
||||
* 40) html xml [120] unlimited
|
||||
* 41) jar html [120] unlimited
|
||||
* 42) jar txt [120] unlimited
|
||||
* 43) jar xhtml [120] unlimited
|
||||
* 44) jar xml [120] unlimited
|
||||
* 45) java html [120] unlimited
|
||||
* 46) java txt [120] unlimited
|
||||
* 47) java xhtml [120] unlimited
|
||||
* 48) java xml [120] unlimited
|
||||
* 49) key html [120] unlimited
|
||||
* 50) key txt [120] unlimited
|
||||
* 51) key xhtml [120] unlimited
|
||||
* 52) key xml [120] unlimited
|
||||
* 53) mpp html [120] unlimited
|
||||
* 54) mpp txt [120] unlimited
|
||||
* 55) mpp xhtml [120] unlimited
|
||||
* 56) mpp xml [120] unlimited
|
||||
* 57) numbers html [120] unlimited
|
||||
* 58) numbers txt [120] unlimited
|
||||
* 59) numbers xhtml [120] unlimited
|
||||
* 60) numbers xml [120] unlimited
|
||||
* 61) odc html [120] unlimited
|
||||
* 62) odc txt [120] unlimited
|
||||
* 63) odc xhtml [120] unlimited
|
||||
* 64) odc xml [120] unlimited
|
||||
* 65) odi html [120] unlimited
|
||||
* 66) odi txt [120] unlimited
|
||||
* 67) odi xhtml [120] unlimited
|
||||
* 68) odi xml [120] unlimited
|
||||
* 69) odm html [120] unlimited
|
||||
* 70) odm txt [120] unlimited
|
||||
* 71) odm xhtml [120] unlimited
|
||||
* 72) odm xml [120] unlimited
|
||||
* 73) odp html [120] unlimited
|
||||
* 74) odp txt [120] unlimited
|
||||
* 75) odp xhtml [120] unlimited
|
||||
* 76) odp xml [120] unlimited
|
||||
* 77) ods html [120] unlimited
|
||||
* 78) ods txt [120] unlimited
|
||||
* 79) ods xhtml [120] unlimited
|
||||
* 80) ods xml [120] unlimited
|
||||
* 81) odt html [120] unlimited
|
||||
* 82) odt txt [120] unlimited
|
||||
* 83) odt xhtml [120] unlimited
|
||||
* 84) odt xml [120] unlimited
|
||||
* 85) ogx html [120] unlimited
|
||||
* 86) ogx txt [120] unlimited
|
||||
* 87) ogx xhtml [120] unlimited
|
||||
* 88) ogx xml [120] unlimited
|
||||
* 89) oth html [120] unlimited
|
||||
* 90) oth txt [120] unlimited
|
||||
* 91) oth xhtml [120] unlimited
|
||||
* 92) oth xml [120] unlimited
|
||||
* 93) otp html [120] unlimited
|
||||
* 94) otp txt [120] unlimited
|
||||
* 95) otp xhtml [120] unlimited
|
||||
* 96) otp xml [120] unlimited
|
||||
* 97) ots html [120] unlimited
|
||||
* 98) ots txt [120] unlimited
|
||||
* 99) ots xhtml [120] unlimited
|
||||
* 100) ots xml [120] unlimited
|
||||
* 101) ott html [120] unlimited
|
||||
* 102) ott txt [120] unlimited
|
||||
* 103) ott xhtml [120] unlimited
|
||||
* 104) ott xml [120] unlimited
|
||||
* 105) pages html [120] unlimited
|
||||
* 106) pages txt [120] unlimited
|
||||
* 107) pages xhtml [120] unlimited
|
||||
* 108) pages xml [120] unlimited
|
||||
* 109) pdf html [120] unlimited
|
||||
* 110) pdf txt [120] 25 MB
|
||||
* 111) pdf xhtml [120] unlimited
|
||||
* 112) pdf xml [120] unlimited
|
||||
* 113) potm html [120] unlimited
|
||||
* 114) potm txt [120] unlimited
|
||||
* 115) potm xhtml [120] unlimited
|
||||
* 116) potm xml [120] unlimited
|
||||
* 117) potx html [120] unlimited
|
||||
* 118) potx txt [120] unlimited
|
||||
* 119) potx xhtml [120] unlimited
|
||||
* 120) potx xml [120] unlimited
|
||||
* 121) ppam html [120] unlimited
|
||||
* 122) ppam txt [120] unlimited
|
||||
* 123) ppam xhtml [120] unlimited
|
||||
* 124) ppam xml [120] unlimited
|
||||
* 125) ppsm html [120] unlimited
|
||||
* 126) ppsm txt [120] unlimited
|
||||
* 127) ppsm xhtml [120] unlimited
|
||||
* 128) ppsm xml [120] unlimited
|
||||
* 129) ppsx html [120] unlimited
|
||||
* 130) ppsx txt [120] unlimited
|
||||
* 131) ppsx xhtml [120] unlimited
|
||||
* 132) ppsx xml [120] unlimited
|
||||
* 133) ppt html [120] unlimited
|
||||
* 134) ppt txt [120] unlimited
|
||||
* 135) ppt xhtml [120] unlimited
|
||||
* 136) ppt xml [120] unlimited
|
||||
* 137) pptm html [120] unlimited
|
||||
* 138) pptm txt [120] unlimited
|
||||
* 139) pptm xhtml [120] unlimited
|
||||
* 140) pptm xml [120] unlimited
|
||||
* 141) pptx html [120] unlimited
|
||||
* 142) pptx txt [120] unlimited
|
||||
* 143) pptx xhtml [120] unlimited
|
||||
* 144) pptx xml [120] unlimited
|
||||
* 145) rar html [120] unlimited
|
||||
* 146) rar txt [120] unlimited
|
||||
* 147) rar xhtml [120] unlimited
|
||||
* 148) rar xml [120] unlimited
|
||||
* 149) rss html [120] unlimited
|
||||
* 150) rss txt [120] unlimited
|
||||
* 151) rss xhtml [120] unlimited
|
||||
* 152) rss xml [120] unlimited
|
||||
* 153) rtf html [120] unlimited
|
||||
* 154) rtf txt [120] unlimited
|
||||
* 155) rtf xhtml [120] unlimited
|
||||
* 156) rtf xml [120] unlimited
|
||||
* 157) sldm html [120] unlimited
|
||||
* 158) sldm txt [120] unlimited
|
||||
* 159) sldm xhtml [120] unlimited
|
||||
* 160) sldm xml [120] unlimited
|
||||
* 161) sldx html [120] unlimited
|
||||
* 162) sldx txt [120] unlimited
|
||||
* 163) sldx xhtml [120] unlimited
|
||||
* 164) sldx xml [120] unlimited
|
||||
* 165) sxw html [120] unlimited
|
||||
* 166) sxw txt [120] unlimited
|
||||
* 167) sxw xhtml [120] unlimited
|
||||
* 168) sxw xml [120] unlimited
|
||||
* 169) txt html [120] unlimited
|
||||
* 170) txt txt [120] unlimited
|
||||
* 171) txt xhtml [120] unlimited
|
||||
* 172) txt xml [120] unlimited
|
||||
* 173) vsd html [120] unlimited
|
||||
* 174) vsd txt [120] unlimited
|
||||
* 175) vsd xhtml [120] unlimited
|
||||
* 176) vsd xml [120] unlimited
|
||||
* 177) xhtml html [120] unlimited
|
||||
* 178) xhtml txt [120] unlimited
|
||||
* 179) xhtml xhtml [120] unlimited
|
||||
* 180) xhtml xml [120] unlimited
|
||||
* 181) xlam html [120] unlimited
|
||||
* 182) xlam txt [120] unlimited
|
||||
* 183) xlam xhtml [120] unlimited
|
||||
* 184) xlam xml [120] unlimited
|
||||
* 185) xls html [120] unlimited
|
||||
* 186) xls txt [120] unlimited
|
||||
* 187) xls xhtml [120] unlimited
|
||||
* 188) xls xml [120] unlimited
|
||||
* 189) xlsb html [120] unlimited
|
||||
* 190) xlsb txt [120] unlimited
|
||||
* 191) xlsb xhtml [120] unlimited
|
||||
* 192) xlsb xml [120] unlimited
|
||||
* 193) xlsm html [120] unlimited
|
||||
* 194) xlsm txt [120] unlimited
|
||||
* 195) xlsm xhtml [120] unlimited
|
||||
* 196) xlsm xml [120] unlimited
|
||||
* 197) xlsx html [120] unlimited
|
||||
* 198) xlsx txt [120] unlimited
|
||||
* 199) xlsx xhtml [120] unlimited
|
||||
* 200) xlsx xml [120] unlimited
|
||||
* 201) xltm html [120] unlimited
|
||||
* 202) xltm txt [120] unlimited
|
||||
* 203) xltm xhtml [120] unlimited
|
||||
* 204) xltm xml [120] unlimited
|
||||
* 205) xltx html [120] unlimited
|
||||
* 206) xltx txt [120] unlimited
|
||||
* 207) xltx xhtml [120] unlimited
|
||||
* 208) xltx xml [120] unlimited
|
||||
* 209) xml html [120] unlimited
|
||||
* 210) xml txt [120] unlimited
|
||||
* 211) xml xhtml [120] unlimited
|
||||
* 212) xml xml [120] unlimited
|
||||
* 213) z html [120] unlimited
|
||||
* 214) z txt [120] unlimited
|
||||
* 215) z xhtml [120] unlimited
|
||||
* 216) z xml [120] unlimited
|
||||
* TextMining 0 ms
|
||||
* 1) doc html [130] unlimited
|
||||
* 2) doc txt [50] unlimited
|
||||
* 3) doc xhtml [130] unlimited
|
||||
* 4) doc xml [130] unlimited
|
||||
* </pre>
|
||||
*/
|
||||
public class Tika
|
||||
{
|
||||
// Names of the transforms. The transform overload that takes a transform name switches on
// these values to choose a parser.
public static final String ARCHIVE = "Archive";
|
||||
public static final String OUTLOOK_MSG = "OutlookMsg";
|
||||
public static final String PDF_BOX = "PdfBox";
|
||||
public static final String POI_OFFICE = "Office";
|
||||
public static final String POI = "Poi";
|
||||
public static final String POI_OO_XML = "OOXML";
|
||||
public static final String TIKA_AUTO = "TikaAuto";
|
||||
public static final String TEXT_MINING = "TextMining";
|
||||
|
||||
// Immutable list of every transform name declared above.
public static final List<String> TRANSFORM_NAMES = ImmutableList.of(
|
||||
ARCHIVE, OUTLOOK_MSG, PDF_BOX, POI_OFFICE, POI, POI_OO_XML, TIKA_AUTO, TEXT_MINING);
|
||||
|
||||
// Option names recognised by transform(String[]). The two ending in '=' are followed by a
// value; the other two are flags and must not have one.
public static final String TARGET_MIMETYPE = "--targetMimetype=";
|
||||
public static final String TARGET_ENCODING = "--targetEncoding=";
|
||||
public static final String INCLUDE_CONTENTS = "--includeContents";
|
||||
public static final String NOT_EXTRACT_BOOKMARKS_TEXT = "--notExtractBookmarksText";
|
||||
|
||||
// Short type names. HTML and XML are also used in getContentHandler as the value of the
// transformer's output method.
// NOTE(review): XSLX / "xslx" looks like a transposition of "xlsx" - confirm against callers
// before changing, as the value is part of the public API.
public static final String CSV = "csv";
|
||||
public static final String DOC = "doc";
|
||||
public static final String DOCX = "docx";
|
||||
public static final String HTML = "html";
|
||||
public static final String MSG = "msg";
|
||||
public static final String PDF = "pdf";
|
||||
public static final String PPTX = "pptx";
|
||||
public static final String TXT = "txt";
|
||||
public static final String XHTML = "xhtml";
|
||||
public static final String XSLX = "xslx";
|
||||
public static final String XML = "xml";
|
||||
public static final String ZIP = "zip";
|
||||
|
||||
// Parsers selected by transform name. All are created eagerly except autoDetectParser, which
// the constructor builds from tika-config.xml.
private final Parser packageParser = new PackageParser();
|
||||
private final Parser pdfParser = new PDFParser();
|
||||
private final Parser officeParser = new OfficeParser();
|
||||
private final Parser autoDetectParser;
|
||||
private final Parser ooXmlParser = new OOXMLParser();
|
||||
private final Parser tikaOfficeDetectParser = new TikaOfficeDetectParser();
|
||||
// Single PDF parser configuration held by this instance; buildParseContext modifies it and
// puts it into the parse context when bookmark text is not to be extracted.
private final PDFParserConfig pdfParserConfig = new PDFParserConfig();
|
||||
|
||||
public static final DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector()
|
||||
{
|
||||
private final List<String> disabledMediaTypes = ImmutableList.of(MIMETYPE_IMAGE_JPEG,
|
||||
MIMETYPE_IMAGE_TIFF, MIMETYPE_IMAGE_PNG);
|
||||
|
||||
@Override
|
||||
public boolean select(Metadata metadata)
|
||||
{
|
||||
String contentType = metadata.get(Metadata.CONTENT_TYPE);
|
||||
if (contentType == null || contentType.equals("") || disabledMediaTypes == null)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return !disabledMediaTypes.contains(contentType);
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Creates the instance, building the auto-detect parser from the {@code tika-config.xml}
 * resource.
 *
 * @throws TikaException if reading the configuration fails.
 * @throws IOException if reading the configuration fails.
 * @throws SAXException if reading the configuration fails.
 */
public Tika() throws TikaException, IOException, SAXException
{
    autoDetectParser = new AutoDetectParser(readTikaConfig());
}
|
||||
|
||||
public static TikaConfig readTikaConfig(Logger logger)
|
||||
{
|
||||
try
|
||||
{
|
||||
return readTikaConfig();
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
logger.error("Failed to read tika-config.xml", e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static TikaConfig readTikaConfig() throws TikaException, IOException, SAXException
|
||||
{
|
||||
ClassLoader classLoader = Tika.class.getClassLoader();
|
||||
URL tikaConfigXml = classLoader.getResource("tika-config.xml");
|
||||
return new TikaConfig(tikaConfigXml);
|
||||
}
|
||||
|
||||
// Method included for developer testing
|
||||
public static void main(String[] args)
|
||||
{
|
||||
long start = System.currentTimeMillis();
|
||||
try
|
||||
{
|
||||
new Tika().transform(args);
|
||||
}
|
||||
catch (IllegalArgumentException e)
|
||||
{
|
||||
System.err.println("ERROR " + e.getMessage());
|
||||
System.exit(-1);
|
||||
}
|
||||
catch (IllegalStateException | TikaException | IOException | SAXException e)
|
||||
{
|
||||
System.err.println("ERROR " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
System.exit(-2);
|
||||
}
|
||||
System.out.println("Finished in " + (System.currentTimeMillis() - start) + "ms");
|
||||
}
|
||||
|
||||
// Extracts parameters form args
|
||||
public void transform(String[] args)
|
||||
{
|
||||
String transform = null;
|
||||
String targetMimetype = null;
|
||||
String targetEncoding = null;
|
||||
String sourceFilename = null;
|
||||
String targetFilename = null;
|
||||
Boolean includeContents = null;
|
||||
Boolean notExtractBookmarksText = null;
|
||||
|
||||
for (String arg : args)
|
||||
{
|
||||
if (arg.startsWith("--"))
|
||||
{
|
||||
if (INCLUDE_CONTENTS.startsWith(arg))
|
||||
{
|
||||
getValue(arg, false, includeContents, INCLUDE_CONTENTS);
|
||||
includeContents = true;
|
||||
}
|
||||
else if (arg.startsWith(TARGET_ENCODING))
|
||||
{
|
||||
targetEncoding = getValue(arg, true, targetEncoding, TARGET_ENCODING);
|
||||
}
|
||||
else if (arg.startsWith(TARGET_MIMETYPE))
|
||||
{
|
||||
targetMimetype = getValue(arg, true, targetMimetype, TARGET_MIMETYPE);
|
||||
}
|
||||
else if (arg.startsWith(NOT_EXTRACT_BOOKMARKS_TEXT))
|
||||
{
|
||||
getValue(arg, false, notExtractBookmarksText, NOT_EXTRACT_BOOKMARKS_TEXT);
|
||||
notExtractBookmarksText = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new IllegalArgumentException("Unexpected argument " + arg);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (transform == null)
|
||||
{
|
||||
transform = arg;
|
||||
}
|
||||
else if (sourceFilename == null)
|
||||
{
|
||||
sourceFilename = arg;
|
||||
}
|
||||
else if (targetFilename == null)
|
||||
{
|
||||
targetFilename = arg;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new IllegalArgumentException("Unexpected argument " + arg);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (targetFilename == null)
|
||||
{
|
||||
throw new IllegalArgumentException("Missing arguments");
|
||||
}
|
||||
includeContents = includeContents == null ? false : includeContents;
|
||||
notExtractBookmarksText = notExtractBookmarksText == null ? false : notExtractBookmarksText;
|
||||
|
||||
transform(transform, includeContents, notExtractBookmarksText, sourceFilename,
|
||||
targetFilename, targetMimetype, targetEncoding);
|
||||
}
|
||||
|
||||
private String getValue(String arg, boolean valueExpected, Object value, String optionName)
|
||||
{
|
||||
if (value != null)
|
||||
{
|
||||
throw new IllegalArgumentException("Duplicate " + optionName);
|
||||
}
|
||||
String stringValue = arg.substring(optionName.length()).trim();
|
||||
if (!valueExpected && stringValue.length() > 0)
|
||||
{
|
||||
throw new IllegalArgumentException("Unexpected value with " + optionName);
|
||||
}
|
||||
if (valueExpected && stringValue.length() == 0)
|
||||
{
|
||||
throw new IllegalArgumentException("Expected value with " + optionName);
|
||||
}
|
||||
return stringValue;
|
||||
}
|
||||
|
||||
// Adds transform specific values such as parser and documentSelector.
|
||||
private void transform(String transform, Boolean includeContents,
|
||||
Boolean notExtractBookmarksText,
|
||||
String sourceFilename,
|
||||
String targetFilename, String targetMimetype, String targetEncoding)
|
||||
{
|
||||
Parser parser = null;
|
||||
DocumentSelector documentSelector = null;
|
||||
|
||||
switch (transform)
|
||||
{
|
||||
case ARCHIVE:
|
||||
parser = packageParser;
|
||||
break;
|
||||
case OUTLOOK_MSG:
|
||||
case POI_OFFICE:
|
||||
case TEXT_MINING:
|
||||
parser = officeParser;
|
||||
break;
|
||||
case PDF_BOX:
|
||||
parser = pdfParser;
|
||||
documentSelector = pdfBoxEmbededDocumentSelector;
|
||||
break;
|
||||
case POI:
|
||||
parser = tikaOfficeDetectParser;
|
||||
break;
|
||||
case POI_OO_XML:
|
||||
parser = ooXmlParser;
|
||||
break;
|
||||
case TIKA_AUTO:
|
||||
parser = autoDetectParser;
|
||||
break;
|
||||
}
|
||||
|
||||
transform(parser, documentSelector, includeContents, notExtractBookmarksText,
|
||||
sourceFilename, targetFilename, targetMimetype, targetEncoding);
|
||||
}
|
||||
|
||||
private void transform(Parser parser, DocumentSelector documentSelector,
|
||||
Boolean includeContents,
|
||||
Boolean notExtractBookmarksText,
|
||||
String sourceFilename,
|
||||
String targetFilename, String targetMimetype, String targetEncoding)
|
||||
{
|
||||
|
||||
try (InputStream is = new BufferedInputStream(new FileInputStream(sourceFilename));
|
||||
OutputStream os = new FileOutputStream(targetFilename);
|
||||
Writer ow = new BufferedWriter(new OutputStreamWriter(os, targetEncoding)))
|
||||
{
|
||||
Metadata metadata = new Metadata();
|
||||
ParseContext context = buildParseContext(documentSelector, includeContents,
|
||||
notExtractBookmarksText);
|
||||
ContentHandler handler = getContentHandler(targetMimetype, ow);
|
||||
|
||||
parser.parse(is, handler, metadata, context);
|
||||
}
|
||||
catch (SAXException | TikaException | IOException e)
|
||||
{
|
||||
throw new IllegalStateException(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
private ContentHandler getContentHandler(String targetMimetype, Writer output)
|
||||
{
|
||||
try
|
||||
{
|
||||
ContentHandler handler;
|
||||
if (MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
|
||||
{
|
||||
handler = new BodyContentHandler(output);
|
||||
}
|
||||
else
|
||||
{
|
||||
SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
|
||||
TransformerHandler transformerHandler;
|
||||
transformerHandler = factory.newTransformerHandler();
|
||||
transformerHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
|
||||
transformerHandler.setResult(new StreamResult(output));
|
||||
handler = transformerHandler;
|
||||
|
||||
if (MIMETYPE_HTML.equals(targetMimetype))
|
||||
{
|
||||
transformerHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, HTML);
|
||||
return new ExpandedTitleContentHandler(transformerHandler);
|
||||
}
|
||||
else if (MIMETYPE_XHTML.equals(targetMimetype) ||
|
||||
MIMETYPE_XML.equals(targetMimetype))
|
||||
{
|
||||
transformerHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, XML);
|
||||
}
|
||||
else if (MIMETYPE_TEXT_CSV.equals(targetMimetype))
|
||||
{
|
||||
handler = new CsvContentHandler(output);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new IllegalArgumentException("Invalid target mimetype " + targetMimetype);
|
||||
}
|
||||
}
|
||||
return handler;
|
||||
}
|
||||
catch (TransformerConfigurationException e)
|
||||
{
|
||||
throw new IllegalStateException(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A wrapper around the normal Tika BodyContentHandler for CSV rather encoding than tab separated.
|
||||
*/
|
||||
protected static class CsvContentHandler extends BodyContentHandler
|
||||
{
|
||||
private static final char[] comma = new char[]{','};
|
||||
private static final Pattern all_nums = Pattern.compile("[\\d\\.\\-\\+]+");
|
||||
|
||||
private boolean inCell = false;
|
||||
private boolean needsComma = false;
|
||||
|
||||
protected CsvContentHandler(Writer output)
|
||||
{
|
||||
super(output);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void ignorableWhitespace(char[] ch, int start, int length)
|
||||
throws SAXException
|
||||
{
|
||||
if (length == 1 && ch[0] == '\t')
|
||||
{
|
||||
// Ignore tabs, as they mess up the CSV output
|
||||
}
|
||||
else
|
||||
{
|
||||
super.ignorableWhitespace(ch, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void characters(char[] ch, int start, int length)
|
||||
throws SAXException
|
||||
{
|
||||
if (inCell)
|
||||
{
|
||||
StringBuffer t = new StringBuffer(new String(ch, start, length));
|
||||
|
||||
// Quote if not all numbers
|
||||
if (all_nums.matcher(t).matches())
|
||||
{
|
||||
super.characters(ch, start, length);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = t.length() - 1; i >= 0; i--)
|
||||
{
|
||||
if (t.charAt(i) == '\"')
|
||||
{
|
||||
// Double up double quotes
|
||||
t.insert(i, '\"');
|
||||
i--;
|
||||
}
|
||||
}
|
||||
t.insert(0, '\"');
|
||||
t.append('\"');
|
||||
char[] c = t.toString().toCharArray();
|
||||
super.characters(c, 0, c.length);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
super.characters(ch, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startElement(String uri, String localName, String name,
|
||||
Attributes atts) throws SAXException
|
||||
{
|
||||
if (localName.equals("td"))
|
||||
{
|
||||
inCell = true;
|
||||
if (needsComma)
|
||||
{
|
||||
super.characters(comma, 0, 1);
|
||||
needsComma = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
super.startElement(uri, localName, name, atts);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void endElement(String uri, String localName, String name)
|
||||
throws SAXException
|
||||
{
|
||||
if (localName.equals("td"))
|
||||
{
|
||||
needsComma = true;
|
||||
inCell = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (localName.equals("tr"))
|
||||
{
|
||||
needsComma = false;
|
||||
}
|
||||
super.endElement(uri, localName, name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private ParseContext buildParseContext(DocumentSelector documentSelector,
|
||||
Boolean includeContents, Boolean notExtractBookmarksText)
|
||||
{
|
||||
ParseContext context = new ParseContext();
|
||||
|
||||
if (documentSelector != null)
|
||||
{
|
||||
context.set(DocumentSelector.class, documentSelector);
|
||||
}
|
||||
|
||||
if (notExtractBookmarksText.equals(true))
|
||||
{
|
||||
pdfParserConfig.setExtractBookmarksText(false);
|
||||
// pdfParserConfig is set to override default settings
|
||||
context.set(PDFParserConfig.class, pdfParserConfig);
|
||||
}
|
||||
|
||||
// If Archive transform
|
||||
if (includeContents != null)
|
||||
{
|
||||
context.set(Parser.class, includeContents ? autoDetectParser : new EmptyParser());
|
||||
}
|
||||
|
||||
return context;
|
||||
}
|
||||
}
|
@@ -1,204 +0,0 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.executors;
|
||||
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import org.alfresco.transformer.logging.LogEntry;
|
||||
import org.alfresco.transformer.metadataExtractors.AbstractTikaMetadataExtractor;
|
||||
import org.alfresco.transformer.metadataExtractors.DWGMetadataExtractor;
|
||||
import org.alfresco.transformer.metadataExtractors.MP3MetadataExtractor;
|
||||
import org.alfresco.transformer.metadataExtractors.MailMetadataExtractor;
|
||||
import org.alfresco.transformer.metadataExtractors.OfficeMetadataExtractor;
|
||||
import org.alfresco.transformer.metadataExtractors.OpenDocumentMetadataExtractor;
|
||||
import org.alfresco.transformer.metadataExtractors.PdfBoxMetadataExtractor;
|
||||
import org.alfresco.transformer.metadataExtractors.PoiMetadataExtractor;
|
||||
import org.alfresco.transformer.metadataExtractors.TikaAudioMetadataExtractor;
|
||||
import org.alfresco.transformer.metadataExtractors.TikaAutoMetadataExtractor;
|
||||
import org.alfresco.transformer.metadataExtractors.IPTCMetadataExtractor;
|
||||
import org.alfresco.transformer.util.RequestParamMap;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Map;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
import static java.lang.Boolean.parseBoolean;
|
||||
import static org.alfresco.transformer.executors.Tika.INCLUDE_CONTENTS;
|
||||
import static org.alfresco.transformer.executors.Tika.TARGET_ENCODING;
|
||||
import static org.alfresco.transformer.executors.Tika.TARGET_MIMETYPE;
|
||||
|
||||
/**
|
||||
* JavaExecutor implementation for running TIKA transformations. It loads the
|
||||
* transformation logic in the same JVM (check {@link Tika}).
|
||||
*/
|
||||
public class TikaJavaExecutor implements JavaExecutor
|
||||
{
|
||||
private boolean notExtractBookmarksTextDefault;
|
||||
|
||||
private static final String ID = "tika";
|
||||
|
||||
public static final String LICENCE =
|
||||
"This transformer uses Tika from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0. or in /Apache\\ 2.0.txt\n" +
|
||||
"This transformer uses ExifTool by Phil Harvey. See license at https://exiftool.org/#license. or in /Perl-Artistic-License.txt";
|
||||
|
||||
private final Tika tika;
|
||||
private final Map<String, AbstractTikaMetadataExtractor> metadataExtractor = ImmutableMap
|
||||
.<String, AbstractTikaMetadataExtractor>builder()
|
||||
.put("DWGMetadataExtractor", new DWGMetadataExtractor())
|
||||
.put("MailMetadataExtractor", new MailMetadataExtractor())
|
||||
.put("MP3MetadataExtractor", new MP3MetadataExtractor())
|
||||
.put("OfficeMetadataExtractor", new OfficeMetadataExtractor())
|
||||
.put("OpenDocumentMetadataExtractor", new OpenDocumentMetadataExtractor())
|
||||
.put("PdfBoxMetadataExtractor", new PdfBoxMetadataExtractor())
|
||||
.put("PoiMetadataExtractor", new PoiMetadataExtractor())
|
||||
.put("TikaAudioMetadataExtractor", new TikaAudioMetadataExtractor())
|
||||
.put("TikaAutoMetadataExtractor", new TikaAutoMetadataExtractor())
|
||||
.put("IPTCMetadataExtractor", new IPTCMetadataExtractor())
|
||||
.build();
|
||||
private final Map<String, AbstractTikaMetadataExtractor> metadataEmbedder = ImmutableMap
|
||||
.<String, AbstractTikaMetadataExtractor>builder()
|
||||
.put("SamplePoiMetadataEmbedder", new PoiMetadataExtractor())
|
||||
.build();
|
||||
|
||||
public TikaJavaExecutor(boolean notExtractBookmarksTextDefault)
|
||||
{
|
||||
this.notExtractBookmarksTextDefault = notExtractBookmarksTextDefault;
|
||||
try
|
||||
{
|
||||
tika = new Tika();
|
||||
}
|
||||
catch (SAXException | IOException | TikaException e)
|
||||
{
|
||||
throw new RuntimeException("Unable to instantiate Tika: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
public TikaJavaExecutor()
|
||||
{
|
||||
this(false);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getTransformerId()
|
||||
{
|
||||
return ID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transform(String transformName, String sourceMimetype, String targetMimetype,
|
||||
Map<String, String> transformOptions, File sourceFile, File targetFile)
|
||||
throws Exception
|
||||
{
|
||||
final boolean includeContents = parseBoolean(
|
||||
transformOptions.getOrDefault(RequestParamMap.INCLUDE_CONTENTS, "false"));
|
||||
final boolean notExtractBookmarksText = parseBoolean(
|
||||
transformOptions.getOrDefault(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT, String.valueOf(notExtractBookmarksTextDefault)));
|
||||
final String targetEncoding = transformOptions.getOrDefault("targetEncoding", "UTF-8");
|
||||
if(transformOptions.get(RequestParamMap.NOT_EXTRACT_BOOKMARKS_TEXT)==null && notExtractBookmarksTextDefault)
|
||||
{
|
||||
LoggerFactory.getLogger(TikaJavaExecutor.class).trace(
|
||||
"notExtractBookmarksText default value has been overridden to {}",
|
||||
notExtractBookmarksTextDefault);
|
||||
}
|
||||
call(sourceFile, targetFile, transformName,
|
||||
includeContents ? INCLUDE_CONTENTS : null,
|
||||
notExtractBookmarksText ? Tika.NOT_EXTRACT_BOOKMARKS_TEXT : null,
|
||||
TARGET_MIMETYPE + targetMimetype, TARGET_ENCODING + targetEncoding);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void call(File sourceFile, File targetFile, String... args)
|
||||
{
|
||||
args = buildArgs(sourceFile, targetFile, args);
|
||||
tika.transform(args);
|
||||
}
|
||||
|
||||
private static String[] buildArgs(File sourceFile, File targetFile, String[] args)
|
||||
{
|
||||
ArrayList<String> methodArgs = new ArrayList<>(args.length + 2);
|
||||
StringJoiner sj = new StringJoiner(" ");
|
||||
for (String arg : args)
|
||||
{
|
||||
addArg(methodArgs, sj, arg);
|
||||
}
|
||||
|
||||
addFileArg(methodArgs, sj, sourceFile);
|
||||
addFileArg(methodArgs, sj, targetFile);
|
||||
|
||||
LogEntry.setOptions(sj.toString());
|
||||
|
||||
return methodArgs.toArray(new String[0]);
|
||||
}
|
||||
|
||||
private static void addArg(ArrayList<String> methodArgs, StringJoiner sj, String arg)
|
||||
{
|
||||
if (arg != null)
|
||||
{
|
||||
sj.add(arg);
|
||||
methodArgs.add(arg);
|
||||
}
|
||||
}
|
||||
|
||||
private static void addFileArg(ArrayList<String> methodArgs, StringJoiner sj, File arg)
|
||||
{
|
||||
if (arg != null)
|
||||
{
|
||||
String path = arg.getAbsolutePath();
|
||||
int i = path.lastIndexOf('.');
|
||||
String ext = i == -1 ? "???" : path.substring(i + 1);
|
||||
sj.add(ext);
|
||||
methodArgs.add(path);
|
||||
}
|
||||
}
|
||||
|
||||
public void extractMetadata(String transformName, String sourceMimetype, String targetMimetype,
|
||||
Map<String, String> transformOptions, File sourceFile, File targetFile)
|
||||
throws Exception
|
||||
{
|
||||
AbstractTikaMetadataExtractor metadataExtractor = this.metadataExtractor.get(transformName);
|
||||
metadataExtractor.extractMetadata(sourceMimetype, transformOptions, sourceFile, targetFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated The content repository's TikaPoweredMetadataExtracter provides no non test implementations.
|
||||
* This code exists in case there are custom implementations, that need to be converted to T-Engines.
|
||||
* It is simply a copy and paste from the content repository and has received limited testing.
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("deprecation" )
|
||||
public void embedMetadata(String transformName, String sourceMimetype, String targetMimetype,
|
||||
Map<String, String> transformOptions, File sourceFile, File targetFile)
|
||||
throws Exception
|
||||
{
|
||||
AbstractTikaMetadataExtractor metadataExtractor = this.metadataEmbedder.get(transformName);
|
||||
metadataExtractor.embedMetadata(sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile);
|
||||
}
|
||||
}
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -24,14 +24,14 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.metadataExtractors;
|
||||
package org.alfresco.transform.tika.metadataExtractors;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class IPTCMetadataExtractorTest {
|
||||
|
||||
public class IPTCMetadataExtractorTest
|
||||
{
|
||||
IPTCMetadataExtractor extractor = new IPTCMetadataExtractor();
|
||||
|
||||
@Test
|
||||
@@ -42,7 +42,5 @@ public class IPTCMetadataExtractorTest {
|
||||
"1901-02-01T00:00:00.000Z", "1901-02-01T00:00+00:00", "1901-02-01T00:00+00:00" };
|
||||
|
||||
assertArrayEquals(expected, extractor.iptcToIso8601DateStrings(testStrings));
|
||||
|
||||
}
|
||||
|
||||
}
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -24,7 +24,7 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.tika.parsers;
|
||||
package org.alfresco.transform.tika.parsers;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2021 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2022 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -24,11 +24,11 @@
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.executors;
|
||||
package org.alfresco.transform.tika.transformers;
|
||||
|
||||
import static org.alfresco.transformer.executors.Tika.NOT_EXTRACT_BOOKMARKS_TEXT;
|
||||
import static org.alfresco.transformer.executors.Tika.TARGET_ENCODING;
|
||||
import static org.alfresco.transformer.executors.Tika.TARGET_MIMETYPE;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.NOT_EXTRACT_BOOKMARKS_TEXT;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.TARGET_ENCODING;
|
||||
import static org.alfresco.transform.tika.transformers.Tika.TARGET_MIMETYPE;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.Mockito.clearInvocations;
|
||||
import static org.mockito.Mockito.lenient;
|
||||
@@ -41,19 +41,33 @@ import java.io.File;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
public class TikaJavaExecutorTest {
|
||||
public class GenericTikaTransformerTest
|
||||
{
|
||||
private class TikaTestTransformer extends GenericTikaTransformer
|
||||
{
|
||||
@Override
|
||||
protected Parser getParser()
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
TikaTestTransformer(boolean notExtractBookmarksTextDefault)
|
||||
{
|
||||
this.notExtractBookmarksTextDefault = notExtractBookmarksTextDefault;
|
||||
}
|
||||
};
|
||||
|
||||
@Test
|
||||
public void testNotExtractBookmarkTextDefault() throws Exception
|
||||
{
|
||||
TikaJavaExecutor executorSpyDefaultTrue = spy(new TikaJavaExecutor(true));
|
||||
TikaJavaExecutor executorSpyDefaultFalse = spy(new TikaJavaExecutor(false));
|
||||
GenericTikaTransformer executorSpyDefaultTrue = spy(new TikaTestTransformer(true));
|
||||
GenericTikaTransformer executorSpyDefaultFalse = spy(new TikaTestTransformer(false));
|
||||
|
||||
File mockSourceFile = mock(File.class);
|
||||
File mockTargetFile = mock(File.class);
|
||||
@@ -66,7 +80,7 @@ public class TikaJavaExecutorTest {
|
||||
lenient().doNothing().when(executorSpyDefaultTrue).call(any(), any(), any(), any(), any(), any(), any());
|
||||
lenient().doNothing().when(executorSpyDefaultFalse).call(any(), any(), any(), any(), any(), any(), any());
|
||||
|
||||
Map<String, String> transformOptions = new HashMap<String,String>();
|
||||
Map<String, String> transformOptions = new HashMap<>();
|
||||
|
||||
// use empty transformOptions to test defaults
|
||||
executorSpyDefaultTrue.transform(transformName, sourceMimetype, targetMimetype, transformOptions,
|
Reference in New Issue
Block a user