mirror of
https://github.com/Alfresco/alfresco-transform-core.git
synced 2025-05-26 17:24:47 +00:00
REPO-3425 Transformers: Tika based transformers
This commit is contained in:
parent
c9ced17097
commit
82c5e3e96a
@ -3,7 +3,7 @@
|
||||
# The container is only intended to be used with the Alfresco Enterprise edition which is covered by https://www.alfresco.com/legal/agreements and https://www.alfresco.com/terms-use.
|
||||
# alfresco-pdf-renderer uses the PDFium library from Google Inc. See the license at https://pdfium.googlesource.com/pdfium/+/master/LICENSE or in /pdfium.txt.
|
||||
|
||||
FROM quay.io/alfresco/alfresco-base-java:9
|
||||
FROM alfresco/alfresco-base-java:8
|
||||
|
||||
ENV ALFRESCO_PDF_RENDERER_LIB_RPM_URL=https://nexus.alfresco.com/nexus/service/local/repositories/releases/content/org/alfresco/alfresco-pdf-renderer/1.1/alfresco-pdf-renderer-1.1-linux.tgz
|
||||
ENV PDFIUM_LICENSE_FILE=https://github.com/Alfresco/acs-community-packaging/blob/master/distribution/src/main/resources/licenses/3rd-party/pdfium.txt
|
||||
|
@ -155,16 +155,4 @@ public class AlfrescoPdfRendererController extends AbstractTransformerController
|
||||
|
||||
return createAttachment(targetFilename, targetFile, testDelay);
|
||||
}
|
||||
|
||||
private void executeTransformCommand(String options, File sourceFile, File targetFile, @RequestParam(value = "timeout", required = false) Long timeout)
|
||||
{
|
||||
LogEntry.setOptions(options);
|
||||
|
||||
Map<String, String> properties = new HashMap<String, String>(5);
|
||||
properties.put("options", options);
|
||||
properties.put("source", sourceFile.getAbsolutePath());
|
||||
properties.put("target", targetFile.getAbsolutePath());
|
||||
|
||||
executeTransformCommand(properties, targetFile, timeout);
|
||||
}
|
||||
}
|
||||
|
@ -51,7 +51,7 @@ public class AlfrescoPdfRendererControllerTest extends AbstractTransformerContro
|
||||
@Before
|
||||
public void before() throws IOException
|
||||
{
|
||||
super.mockTransformCommand(controller, "pdf", "png", "application/pdf");
|
||||
super.mockTransformCommand(controller, "pdf", "png", "application/pdf", true);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -3,7 +3,7 @@
|
||||
# The container is only intended to be used with the Alfresco Enterprise edition which is covered by https://www.alfresco.com/legal/agreements and https://www.alfresco.com/terms-use.
|
||||
# ImageMagick is from ImageMagick Studio LLC. See the license at http://www.imagemagick.org/script/license.php or in /ImageMagick-license.txt.
|
||||
|
||||
FROM quay.io/alfresco/alfresco-base-java:9
|
||||
FROM alfresco/alfresco-base-java:8
|
||||
|
||||
ENV IMAGEMAGICK_RPM_URL=https://nexus.alfresco.com/nexus/service/local/repositories/thirdparty/content/org/imagemagick/imagemagick-distribution/7.0.7-27/imagemagick-distribution-7.0.7-27-linux.rpm
|
||||
ENV IMAGEMAGICK_LIB_RPM_URL=https://nexus.alfresco.com/nexus/service/local/repositories/thirdparty/content/org/imagemagick/imagemagick-distribution/7.0.7-27/imagemagick-distribution-7.0.7-27-libs-linux.rpm
|
||||
|
@ -51,7 +51,7 @@ public class ImageMagickControllerTest extends AbstractTransformerControllerTest
|
||||
@Before
|
||||
public void before() throws IOException
|
||||
{
|
||||
super.mockTransformCommand(controller, "jpg", "png", "image/jpg");
|
||||
super.mockTransformCommand(controller, "jpg", "png", "image/jpg", true);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -3,7 +3,7 @@
|
||||
# The container is only intended to be used with the Alfresco Enterprise edition which is covered by https://www.alfresco.com/legal/agreements and https://www.alfresco.com/terms-use.
|
||||
# LibreOffice is from The Document Foundation. See the license at https://www.libreoffice.org/download/license/ or in /libreoffice.txt.
|
||||
|
||||
FROM quay.io/alfresco/alfresco-base-java:9
|
||||
FROM alfresco/alfresco-base-java:8
|
||||
|
||||
ENV LIBREOFFICE_RPM_URL=https://nexus.alfresco.com/nexus/service/local/repositories/thirdparty/content/org/libreoffice/libreoffice-dist/5.4.6/libreoffice-dist-5.4.6-linux.gz
|
||||
ENV LIBREOFFICE_LICENSE_FILE=https://github.com/Alfresco/acs-community-packaging/blob/master/distribution/src/main/resources/licenses/3rd-party/libreoffice.txt
|
||||
|
1
alfresco-docker-tika/.maven-dockerignore
Normal file
1
alfresco-docker-tika/.maven-dockerignore
Normal file
@ -0,0 +1 @@
|
||||
target/docker/
|
20
alfresco-docker-tika/Dockerfile
Normal file
20
alfresco-docker-tika/Dockerfile
Normal file
@ -0,0 +1,20 @@
|
||||
# Image provides a container in which to run Tika transformations for Alfresco Enterprise Content Services.
|
||||
|
||||
# The container is only intended to be used with the Alfresco Enterprise edition which is covered by https://www.alfresco.com/legal/agreements and https://www.alfresco.com/terms-use.
|
||||
# Tika is from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
|
||||
FROM alfresco/alfresco-base-java:8
|
||||
|
||||
ENV APACHE_LICENSE_FILE=https://github.com/Alfresco/acs-community-packaging/blob/master/distribution/src/main/resources/licenses/3rd-party/Apache%202.0.txt
|
||||
|
||||
COPY target/alfresco-docker-tika-${env.project_version}.jar /usr/bin
|
||||
|
||||
RUN ln /usr/bin/alfresco-docker-tika-${env.project_version}.jar /usr/bin/alfresco-docker-tika.jar && \
|
||||
yum install -y wget && \
|
||||
wget $APACHE_LICENSE_FILE && \
|
||||
yum remove -y wget && \
|
||||
yum clean all
|
||||
|
||||
EXPOSE 8090
|
||||
|
||||
ENTRYPOINT java -jar /usr/bin/alfresco-docker-tika.jar
|
7
alfresco-docker-tika/LICENSES.md
Normal file
7
alfresco-docker-tika/LICENSES.md
Normal file
@ -0,0 +1,7 @@
|
||||
### Licenses
|
||||
|
||||
* The code in the alfresco-docker-tika project is only intended to be used with the Alfresco Enterprise
|
||||
Content Repository which is covered by [https://www.alfresco.com/legal/agreements](https://www.alfresco.com/legal/agreements) and [https://www.alfresco.com/terms-use](https://www.alfresco.com/terms-use)
|
||||
* Tika is from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0 or the
|
||||
[Apache 2.0.txt](https://github.com/Alfresco/acs-community-packaging/blob/master/distribution/src/main/resources/licenses/3rd-party/Apache%202.0.txt)
|
||||
file placed in the root directory of the docker image.
|
268
alfresco-docker-tika/pom.xml
Normal file
268
alfresco-docker-tika/pom.xml
Normal file
@ -0,0 +1,268 @@
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<artifactId>alfresco-docker-tika</artifactId>
|
||||
<name>Alfresco Docker Tika</name>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<parent>
|
||||
<groupId>org.alfresco</groupId>
|
||||
<artifactId>alfresco-docker-transformers</artifactId>
|
||||
<version>1.2-SNAPSHOT</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
<properties>
|
||||
<image.name>alfresco/alfresco-tika</image.name>
|
||||
<image.registry>quay.io</image.registry>
|
||||
<dependency.poi.version>3.17</dependency.poi.version>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.alfresco</groupId>
|
||||
<artifactId>alfresco-transformer-base</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.alfresco</groupId>
|
||||
<artifactId>alfresco-transformer-base</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<classifier>tests</classifier>
|
||||
<type>test-jar</type>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-thymeleaf</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-test</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.alfresco</groupId>
|
||||
<artifactId>alfresco-core</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.alfresco</groupId>
|
||||
<artifactId>alfresco-data-model</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
<version>1.7.24</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
<version>1.7.25</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Tika -->
|
||||
<dependency>
|
||||
<groupId>org.apache.tika</groupId>
|
||||
<artifactId>tika-core</artifactId>
|
||||
<version>1.17-20180201-alfresco-patched</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.tika</groupId>
|
||||
<artifactId>tika-parsers</artifactId>
|
||||
<version>1.17-20180201-alfresco-patched</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>com.tdunning</groupId>
|
||||
<artifactId>json</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<!-- Apache POI -->
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi</artifactId>
|
||||
<version>${dependency.poi.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi-ooxml</artifactId>
|
||||
<version>${dependency.poi.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi-scratchpad</artifactId>
|
||||
<version>${dependency.poi.version}</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-maven-plugin</artifactId>
|
||||
<version>${dependency.spring-boot.version}</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>repackage</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>io.fabric8</groupId>
|
||||
<artifactId>fabric8-maven-plugin</artifactId>
|
||||
<version>${dependency.fabric8.version}</version>
|
||||
<configuration>
|
||||
<images>
|
||||
<image>
|
||||
<name>${image.name}:${image.tag}</name>
|
||||
<build>
|
||||
<dockerFileDir>${project.basedir}/</dockerFileDir>
|
||||
</build>
|
||||
</image>
|
||||
</images>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
|
||||
<profiles>
|
||||
<profile>
|
||||
<id>enterpriseDocker</id>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>io.fabric8</groupId>
|
||||
<artifactId>fabric8-maven-plugin</artifactId>
|
||||
<version>${dependency.fabric8.version}</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>build-image</id>
|
||||
<phase>install</phase>
|
||||
<goals>
|
||||
<goal>build</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
|
||||
<profile>
|
||||
<id>internal</id>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>io.fabric8</groupId>
|
||||
<artifactId>fabric8-maven-plugin</artifactId>
|
||||
<version>${dependency.fabric8.version}</version>
|
||||
<configuration>
|
||||
<images>
|
||||
<image>
|
||||
<name>${image.registry}/${image.name}:${image.tag}</name>
|
||||
<build>
|
||||
<dockerFileDir>${project.basedir}/</dockerFileDir>
|
||||
</build>
|
||||
</image>
|
||||
</images>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>build-push-image</id>
|
||||
<phase>install</phase>
|
||||
<goals>
|
||||
<goal>build</goal>
|
||||
<goal>push</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
|
||||
<profile>
|
||||
<id>master</id>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>io.fabric8</groupId>
|
||||
<artifactId>fabric8-maven-plugin</artifactId>
|
||||
<version>${dependency.fabric8.version}</version>
|
||||
<configuration>
|
||||
<images>
|
||||
<image>
|
||||
<name>${image.registry}/${image.name}</name>
|
||||
<build>
|
||||
<dockerFileDir>${project.basedir}/</dockerFileDir>
|
||||
</build>
|
||||
</image>
|
||||
<image>
|
||||
<name>${image.name}</name>
|
||||
<build>
|
||||
<dockerFileDir>${project.basedir}/</dockerFileDir>
|
||||
</build>
|
||||
</image>
|
||||
</images>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>build-push-image</id>
|
||||
<phase>install</phase>
|
||||
<goals>
|
||||
<goal>build</goal>
|
||||
<goal>push</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
|
||||
<profile>
|
||||
<id>release</id>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>io.fabric8</groupId>
|
||||
<artifactId>fabric8-maven-plugin</artifactId>
|
||||
<version>${dependency.fabric8.version}</version>
|
||||
<configuration>
|
||||
<images>
|
||||
<image>
|
||||
<name>${image.name}:${project.version}</name>
|
||||
<registry>${image.registry}</registry>
|
||||
<build>
|
||||
<dockerFileDir>${project.basedir}/</dockerFileDir>
|
||||
</build>
|
||||
</image>
|
||||
<image>
|
||||
<name>${image.name}:${project.version}</name>
|
||||
<build>
|
||||
<dockerFileDir>${project.basedir}/</dockerFileDir>
|
||||
</build>
|
||||
</image>
|
||||
</images>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>build-push-image</id>
|
||||
<phase>deploy</phase>
|
||||
<goals>
|
||||
<goal>build</goal>
|
||||
<goal>push</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
</profiles>
|
||||
</project>
|
@ -0,0 +1,27 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Enterprise Repository
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2018 Alfresco Software Limited
|
||||
* %%
|
||||
* License rights for this program may be obtained from Alfresco Software, Ltd.
|
||||
* pursuant to a written agreement and any use of this program without such an
|
||||
* agreement is prohibited.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer;
|
||||
|
||||
import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
|
||||
|
||||
@SpringBootApplication
|
||||
@EnableAutoConfiguration(exclude={DataSourceAutoConfiguration.class})
|
||||
public class Application
|
||||
{
|
||||
public static void main(String[] args)
|
||||
{
|
||||
SpringApplication.run(Application.class, args);
|
||||
}
|
||||
}
|
@ -0,0 +1,801 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Enterprise Repository
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2018 Alfresco Software Limited
|
||||
* %%
|
||||
* License rights for this program may be obtained from Alfresco Software, Ltd.
|
||||
* pursuant to a written agreement and any use of this program without such an
|
||||
* agreement is prohibited.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer;
|
||||
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.extractor.DocumentSelector;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.EmptyParser;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||
import org.apache.tika.parser.pdf.PDFParser;
|
||||
import org.apache.tika.parser.pkg.PackageParser;
|
||||
import org.apache.tika.sax.BodyContentHandler;
|
||||
import org.apache.tika.sax.ExpandedTitleContentHandler;
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.TransformerConfigurationException;
|
||||
import javax.xml.transform.sax.SAXTransformerFactory;
|
||||
import javax.xml.transform.sax.TransformerHandler;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import java.io.*;
|
||||
import java.net.URL;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static org.alfresco.repo.content.MimetypeMap.*;
|
||||
|
||||
/**
|
||||
* Stripped down command line Tika transformers. Not actually run as a separate process, but the code fits the pattern
|
||||
* used by transformers that do.
|
||||
* <pre>
|
||||
*
|
||||
* Archive 0 ms
|
||||
* 1) cpio html [100] unlimited
|
||||
* 2) cpio txt [50] unlimited
|
||||
* 3) cpio xhtml [100] unlimited
|
||||
* 4) cpio xml [100] unlimited
|
||||
* 5) jar html [100] unlimited
|
||||
* 6) jar txt [50] unlimited
|
||||
* 7) jar xhtml [100] unlimited
|
||||
* 8) jar xml [100] unlimited
|
||||
* 9) tar html [100] unlimited
|
||||
* 10) tar txt [50] unlimited
|
||||
* 11) tar xhtml [100] unlimited
|
||||
* 12) tar xml [100] unlimited
|
||||
* 13) zip html [100] unlimited
|
||||
* 14) zip txt [50] unlimited
|
||||
* 15) zip xhtml [100] unlimited
|
||||
* 16) zip xml [100] unlimited
|
||||
* PdfBox 0 ms
|
||||
* 1) pdf html [110] unlimited
|
||||
* 2) pdf txt [50] 25 MB
|
||||
* 3) pdf xhtml [110] unlimited
|
||||
* 4) pdf xml [110] unlimited
|
||||
* OutlookMsg 0 ms
|
||||
* 1) msg html [125] unlimited
|
||||
* 2) msg txt [125] unlimited
|
||||
* 3) msg xhtml [125] unlimited
|
||||
* 4) msg xml [125] unlimited
|
||||
* PdfBox 0 ms
|
||||
* 1) pdf html [110] unlimited
|
||||
* 2) pdf txt [50] 25 MB
|
||||
* 3) pdf xhtml [110] unlimited
|
||||
* 4) pdf xml [110] unlimited
|
||||
* Office 0 ms
|
||||
* 1) doc html [130] unlimited
|
||||
* 2) doc txt [130] unlimited
|
||||
* 3) doc xhtml [130] unlimited
|
||||
* 4) doc xml [130] unlimited
|
||||
* 5) mpp html [130] unlimited
|
||||
* 6) mpp txt [130] unlimited
|
||||
* 7) mpp xhtml [130] unlimited
|
||||
* 8) mpp xml [130] unlimited
|
||||
* 9) msg html [130] unlimited
|
||||
* 10) msg txt [130] unlimited
|
||||
* 11) msg xhtml [130] unlimited
|
||||
* 12) msg xml [130] unlimited
|
||||
* 13) ppt html [130] unlimited
|
||||
* 14) ppt txt [130] unlimited
|
||||
* 15) ppt xhtml [130] unlimited
|
||||
* 16) ppt xml [130] unlimited
|
||||
* 17) vsd html [130] unlimited
|
||||
* 18) vsd txt [130] unlimited
|
||||
* 19) vsd xhtml [130] unlimited
|
||||
* 20) vsd xml [130] unlimited
|
||||
* Poi 0 ms
|
||||
* 1) xls csv [130] unlimited
|
||||
* 2) xls html [130] unlimited
|
||||
* 3) xls txt [130] unlimited
|
||||
* 4) xls xhtml [130] unlimited
|
||||
* 5) xls xml [130] unlimited
|
||||
* 6) xlsx csv [130] unlimited
|
||||
* 7) xlsx html [130] unlimited
|
||||
* 8) xlsx txt [130] unlimited
|
||||
* 9) xlsx xhtml [130] unlimited
|
||||
* 10) xlsx xml [130] unlimited
|
||||
* OOXML 0 ms
|
||||
* 1) docm html [130] unlimited
|
||||
* 2) docm txt [130] unlimited
|
||||
* 3) docm xhtml [130] unlimited
|
||||
* 4) docm xml [130] unlimited
|
||||
* 5) docx html [130] unlimited
|
||||
* 6) docx txt [130] unlimited
|
||||
* 7) docx xhtml [130] unlimited
|
||||
* 8) docx xml [130] unlimited
|
||||
* 9) dotm html [130] unlimited
|
||||
* 10) dotm txt [130] unlimited
|
||||
* 11) dotm xhtml [130] unlimited
|
||||
* 12) dotm xml [130] unlimited
|
||||
* 13) dotx html [130] unlimited
|
||||
* 14) dotx txt [130] unlimited
|
||||
* 15) dotx xhtml [130] unlimited
|
||||
* 16) dotx xml [130] unlimited
|
||||
* 17) potm html [130] unlimited
|
||||
* 18) potm txt [130] unlimited
|
||||
* 19) potm xhtml [130] unlimited
|
||||
* 20) potm xml [130] unlimited
|
||||
* 21) potx html [130] unlimited
|
||||
* 22) potx txt [130] unlimited
|
||||
* 23) potx xhtml [130] unlimited
|
||||
* 24) potx xml [130] unlimited
|
||||
* 25) ppam html [130] unlimited
|
||||
* 26) ppam txt [130] unlimited
|
||||
* 27) ppam xhtml [130] unlimited
|
||||
* 28) ppam xml [130] unlimited
|
||||
* 29) ppsm html [130] unlimited
|
||||
* 30) ppsm txt [130] unlimited
|
||||
* 31) ppsm xhtml [130] unlimited
|
||||
* 32) ppsm xml [130] unlimited
|
||||
* 33) ppsx html [130] unlimited
|
||||
* 34) ppsx txt [130] unlimited
|
||||
* 35) ppsx xhtml [130] unlimited
|
||||
* 36) ppsx xml [130] unlimited
|
||||
* 37) pptm html [130] unlimited
|
||||
* 38) pptm txt [130] unlimited
|
||||
* 39) pptm xhtml [130] unlimited
|
||||
* 40) pptm xml [130] unlimited
|
||||
* 41) pptx html [130] unlimited
|
||||
* 42) pptx txt [130] unlimited
|
||||
* 43) pptx xhtml [130] unlimited
|
||||
* 44) pptx xml [130] unlimited
|
||||
* 45) sldm html [130] unlimited
|
||||
* 46) sldm txt [130] unlimited
|
||||
* 47) sldm xhtml [130] unlimited
|
||||
* 48) sldm xml [130] unlimited
|
||||
* 49) sldx html [130] unlimited
|
||||
* 50) sldx txt [130] unlimited
|
||||
* 51) sldx xhtml [130] unlimited
|
||||
* 52) sldx xml [130] unlimited
|
||||
* 53) xlam html [130] unlimited
|
||||
* 54) xlam txt [130] unlimited
|
||||
* 55) xlam xhtml [130] unlimited
|
||||
* 56) xlam xml [130] unlimited
|
||||
* 57) xlsb html [130] unlimited
|
||||
* 58) xlsb txt [130] unlimited
|
||||
* 59) xlsb xhtml [130] unlimited
|
||||
* 60) xlsb xml [130] unlimited
|
||||
* 61) xlsm html [130] unlimited
|
||||
* 62) xlsm txt [130] unlimited
|
||||
* 63) xlsm xhtml [130] unlimited
|
||||
* 64) xlsm xml [130] unlimited
|
||||
* 65) xlsx html [130] unlimited
|
||||
* 66) xlsx txt [130] unlimited
|
||||
* 67) xlsx xhtml [130] unlimited
|
||||
* 68) xlsx xml [130] unlimited
|
||||
* 69) xltm html [130] unlimited
|
||||
* 70) xltm txt [130] unlimited
|
||||
* 71) xltm xhtml [130] unlimited
|
||||
* 72) xltm xml [130] unlimited
|
||||
* 73) xltx html [130] unlimited
|
||||
* 74) xltx txt [130] unlimited
|
||||
* 75) xltx xhtml [130] unlimited
|
||||
* 76) xltx xml [130] unlimited
|
||||
* TikaAuto 0 ms
|
||||
* 1) cdf html [120] unlimited
|
||||
* 2) cdf txt [120] unlimited
|
||||
* 3) cdf xhtml [120] unlimited
|
||||
* 4) cdf xml [120] unlimited
|
||||
* 5) cpio html [120] unlimited
|
||||
* 6) cpio txt [120] unlimited
|
||||
* 7) cpio xhtml [120] unlimited
|
||||
* 8) cpio xml [120] unlimited
|
||||
* 9) doc html [120] unlimited
|
||||
* 10) doc txt [120] unlimited
|
||||
* 11) doc xhtml [120] unlimited
|
||||
* 12) doc xml [120] unlimited
|
||||
* 13) docm html [120] unlimited
|
||||
* 14) docm txt [120] unlimited
|
||||
* 15) docm xhtml [120] unlimited
|
||||
* 16) docm xml [120] unlimited
|
||||
* 17) docx html [120] unlimited
|
||||
* 18) docx txt [120] unlimited
|
||||
* 19) docx xhtml [120] unlimited
|
||||
* 20) docx xml [120] unlimited
|
||||
* 21) dotm html [120] unlimited
|
||||
* 22) dotm txt [120] unlimited
|
||||
* 23) dotm xhtml [120] unlimited
|
||||
* 24) dotm xml [120] unlimited
|
||||
* 25) dotx html [120] unlimited
|
||||
* 26) dotx txt [120] unlimited
|
||||
* 27) dotx xhtml [120] unlimited
|
||||
* 28) dotx xml [120] unlimited
|
||||
* 29) gzip html [120] unlimited
|
||||
* 30) gzip txt [120] unlimited
|
||||
* 31) gzip xhtml [120] unlimited
|
||||
* 32) gzip xml [120] unlimited
|
||||
* 33) hdf html [120] unlimited
|
||||
* 34) hdf txt [120] unlimited
|
||||
* 35) hdf xhtml [120] unlimited
|
||||
* 36) hdf xml [120] unlimited
|
||||
* 37) html html [120] unlimited
|
||||
* 38) html txt [120] unlimited
|
||||
* 39) html xhtml [120] unlimited
|
||||
* 40) html xml [120] unlimited
|
||||
* 41) jar html [120] unlimited
|
||||
* 42) jar txt [120] unlimited
|
||||
* 43) jar xhtml [120] unlimited
|
||||
* 44) jar xml [120] unlimited
|
||||
* 45) java html [120] unlimited
|
||||
* 46) java txt [120] unlimited
|
||||
* 47) java xhtml [120] unlimited
|
||||
* 48) java xml [120] unlimited
|
||||
* 49) key html [120] unlimited
|
||||
* 50) key txt [120] unlimited
|
||||
* 51) key xhtml [120] unlimited
|
||||
* 52) key xml [120] unlimited
|
||||
* 53) mpp html [120] unlimited
|
||||
* 54) mpp txt [120] unlimited
|
||||
* 55) mpp xhtml [120] unlimited
|
||||
* 56) mpp xml [120] unlimited
|
||||
* 57) numbers html [120] unlimited
|
||||
* 58) numbers txt [120] unlimited
|
||||
* 59) numbers xhtml [120] unlimited
|
||||
* 60) numbers xml [120] unlimited
|
||||
* 61) odc html [120] unlimited
|
||||
* 62) odc txt [120] unlimited
|
||||
* 63) odc xhtml [120] unlimited
|
||||
* 64) odc xml [120] unlimited
|
||||
* 65) odi html [120] unlimited
|
||||
* 66) odi txt [120] unlimited
|
||||
* 67) odi xhtml [120] unlimited
|
||||
* 68) odi xml [120] unlimited
|
||||
* 69) odm html [120] unlimited
|
||||
* 70) odm txt [120] unlimited
|
||||
* 71) odm xhtml [120] unlimited
|
||||
* 72) odm xml [120] unlimited
|
||||
* 73) odp html [120] unlimited
|
||||
* 74) odp txt [120] unlimited
|
||||
* 75) odp xhtml [120] unlimited
|
||||
* 76) odp xml [120] unlimited
|
||||
* 77) ods html [120] unlimited
|
||||
* 78) ods txt [120] unlimited
|
||||
* 79) ods xhtml [120] unlimited
|
||||
* 80) ods xml [120] unlimited
|
||||
* 81) odt html [120] unlimited
|
||||
* 82) odt txt [120] unlimited
|
||||
* 83) odt xhtml [120] unlimited
|
||||
* 84) odt xml [120] unlimited
|
||||
* 85) ogx html [120] unlimited
|
||||
* 86) ogx txt [120] unlimited
|
||||
* 87) ogx xhtml [120] unlimited
|
||||
* 88) ogx xml [120] unlimited
|
||||
* 89) oth html [120] unlimited
|
||||
* 90) oth txt [120] unlimited
|
||||
* 91) oth xhtml [120] unlimited
|
||||
* 92) oth xml [120] unlimited
|
||||
* 93) otp html [120] unlimited
|
||||
* 94) otp txt [120] unlimited
|
||||
* 95) otp xhtml [120] unlimited
|
||||
* 96) otp xml [120] unlimited
|
||||
* 97) ots html [120] unlimited
|
||||
* 98) ots txt [120] unlimited
|
||||
* 99) ots xhtml [120] unlimited
|
||||
* 100) ots xml [120] unlimited
|
||||
* 101) ott html [120] unlimited
|
||||
* 102) ott txt [120] unlimited
|
||||
* 103) ott xhtml [120] unlimited
|
||||
* 104) ott xml [120] unlimited
|
||||
* 105) pages html [120] unlimited
|
||||
* 106) pages txt [120] unlimited
|
||||
* 107) pages xhtml [120] unlimited
|
||||
* 108) pages xml [120] unlimited
|
||||
* 109) pdf html [120] unlimited
|
||||
* 110) pdf txt [120] 25 MB
|
||||
* 111) pdf xhtml [120] unlimited
|
||||
* 112) pdf xml [120] unlimited
|
||||
* 113) potm html [120] unlimited
|
||||
* 114) potm txt [120] unlimited
|
||||
* 115) potm xhtml [120] unlimited
|
||||
* 116) potm xml [120] unlimited
|
||||
* 117) potx html [120] unlimited
|
||||
* 118) potx txt [120] unlimited
|
||||
* 119) potx xhtml [120] unlimited
|
||||
* 120) potx xml [120] unlimited
|
||||
* 121) ppam html [120] unlimited
|
||||
* 122) ppam txt [120] unlimited
|
||||
* 123) ppam xhtml [120] unlimited
|
||||
* 124) ppam xml [120] unlimited
|
||||
* 125) ppsm html [120] unlimited
|
||||
* 126) ppsm txt [120] unlimited
|
||||
* 127) ppsm xhtml [120] unlimited
|
||||
* 128) ppsm xml [120] unlimited
|
||||
* 129) ppsx html [120] unlimited
|
||||
* 130) ppsx txt [120] unlimited
|
||||
* 131) ppsx xhtml [120] unlimited
|
||||
* 132) ppsx xml [120] unlimited
|
||||
* 133) ppt html [120] unlimited
|
||||
* 134) ppt txt [120] unlimited
|
||||
* 135) ppt xhtml [120] unlimited
|
||||
* 136) ppt xml [120] unlimited
|
||||
* 137) pptm html [120] unlimited
|
||||
* 138) pptm txt [120] unlimited
|
||||
* 139) pptm xhtml [120] unlimited
|
||||
* 140) pptm xml [120] unlimited
|
||||
* 141) pptx html [120] unlimited
|
||||
* 142) pptx txt [120] unlimited
|
||||
* 143) pptx xhtml [120] unlimited
|
||||
* 144) pptx xml [120] unlimited
|
||||
* 145) rar html [120] unlimited
|
||||
* 146) rar txt [120] unlimited
|
||||
* 147) rar xhtml [120] unlimited
|
||||
* 148) rar xml [120] unlimited
|
||||
* 149) rss html [120] unlimited
|
||||
* 150) rss txt [120] unlimited
|
||||
* 151) rss xhtml [120] unlimited
|
||||
* 152) rss xml [120] unlimited
|
||||
* 153) rtf html [120] unlimited
|
||||
* 154) rtf txt [120] unlimited
|
||||
* 155) rtf xhtml [120] unlimited
|
||||
* 156) rtf xml [120] unlimited
|
||||
* 157) sldm html [120] unlimited
|
||||
* 158) sldm txt [120] unlimited
|
||||
* 159) sldm xhtml [120] unlimited
|
||||
* 160) sldm xml [120] unlimited
|
||||
* 161) sldx html [120] unlimited
|
||||
* 162) sldx txt [120] unlimited
|
||||
* 163) sldx xhtml [120] unlimited
|
||||
* 164) sldx xml [120] unlimited
|
||||
* 165) sxw html [120] unlimited
|
||||
* 166) sxw txt [120] unlimited
|
||||
* 167) sxw xhtml [120] unlimited
|
||||
* 168) sxw xml [120] unlimited
|
||||
* 169) txt html [120] unlimited
|
||||
* 170) txt txt [120] unlimited
|
||||
* 171) txt xhtml [120] unlimited
|
||||
* 172) txt xml [120] unlimited
|
||||
* 173) vsd html [120] unlimited
|
||||
* 174) vsd txt [120] unlimited
|
||||
* 175) vsd xhtml [120] unlimited
|
||||
* 176) vsd xml [120] unlimited
|
||||
* 177) xhtml html [120] unlimited
|
||||
* 178) xhtml txt [120] unlimited
|
||||
* 179) xhtml xhtml [120] unlimited
|
||||
* 180) xhtml xml [120] unlimited
|
||||
* 181) xlam html [120] unlimited
|
||||
* 182) xlam txt [120] unlimited
|
||||
* 183) xlam xhtml [120] unlimited
|
||||
* 184) xlam xml [120] unlimited
|
||||
* 185) xls html [120] unlimited
|
||||
* 186) xls txt [120] unlimited
|
||||
* 187) xls xhtml [120] unlimited
|
||||
* 188) xls xml [120] unlimited
|
||||
* 189) xlsb html [120] unlimited
|
||||
* 190) xlsb txt [120] unlimited
|
||||
* 191) xlsb xhtml [120] unlimited
|
||||
* 192) xlsb xml [120] unlimited
|
||||
* 193) xlsm html [120] unlimited
|
||||
* 194) xlsm txt [120] unlimited
|
||||
* 195) xlsm xhtml [120] unlimited
|
||||
* 196) xlsm xml [120] unlimited
|
||||
* 197) xlsx html [120] unlimited
|
||||
* 198) xlsx txt [120] unlimited
|
||||
* 199) xlsx xhtml [120] unlimited
|
||||
* 200) xlsx xml [120] unlimited
|
||||
* 201) xltm html [120] unlimited
|
||||
* 202) xltm txt [120] unlimited
|
||||
* 203) xltm xhtml [120] unlimited
|
||||
* 204) xltm xml [120] unlimited
|
||||
* 205) xltx html [120] unlimited
|
||||
* 206) xltx txt [120] unlimited
|
||||
* 207) xltx xhtml [120] unlimited
|
||||
* 208) xltx xml [120] unlimited
|
||||
* 209) xml html [120] unlimited
|
||||
* 210) xml txt [120] unlimited
|
||||
* 211) xml xhtml [120] unlimited
|
||||
* 212) xml xml [120] unlimited
|
||||
* 213) z html [120] unlimited
|
||||
* 214) z txt [120] unlimited
|
||||
* 215) z xhtml [120] unlimited
|
||||
* 216) z xml [120] unlimited
|
||||
* TextMining 0 ms
|
||||
* 1) doc html [130] unlimited
|
||||
* 2) doc txt [50] unlimited
|
||||
* 3) doc xhtml [130] unlimited
|
||||
* 4) doc xml [130] unlimited
|
||||
* </pre>
|
||||
*/
|
||||
public class Tika
|
||||
{
|
||||
// Names of the supported transforms. These are the values matched by the switch in
// transform(String, Boolean, ...) to pick a parser.
public static final String ARCHIVE = "Archive";
|
||||
public static final String OUTLOOK_MSG = "OutlookMsg";
|
||||
public static final String PDF_BOX = "PdfBox";
|
||||
public static final String POI_OFFICE = "Office";
|
||||
public static final String POI = "Poi";
|
||||
public static final String POI_OO_XML = "OOXML";
|
||||
public static final String TIKA_AUTO = "TikaAuto";
|
||||
public static final String TEXT_MINING = "TextMining";
|
||||
|
||||
// Every transform name declared above, collected into one list.
public static final List<String> TRANSFORM_NAMES = Arrays.asList(
|
||||
ARCHIVE, OUTLOOK_MSG, PDF_BOX, POI_OFFICE, POI, POI_OO_XML, TIKA_AUTO, TEXT_MINING);
|
||||
|
||||
// Option strings recognised by transform(String[]). The two ending in '=' are followed by a value,
// --includeContents is a flag without a value.
public static final String TARGET_MIMETYPE = "--targetMimetype=";
|
||||
public static final String TARGET_ENCODING = "--targetEncoding=";
|
||||
public static final String INCLUDE_CONTENTS = "--includeContents";
|
||||
|
||||
// Short lower case type identifiers. HTML and XML are also used by getContentHandler as the
// OutputKeys.METHOD value of the serializer.
public static final String CSV = "csv";
|
||||
public static final String DOC = "doc";
|
||||
public static final String DOCX = "docx";
|
||||
public static final String HTML = "html";
|
||||
public static final String MSG = "msg";
|
||||
public static final String PDF = "pdf";
|
||||
public static final String PPTX = "pptx";
|
||||
public static final String TXT = "txt";
|
||||
public static final String XHTML = "xhtml";
|
||||
// NOTE(review): "xslx" (name and value) looks like a typo for "xlsx" - confirm against the callers
// and test resources before changing it.
public static final String XSLX = "xslx";
|
||||
public static final String XML = "xml";
|
||||
public static final String ZIP = "zip";
|
||||
|
||||
// Parser instances, one of which is selected for each transform name.
private Parser packageParser = new PackageParser();
|
||||
private Parser pdfParser = new PDFParser();
|
||||
private Parser officeParser = new OfficeParser();
|
||||
// Not initialised here: assigned in the constructor from the tika-config.xml resource.
private Parser autoDetectParser;
|
||||
private Parser ooXmlParser = new OOXMLParser();
|
||||
private Parser tikaOfficeDetectParser = new TikaOfficeDetectParser();
|
||||
|
||||
private DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector()
|
||||
{
|
||||
private List<String> disabledMediaTypes = Arrays.asList(new String[] {MIMETYPE_IMAGE_JPEG, MIMETYPE_IMAGE_TIFF, MIMETYPE_IMAGE_PNG});
|
||||
|
||||
@Override
|
||||
public boolean select(Metadata metadata)
|
||||
{
|
||||
String contentType = metadata.get(Metadata.CONTENT_TYPE);
|
||||
if (contentType == null || contentType.equals("") || disabledMediaTypes == null)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return !disabledMediaTypes.contains(contentType);
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Creates the transformer. The auto detect parser is built from the {@code tika-config.xml} resource
 * looked up through this class's class loader.
 */
public Tika() throws TikaException, IOException, SAXException
{
    URL configLocation = getClass().getClassLoader().getResource("tika-config.xml");
    autoDetectParser = new AutoDetectParser(new TikaConfig(configLocation));
}
|
||||
|
||||
// Method included for developer testing
|
||||
public static void main(String[] args)
|
||||
{
|
||||
long start = System.currentTimeMillis();
|
||||
try
|
||||
{
|
||||
new Tika().transform(args);
|
||||
}
|
||||
catch (IllegalArgumentException e)
|
||||
{
|
||||
System.err.println("ERROR "+e.getMessage());
|
||||
System.exit(-1);
|
||||
}
|
||||
catch (IllegalStateException | TikaException | IOException | SAXException e)
|
||||
{
|
||||
System.err.println("ERROR "+e.getMessage());
|
||||
e.printStackTrace();
|
||||
System.exit(-2);
|
||||
}
|
||||
System.out.println("Finished in "+(System.currentTimeMillis()-start)+"ms");
|
||||
}
|
||||
|
||||
// Extracts parameters form args
|
||||
public void transform(String[] args)
|
||||
{
|
||||
String transform = null;
|
||||
String targetMimetype = null;
|
||||
String targetEncoding = null;
|
||||
String sourceFilename = null;
|
||||
String targetFilename = null;
|
||||
Boolean includeContents = null;
|
||||
|
||||
for (String arg: args)
|
||||
{
|
||||
if (arg.startsWith("--"))
|
||||
{
|
||||
if (INCLUDE_CONTENTS.startsWith(arg))
|
||||
{
|
||||
getValue(arg, false, includeContents, INCLUDE_CONTENTS);
|
||||
includeContents = true;
|
||||
}
|
||||
else if (arg.startsWith(TARGET_ENCODING))
|
||||
{
|
||||
targetEncoding = getValue(arg, true, targetEncoding, TARGET_ENCODING);
|
||||
}
|
||||
else if (arg.startsWith(TARGET_MIMETYPE))
|
||||
{
|
||||
targetMimetype = getValue(arg, true, targetMimetype, TARGET_MIMETYPE);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new IllegalArgumentException("Unexpected argument "+arg);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (transform == null)
|
||||
{
|
||||
transform = arg;
|
||||
}
|
||||
else if (sourceFilename == null)
|
||||
{
|
||||
sourceFilename = arg;
|
||||
}
|
||||
else if (targetFilename == null)
|
||||
{
|
||||
targetFilename = arg;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new IllegalArgumentException("Unexpected argument "+arg);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (targetFilename == null)
|
||||
{
|
||||
throw new IllegalArgumentException("Missing arguments");
|
||||
}
|
||||
includeContents = includeContents == null ? false : includeContents;
|
||||
|
||||
transform(transform, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding);
|
||||
}
|
||||
|
||||
private String getValue(String arg, boolean valueExpected, Object value, String optionName)
|
||||
{
|
||||
if (value != null)
|
||||
{
|
||||
throw new IllegalArgumentException("Duplicate "+optionName);
|
||||
}
|
||||
String stringValue = arg.substring(optionName.length()).trim();
|
||||
if (!valueExpected && stringValue.length() > 0)
|
||||
{
|
||||
throw new IllegalArgumentException("Unexpected value with "+optionName);
|
||||
}
|
||||
if (valueExpected && stringValue.length() == 0)
|
||||
{
|
||||
throw new IllegalArgumentException("Expected value with "+optionName);
|
||||
}
|
||||
return stringValue;
|
||||
}
|
||||
|
||||
// Adds transform specific values such as parser and documentSelector.
|
||||
private void transform(String transform, Boolean includeContents,
|
||||
String sourceFilename,
|
||||
String targetFilename, String targetMimetype, String targetEncoding)
|
||||
{
|
||||
Parser parser = null;
|
||||
DocumentSelector documentSelector = null;
|
||||
|
||||
switch(transform)
|
||||
{
|
||||
case ARCHIVE:
|
||||
parser = packageParser;
|
||||
break;
|
||||
case OUTLOOK_MSG:
|
||||
case POI_OFFICE:
|
||||
case TEXT_MINING:
|
||||
parser = officeParser;
|
||||
break;
|
||||
case PDF_BOX:
|
||||
parser = pdfParser;
|
||||
documentSelector = pdfBoxEmbededDocumentSelector;
|
||||
break;
|
||||
case POI:
|
||||
parser = tikaOfficeDetectParser;
|
||||
break;
|
||||
case POI_OO_XML:
|
||||
parser = ooXmlParser;
|
||||
break;
|
||||
case TIKA_AUTO:
|
||||
parser = autoDetectParser;
|
||||
break;
|
||||
}
|
||||
|
||||
transform(parser, documentSelector, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding);
|
||||
}
|
||||
|
||||
|
||||
private void transform(Parser parser, DocumentSelector documentSelector, Boolean includeContents,
|
||||
String sourceFilename,
|
||||
String targetFilename, String targetMimetype, String targetEncoding)
|
||||
{
|
||||
InputStream is = null;
|
||||
OutputStream os = null;
|
||||
Writer ow = null;
|
||||
|
||||
try
|
||||
{
|
||||
is = new BufferedInputStream(new FileInputStream(sourceFilename));
|
||||
os = new FileOutputStream(targetFilename);
|
||||
ow = new BufferedWriter(new OutputStreamWriter(os, targetEncoding));
|
||||
Metadata metadata = new Metadata();
|
||||
ParseContext context = buildParseContext(documentSelector, includeContents);
|
||||
ContentHandler handler = getContentHandler(targetMimetype, ow);
|
||||
|
||||
parser.parse(is, handler, metadata, context);
|
||||
}
|
||||
catch (SAXException | TikaException | IOException e)
|
||||
{
|
||||
throw new IllegalStateException(e.getMessage(), e);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (is != null)
|
||||
{
|
||||
try { is.close(); } catch (Throwable e) {}
|
||||
}
|
||||
if (os != null)
|
||||
{
|
||||
try { os.close(); } catch (Throwable e) {}
|
||||
}
|
||||
if (ow != null)
|
||||
{
|
||||
try { ow.close(); } catch (Throwable e) {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected ContentHandler getContentHandler(String targetMimetype, Writer output)
|
||||
{
|
||||
try
|
||||
{
|
||||
ContentHandler handler;
|
||||
if (MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
|
||||
{
|
||||
handler = new BodyContentHandler(output);
|
||||
}
|
||||
else
|
||||
{
|
||||
SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
|
||||
TransformerHandler transformerHandler = null;
|
||||
transformerHandler = factory.newTransformerHandler();
|
||||
transformerHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
|
||||
transformerHandler.setResult(new StreamResult(output));
|
||||
handler = transformerHandler;
|
||||
|
||||
if (MIMETYPE_HTML.equals(targetMimetype))
|
||||
{
|
||||
transformerHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, HTML);
|
||||
return new ExpandedTitleContentHandler(transformerHandler);
|
||||
}
|
||||
else if (MIMETYPE_XHTML.equals(targetMimetype) ||
|
||||
MIMETYPE_XML.equals(targetMimetype))
|
||||
{
|
||||
transformerHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, XML);
|
||||
}
|
||||
else if (MIMETYPE_TEXT_CSV.equals(targetMimetype))
|
||||
{
|
||||
handler = new CsvContentHandler(output);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new IllegalArgumentException("Invalid target mimetype " + targetMimetype);
|
||||
}
|
||||
}
|
||||
return handler;
|
||||
}
|
||||
catch (TransformerConfigurationException e)
|
||||
{
|
||||
throw new IllegalStateException(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A wrapper around the normal Tika BodyContentHandler for CSV rather encoding than tab separated.
|
||||
*/
|
||||
protected static class CsvContentHandler extends BodyContentHandler {
|
||||
private static final char[] comma = new char[]{ ',' };
|
||||
private static final Pattern all_nums = Pattern.compile("[\\d\\.\\-\\+]+");
|
||||
|
||||
private boolean inCell = false;
|
||||
private boolean needsComma = false;
|
||||
|
||||
protected CsvContentHandler(Writer output) {
|
||||
super(output);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void ignorableWhitespace(char[] ch, int start, int length)
|
||||
throws SAXException {
|
||||
if(length == 1 && ch[0] == '\t') {
|
||||
// Ignore tabs, as they mess up the CSV output
|
||||
} else {
|
||||
super.ignorableWhitespace(ch, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void characters(char[] ch, int start, int length)
|
||||
throws SAXException {
|
||||
if(inCell) {
|
||||
StringBuffer t = new StringBuffer(new String(ch,start,length));
|
||||
|
||||
// Quote if not all numbers
|
||||
if(all_nums.matcher(t).matches())
|
||||
{
|
||||
super.characters(ch, start, length);
|
||||
}
|
||||
else
|
||||
{
|
||||
for(int i=t.length()-1; i>=0; i--) {
|
||||
if(t.charAt(i) == '\"') {
|
||||
// Double up double quotes
|
||||
t.insert(i, '\"');
|
||||
i--;
|
||||
}
|
||||
}
|
||||
t.insert(0, '\"');
|
||||
t.append('\"');
|
||||
char[] c = t.toString().toCharArray();
|
||||
super.characters(c, 0, c.length);
|
||||
}
|
||||
} else {
|
||||
super.characters(ch, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startElement(String uri, String localName, String name,
|
||||
Attributes atts) throws SAXException {
|
||||
if(localName.equals("td")) {
|
||||
inCell = true;
|
||||
if(needsComma) {
|
||||
super.characters(comma, 0, 1);
|
||||
needsComma = true;
|
||||
}
|
||||
} else {
|
||||
super.startElement(uri, localName, name, atts);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void endElement(String uri, String localName, String name)
|
||||
throws SAXException {
|
||||
if(localName.equals("td")) {
|
||||
needsComma = true;
|
||||
inCell = false;
|
||||
} else {
|
||||
if(localName.equals("tr")) {
|
||||
needsComma = false;
|
||||
}
|
||||
super.endElement(uri, localName, name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected ParseContext buildParseContext(DocumentSelector documentSelector, Boolean includeContents)
|
||||
{
|
||||
ParseContext context = new ParseContext();
|
||||
if (documentSelector != null)
|
||||
{
|
||||
context.set(DocumentSelector.class, documentSelector);
|
||||
}
|
||||
|
||||
// pdfParserConfig is never set in the original repo code, so code removed here.
|
||||
|
||||
// If Archive transform
|
||||
if (includeContents != null)
|
||||
{
|
||||
context.set(Parser.class, includeContents ? autoDetectParser : new EmptyParser());
|
||||
}
|
||||
|
||||
return context;
|
||||
}
|
||||
}
|
@ -0,0 +1,137 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Enterprise Repository
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2018 Alfresco Software Limited
|
||||
* %%
|
||||
* License rights for this program may be obtained from Alfresco Software, Ltd.
|
||||
* pursuant to a written agreement and any use of this program without such an
|
||||
* agreement is prohibited.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer;
|
||||
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.Resource;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.stereotype.Controller;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestParam;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.servlet.http.HttpServletRequest;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.alfresco.repo.content.MimetypeMap.MIMETYPE_TEXT_PLAIN;
|
||||
import static org.alfresco.transformer.Tika.*;
|
||||
|
||||
/**
|
||||
* Controller for the Docker based Tika transformers.
|
||||
*
|
||||
* Status Codes:
|
||||
*
|
||||
* 200 Success
|
||||
* 400 Bad Request: Invalid target mimetype <mimetype>
|
||||
* 400 Bad Request: Request parameter <name> is missing (missing mandatory parameter)
|
||||
* 400 Bad Request: Request parameter <name> is of the wrong type
|
||||
* 400 Bad Request: Transformer exit code was not 0 (possible problem with the source file)
|
||||
* 400 Bad Request: The source filename was not supplied
|
||||
* 500 Internal Server Error: (no message with low level IO problems)
|
||||
* 500 Internal Server Error: The target filename was not supplied (should not happen as targetExtension is checked)
|
||||
* 500 Internal Server Error: Transformer version check exit code was not 0
|
||||
* 500 Internal Server Error: Transformer version check failed to create any output
|
||||
* 500 Internal Server Error: Could not read the target file
|
||||
* 500 Internal Server Error: The target filename was malformed (should not happen because of other checks)
|
||||
* 500 Internal Server Error: Transformer failed to create an output file (the exit code was 0, so there should be some content)
|
||||
* 500 Internal Server Error: Filename encoding error
|
||||
* 507 Insufficient Storage: Failed to store the source file
|
||||
*/
|
||||
@Controller
|
||||
public class TikaController extends AbstractTransformerController
|
||||
{
|
||||
private Tika tika;
|
||||
|
||||
@Autowired
|
||||
public TikaController() throws TikaException, IOException, SAXException
|
||||
{
|
||||
logger = LogFactory.getLog(TikaController.class);
|
||||
logger.info("--------------------------------------------------------------------------------------------------------------------------------------------------------------");
|
||||
logEnterpriseLicenseMessage();
|
||||
logger.info("Tika is from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0. or in /Apache\\ 2.0.txt");
|
||||
logger.info("--------------------------------------------------------------------------------------------------------------------------------------------------------------");
|
||||
|
||||
tika = new Tika();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getTransformerName()
|
||||
{
|
||||
return "Tika";
|
||||
}
|
||||
|
||||
@Override
|
||||
public void callTransform(String... args)
|
||||
{
|
||||
tika.transform(args);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String version()
|
||||
{
|
||||
return "Tika available";
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ProbeTestTransform getProbeTestTransform()
|
||||
{
|
||||
// See the Javadoc on this method and Probes.md for the choice of these values.
|
||||
// the livenessPercentage is a little large as Tika does tend to suffer from slow transforms that class with a gc.
|
||||
return new ProbeTestTransform(this, "quick.pdf", "quick.txt",
|
||||
60, 16, 400, 10240, 60*30+1, 60*15+20)
|
||||
{
|
||||
@Override
|
||||
protected void executeTransformCommand(File sourceFile, File targetFile)
|
||||
{
|
||||
TikaController.this.callTransform(sourceFile, targetFile, PDF_BOX,
|
||||
TARGET_MIMETYPE+MIMETYPE_TEXT_PLAIN, TARGET_ENCODING+"UTF-8");
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@PostMapping("/transform")
|
||||
public ResponseEntity<Resource> transform(HttpServletRequest request,
|
||||
@RequestParam("file") MultipartFile sourceMultipartFile,
|
||||
@RequestParam("targetExtension") String targetExtension,
|
||||
@RequestParam("targetMimetype") String targetMimetype,
|
||||
@RequestParam("targetEncoding") String targetEncoding,
|
||||
|
||||
@RequestParam(value = "timeout", required = false) Long timeout,
|
||||
@RequestParam(value = "testDelay", required = false) Long testDelay,
|
||||
|
||||
@RequestParam(value = "transform") String transform,
|
||||
@RequestParam(value="includeContents", required = false) Boolean includeContents)
|
||||
{
|
||||
if (!TRANSFORM_NAMES.contains(transform))
|
||||
{
|
||||
throw new TransformException(400, "Invalid transform value");
|
||||
}
|
||||
|
||||
String targetFilename = createTargetFileName(sourceMultipartFile, targetExtension);
|
||||
File sourceFile = createSourceFile(request, sourceMultipartFile);
|
||||
File targetFile = createTargetFile(request, targetFilename);
|
||||
// Both files are deleted by TransformInterceptor.afterCompletion
|
||||
|
||||
// TODO Consider streaming the request and response rather than using temporary files
|
||||
// https://www.logicbig.com/tutorials/spring-framework/spring-web-mvc/streaming-response-body.html
|
||||
|
||||
callTransform(sourceFile, targetFile, transform,
|
||||
includeContents != null && includeContents ? INCLUDE_CONTENTS : null,
|
||||
TARGET_MIMETYPE+targetMimetype, TARGET_ENCODING+targetEncoding);
|
||||
|
||||
return createAttachment(targetFilename, targetFile, testDelay);
|
||||
}
|
||||
}
|
@ -0,0 +1,117 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Repository
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2016 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.PushbackInputStream;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.poi.poifs.common.POIFSConstants;
|
||||
import org.apache.poi.util.IOUtils;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.microsoft.OfficeParser;
|
||||
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
///////// THIS FILE IS A COPY OF THE CODE IN alfresco-repository /////////////
|
||||
|
||||
/**
|
||||
* <a href="http://tika.apache.org/Apache Tika">Apache Tika</a> assumes that
|
||||
* you either know exactly what your content is, or that
|
||||
* you'll leave it to auto-detection.
|
||||
* Within Alfresco, we usually do know. However, from time
|
||||
* to time, we don't know if we have one of the old or one
|
||||
* of the new office files (eg .xls and .xlsx).
|
||||
* This class allows automatically selects the appropriate
|
||||
* old (OLE2) or new (OOXML) Tika parser as required.
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public class TikaOfficeDetectParser implements Parser {
|
||||
private Parser ole2Parser = new OfficeParser();
|
||||
private Parser ooxmlParser = new OOXMLParser();
|
||||
|
||||
public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
|
||||
Set<MediaType> types = new HashSet<MediaType>();
|
||||
types.addAll(ole2Parser.getSupportedTypes(parseContext));
|
||||
types.addAll(ooxmlParser.getSupportedTypes(parseContext));
|
||||
return types;
|
||||
}
|
||||
|
||||
public void parse(InputStream stream,
|
||||
ContentHandler handler, Metadata metadata,
|
||||
ParseContext parseContext) throws IOException, SAXException,
|
||||
TikaException
|
||||
{
|
||||
byte[] initial4 = new byte[4];
|
||||
InputStream wrapped;
|
||||
// Preserve TikaInputStreams as TikaInputStreams as they require less memory to process
|
||||
if (stream.markSupported())
|
||||
{
|
||||
stream.mark(initial4.length);
|
||||
IOUtils.readFully(stream, initial4);
|
||||
stream.reset();
|
||||
wrapped = stream;
|
||||
}
|
||||
else
|
||||
{
|
||||
PushbackInputStream inp = new PushbackInputStream(stream, 4);
|
||||
IOUtils.readFully(inp, initial4);
|
||||
inp.unread(initial4);
|
||||
wrapped = inp;
|
||||
}
|
||||
|
||||
// Which is it?
|
||||
if(initial4[0] == POIFSConstants.OOXML_FILE_HEADER[0] &&
|
||||
initial4[1] == POIFSConstants.OOXML_FILE_HEADER[1] &&
|
||||
initial4[2] == POIFSConstants.OOXML_FILE_HEADER[2] &&
|
||||
initial4[3] == POIFSConstants.OOXML_FILE_HEADER[3])
|
||||
{
|
||||
ooxmlParser.parse(wrapped, handler, metadata, parseContext);
|
||||
}
|
||||
else
|
||||
{
|
||||
ole2Parser.parse(wrapped, handler, metadata, parseContext);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated This method will be removed in Apache Tika 1.0.
|
||||
*/
|
||||
public void parse(InputStream stream,
|
||||
ContentHandler handler, Metadata metadata)
|
||||
throws IOException, SAXException, TikaException
|
||||
{
|
||||
parse(stream, handler, metadata, new ParseContext());
|
||||
}
|
||||
}
|
BIN
alfresco-docker-tika/src/main/resources/quick.pdf
Normal file
BIN
alfresco-docker-tika/src/main/resources/quick.pdf
Normal file
Binary file not shown.
@ -0,0 +1,39 @@
|
||||
<html xmlns:th="http://www.thymeleaf.org">
|
||||
<body>
|
||||
|
||||
<div>
|
||||
<h2>Tika Test Transformations</h2>
|
||||
<form method="POST" enctype="multipart/form-data" action="/transform">
|
||||
<table>
|
||||
<tr><td><div style="text-align:right">transform *</div></td><td><select name="transform">
|
||||
<option value="Archive">Archive</option>
|
||||
<option value="OutlookMsg">OutlookMsg</option>
|
||||
<option selected="selected" value="PdfBox">PdfBox</option>
|
||||
<option value="Office">Office</option>
|
||||
<option value="Poi">Poi</option>
|
||||
<option value="OOXML">OOXML</option>
|
||||
<option value="TikaAuto">TikaAuto</option>
|
||||
<option value="TextMining">TextMining</option>
|
||||
<option value="UNSET"></option>
|
||||
<option value="BADVALUE">BADVALUE</option>
|
||||
<option value="MIXED CASE TikaAuto">TikaAuto</option>
|
||||
</select></td></tr>
|
||||
<tr><td><div style="text-align:right">file *</div></td><td><input type="file" name="file" /></td></tr>
|
||||
<tr><td><div style="text-align:right">targetExtension *</div></td><td><input type="text" name="targetExtension" value="txt" /></td></tr>
|
||||
<tr><td><div style="text-align:right">targetMimetype *</div></td><td><input type="text" name="targetMimetype" value="text/plain" /></td></tr>
|
||||
<tr><td><div style="text-align:right">targetEncoding *</div></td><td><input type="text" name="targetEncoding" value="UTF-8" /></td></tr>
|
||||
<tr><td><div style="text-align:right">includeContents (archive) *</div></td><td><input type="checkbox" name="includeContents" value="true" /></td></tr>
|
||||
<tr><td><div style="text-align:right">timeout</div></td><td><input type="text" name="timeout" value="" /></td></tr>
|
||||
<tr><td><div style="text-align:right">testDelay</div></td><td><input type="text" name="testDelay" value="" /></td></tr>
|
||||
|
||||
<tr><td></td><td><input type="submit" value="Transform" /></td></tr>
|
||||
</table>
|
||||
</form>
|
||||
</div>
|
||||
|
||||
<div>
|
||||
<a href="/log">Log entries</a>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
6
alfresco-docker-tika/src/main/resources/tika-config.xml
Normal file
6
alfresco-docker-tika/src/main/resources/tika-config.xml
Normal file
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<properties>
|
||||
<!-- This property, when set, hides Tika's start up warnings about libraries that are missing. -->
|
||||
<!-- See https://issues.apache.org/jira/browse/TIKA-2490 -->
|
||||
<service-loader initializableProblemHandler="ignore"/>
|
||||
</properties>
|
@ -0,0 +1,344 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Repository
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2018 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.springframework.boot.test.autoconfigure.web.servlet.WebMvcTest;
|
||||
import org.springframework.boot.test.mock.mockito.SpyBean;
|
||||
import org.springframework.mock.web.MockMultipartFile;
|
||||
import org.springframework.test.context.junit4.SpringRunner;
|
||||
import org.springframework.test.web.servlet.MvcResult;
|
||||
import org.springframework.test.web.servlet.request.MockHttpServletRequestBuilder;
|
||||
|
||||
import static org.alfresco.repo.content.MimetypeMap.*;
|
||||
import static org.alfresco.transformer.Tika.*;
|
||||
import static org.springframework.test.util.AssertionErrors.assertTrue;
|
||||
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.header;
|
||||
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
|
||||
|
||||
/**
|
||||
* Test the TikaController without a server.
|
||||
* Super class includes tests for the AbstractTransformerController.
|
||||
*/
|
||||
@RunWith(SpringRunner.class)
|
||||
@WebMvcTest(TikaController.class)
|
||||
public class TikaControllerTest extends AbstractTransformerControllerTest
|
||||
{
|
||||
public static final String EXPECTED_XHTML_CONTENT_CONTAINS = "<p>The quick brown fox jumps over the lazy dog</p>";
|
||||
public static final String EXPECTED_TEXT_CONTENT_CONTAINS = "The quick brown fox jumps over the lazy dog";
|
||||
public static final String EXPECTED_MSG_CONTENT_CONTAINS = "Recipients\n" +
|
||||
"\tmark.rogers@alfresco.com; speedy@quick.com; mrquick@nowhere.com\n" +
|
||||
"\n" +
|
||||
"The quick brown fox jumps over the lazy dogs";
|
||||
public static final String EXPECTED_CSV_CONTENT_CONTAINS = "\"The\",\"quick\",\"brown\",\"fox\"";
|
||||
|
||||
@SpyBean
|
||||
private TikaController controller;
|
||||
|
||||
String transform = PDF_BOX;
|
||||
String targetEncoding = "UTF-8";
|
||||
String targetMimetype = MIMETYPE_TEXT_PLAIN;
|
||||
|
||||
private void transform(String transform, String sourceExtension, String targetExtension,
|
||||
String sourceMimetype, String targetMimetype,
|
||||
Boolean includeContents, String expectedContentContains) throws Exception
|
||||
{
|
||||
// We don't use targetFileBytes as some of the transforms contain different date text based on the os being used.
|
||||
super.mockTransformCommand(controller, sourceExtension, targetExtension, sourceMimetype, false);
|
||||
this.transform = transform;
|
||||
this.targetMimetype = targetMimetype;
|
||||
|
||||
System.out.println("Test "+transform+" "+ sourceExtension +" to "+targetExtension);
|
||||
MockHttpServletRequestBuilder requestBuilder = includeContents == null
|
||||
? mockMvcRequest("/transform", sourceFile, "targetExtension", this.targetExtension)
|
||||
: mockMvcRequest("/transform", sourceFile, "targetExtension", this.targetExtension, "includeContents", includeContents.toString());
|
||||
MvcResult result = mockMvc.perform(requestBuilder)
|
||||
.andExpect(status().is(200))
|
||||
.andExpect(header().string("Content-Disposition", "attachment; filename*= UTF-8''quick." + this.targetExtension)).
|
||||
andReturn();
|
||||
String content = result.getResponse().getContentAsString();
|
||||
assertTrue("The content did not include \""+expectedContentContains, content.contains(expectedContentContains));
|
||||
}
|
||||
|
||||
@Override
|
||||
// Add extra required parameters to the request.
|
||||
protected MockHttpServletRequestBuilder mockMvcRequest(String url, MockMultipartFile sourceFile, String... params)
|
||||
{
|
||||
return super.mockMvcRequest(url, sourceFile, params)
|
||||
.param("transform", transform)
|
||||
.param("targetEncoding", targetEncoding)
|
||||
.param("targetMimetype", targetMimetype);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void simpleTransformTest() throws Exception
|
||||
{
|
||||
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
|
||||
super.simpleTransformTest();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void testDelayTest() throws Exception
|
||||
{
|
||||
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
|
||||
super.testDelayTest();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void badExitCodeTest() throws Exception
|
||||
{
|
||||
// Ignore the test in super class as the Tika transforms are real rather than mocked up.
|
||||
// It is the mock that returns a non zero exit code.
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void noTargetFileTest() throws Exception
|
||||
{
|
||||
// Ignore the test in super class as the Tika transforms are real rather than mocked up.
|
||||
// It is the mock that returns a zero length file for other transformers, when we supply an invalid targetExtension.
|
||||
}
|
||||
|
||||
// --- Super class tests (need modified setup) ---
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void dotDotSourceFilenameTest() throws Exception
|
||||
{
|
||||
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
|
||||
super.dotDotSourceFilenameTest();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void noExtensionSourceFilenameTest() throws Exception
|
||||
{
|
||||
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
|
||||
super.noExtensionSourceFilenameTest();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void badSourceFilenameTest() throws Exception
|
||||
{
|
||||
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
|
||||
super.badSourceFilenameTest();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void blankSourceFilenameTest() throws Exception
|
||||
{
|
||||
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
|
||||
super.blankSourceFilenameTest();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void noTargetExtensionTest() throws Exception
|
||||
{
|
||||
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
|
||||
super.noTargetExtensionTest();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Override
|
||||
public void calculateMaxTime() throws Exception
|
||||
{
|
||||
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
|
||||
super.calculateMaxTime();
|
||||
}
|
||||
|
||||
// --- General Tika tests ---
|
||||
|
||||
@Test
|
||||
public void badEncodingTest() throws Exception
|
||||
{
|
||||
super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
|
||||
targetEncoding = "rubbish";
|
||||
mockMvc.perform(mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension))
|
||||
.andExpect(status().is(500));
|
||||
}
|
||||
|
||||
// --- Archive ---
|
||||
|
||||
@Test
|
||||
public void zipToTextArchiveTest() throws Exception
|
||||
{
|
||||
transform(ARCHIVE, ZIP, TXT, MIMETYPE_ZIP, MIMETYPE_TEXT_PLAIN,false,
|
||||
"quick.html\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"quick.pdf\n" +
|
||||
"\n" +
|
||||
"\n");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void zipToTextIncludeArchiveTest() throws Exception
|
||||
{
|
||||
transform(ARCHIVE, ZIP, TXT, MIMETYPE_ZIP, MIMETYPE_TEXT_PLAIN,true,
|
||||
"quick.html\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"The quick brown fox jumps over the lazy dog\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"quick.pdf\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"The quick brown fox jumps over the lazy dog" +
|
||||
"\n" +
|
||||
"\n");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void zipToTextExcludeArchiveTest() throws Exception
|
||||
{
|
||||
transform(ARCHIVE, ZIP, TXT, MIMETYPE_ZIP, MIMETYPE_TEXT_PLAIN,
|
||||
false, "\n" +
|
||||
"folder/subfolder/quick.jpg\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"quick.doc\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"quick.html\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"quick.pdf\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"quick.txt\n" +
|
||||
"\n" +
|
||||
"\n" +
|
||||
"quick.xml\n" +
|
||||
"\n");
|
||||
}
|
||||
|
||||
// --- OutlookMsg ---
|
||||
|
||||
@Test
|
||||
public void msgToTxtOutlookMsgTest() throws Exception
|
||||
{
|
||||
transform(OUTLOOK_MSG, MSG, TXT, MIMETYPE_OUTLOOK_MSG, MIMETYPE_TEXT_PLAIN, null, EXPECTED_MSG_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
// --- PdfBox ---
|
||||
|
||||
@Test
|
||||
public void pdfToTxtPdfBoxTest() throws Exception
|
||||
{
|
||||
transform(PDF_BOX, PDF, TXT, MIMETYPE_PDF, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void pdfToCsvPdfBoxTest() throws Exception
|
||||
{
|
||||
transform(PDF_BOX, PDF, CSV, MIMETYPE_PDF, MIMETYPE_TEXT_CSV, null, EXPECTED_TEXT_CONTENT_CONTAINS); // Yes it is just text
|
||||
}
|
||||
|
||||
@Test
|
||||
public void pdfToXmlPdfBoxTest() throws Exception
|
||||
{
|
||||
transform(PDF_BOX, PDF, XML, MIMETYPE_PDF, MIMETYPE_XML, null, EXPECTED_XHTML_CONTENT_CONTAINS); // Yes it is just XHTML
|
||||
}
|
||||
|
||||
@Test
|
||||
public void pdfToXhtmlPdfBoxTest() throws Exception
|
||||
{
|
||||
transform(PDF_BOX, PDF, XHTML, MIMETYPE_PDF, MIMETYPE_XHTML, null, EXPECTED_XHTML_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void pdfToHtmlPdfBoxTest() throws Exception
|
||||
{
|
||||
transform(PDF_BOX, PDF, HTML, MIMETYPE_PDF, MIMETYPE_HTML, null, EXPECTED_XHTML_CONTENT_CONTAINS); // Yes it is just XHTML
|
||||
}
|
||||
|
||||
// --- Office ---
|
||||
|
||||
@Test
|
||||
public void msgToTxtOfficeTest() throws Exception
|
||||
{
|
||||
transform(POI_OFFICE, MSG, TXT, MIMETYPE_OUTLOOK_MSG, MIMETYPE_TEXT_PLAIN, null, EXPECTED_MSG_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void docToTxtOfficeTest() throws Exception
|
||||
{
|
||||
transform(POI_OFFICE, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
// --- Poi ---
|
||||
|
||||
@Test
|
||||
public void xslxToCsvPoiTest() throws Exception
|
||||
{
|
||||
transform(POI, XSLX, CSV, MIMETYPE_OPENXML_SPREADSHEET, MIMETYPE_TEXT_CSV, null, EXPECTED_CSV_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
// --- OOXML ---
|
||||
|
||||
@Test
|
||||
public void docxToTxtOoXmlTest() throws Exception
|
||||
{
|
||||
transform(POI_OO_XML, DOCX, TXT, MIMETYPE_OPENXML_WORDPROCESSING, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void pptxToTxtOoXmlTest() throws Exception
|
||||
{
|
||||
transform(POI_OO_XML, PPTX, TXT, MIMETYPE_OPENXML_PRESENTATION, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
// --- TikaAuto ---
|
||||
|
||||
@Test
|
||||
public void ppxtToTxtTikaAutoTest() throws Exception
|
||||
{
|
||||
transform(TIKA_AUTO, PPTX, TXT, MIMETYPE_OPENXML_PRESENTATION, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void doctToTxtTikaAutoTest() throws Exception
|
||||
{
|
||||
transform(TIKA_AUTO, DOCX, TXT, MIMETYPE_OPENXML_WORDPROCESSING, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
|
||||
// --- TextMining ---
|
||||
|
||||
@Test
|
||||
public void docToTxtTextMiningTest() throws Exception
|
||||
{
|
||||
transform(TEXT_MINING, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
|
||||
}
|
||||
}
|
@ -0,0 +1,51 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Repository
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2018 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
*
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer;
|
||||
|
||||
import org.junit.runner.RunWith;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.boot.test.context.SpringBootTest.WebEnvironment;
|
||||
import org.springframework.test.context.junit4.SpringRunner;
|
||||
|
||||
/**
|
||||
* Tests TikaController with a server test harness.
|
||||
*/
|
||||
@RunWith(SpringRunner.class)
|
||||
@SpringBootTest(webEnvironment = WebEnvironment.RANDOM_PORT)
|
||||
public class TikaHttpRequestTest extends AbstractHttpRequestTest
|
||||
{
|
||||
@Override
|
||||
protected String getTransformerName()
|
||||
{
|
||||
return "Tika";
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getSourceExtension()
|
||||
{
|
||||
return "pdf";
|
||||
};
|
||||
}
|
BIN
alfresco-docker-tika/src/test/resources/quick.doc
Normal file
BIN
alfresco-docker-tika/src/test/resources/quick.doc
Normal file
Binary file not shown.
BIN
alfresco-docker-tika/src/test/resources/quick.docx
Normal file
BIN
alfresco-docker-tika/src/test/resources/quick.docx
Normal file
Binary file not shown.
BIN
alfresco-docker-tika/src/test/resources/quick.msg
Normal file
BIN
alfresco-docker-tika/src/test/resources/quick.msg
Normal file
Binary file not shown.
BIN
alfresco-docker-tika/src/test/resources/quick.pptx
Normal file
BIN
alfresco-docker-tika/src/test/resources/quick.pptx
Normal file
Binary file not shown.
6
alfresco-docker-tika/src/test/resources/quick.txt
Normal file
6
alfresco-docker-tika/src/test/resources/quick.txt
Normal file
@ -0,0 +1,6 @@
|
||||
|
||||
The quick brown fox jumps over the lazy dog
|
||||
|
||||
|
||||
Blank Page
|
||||
|
BIN
alfresco-docker-tika/src/test/resources/quick.xslx
Normal file
BIN
alfresco-docker-tika/src/test/resources/quick.xslx
Normal file
Binary file not shown.
BIN
alfresco-docker-tika/src/test/resources/quick.zip
Normal file
BIN
alfresco-docker-tika/src/test/resources/quick.zip
Normal file
Binary file not shown.
@ -51,8 +51,7 @@ import java.io.UnsupportedEncodingException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* <p>Abstract Controller, provides structure and helper methods to sub-class transformer controllers.</p>
|
||||
@ -376,6 +375,87 @@ public abstract class AbstractTransformerController
|
||||
}
|
||||
}
|
||||
|
||||
public void callTransform(File sourceFile, File targetFile, String... args) throws TransformException
|
||||
{
|
||||
args = buildArgs(sourceFile, targetFile, args);
|
||||
try
|
||||
{
|
||||
callTransform(args);
|
||||
}
|
||||
catch (IllegalArgumentException e)
|
||||
{
|
||||
throw new TransformException(400, getMessage(e));
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
throw new TransformException(500, getMessage(e));
|
||||
}
|
||||
if (!targetFile.exists() || targetFile.length() == 0)
|
||||
{
|
||||
throw new TransformException(500, "Transformer failed to create an output file");
|
||||
}
|
||||
}
|
||||
|
||||
private String getMessage(Exception e)
|
||||
{
|
||||
return e.getMessage() == null ? e.getClass().getSimpleName(): e.getMessage();
|
||||
}
|
||||
|
||||
protected void callTransform(String[] args)
|
||||
{
|
||||
// Overridden when the transform is done in the JVM rather than in an external command.
|
||||
}
|
||||
|
||||
protected String[] buildArgs(File sourceFile, File targetFile, String[] args)
|
||||
{
|
||||
ArrayList<String> methodArgs = new ArrayList<>(args.length+2);
|
||||
StringJoiner sj = new StringJoiner(" ");
|
||||
for (String arg: args)
|
||||
{
|
||||
addArg(methodArgs, sj, arg);
|
||||
}
|
||||
|
||||
addFileArg(methodArgs, sj, sourceFile);
|
||||
addFileArg(methodArgs, sj, targetFile);
|
||||
|
||||
LogEntry.setOptions(sj.toString());
|
||||
|
||||
return methodArgs.toArray(new String[methodArgs.size()]);
|
||||
}
|
||||
|
||||
private void addArg(ArrayList<String> methodArgs, StringJoiner sj, String arg)
|
||||
{
|
||||
if (arg != null)
|
||||
{
|
||||
sj.add(arg);
|
||||
methodArgs.add(arg);
|
||||
}
|
||||
}
|
||||
|
||||
private void addFileArg(ArrayList<String> methodArgs, StringJoiner sj, File arg)
|
||||
{
|
||||
if (arg != null)
|
||||
{
|
||||
String path = arg.getAbsolutePath();
|
||||
int i = path.lastIndexOf('.');
|
||||
String ext = i == -1 ? "???" : path.substring(i+1);
|
||||
sj.add(ext);
|
||||
methodArgs.add(path);
|
||||
}
|
||||
}
|
||||
|
||||
protected void executeTransformCommand(String options, File sourceFile, File targetFile, Long timeout)
|
||||
{
|
||||
LogEntry.setOptions(options);
|
||||
|
||||
Map<String, String> properties = new HashMap<String, String>(5);
|
||||
properties.put("options", options);
|
||||
properties.put("source", sourceFile.getAbsolutePath());
|
||||
properties.put("target", targetFile.getAbsolutePath());
|
||||
|
||||
executeTransformCommand(properties, targetFile, timeout);
|
||||
}
|
||||
|
||||
public void executeTransformCommand(Map<String, String> properties, File targetFile, Long timeout)
|
||||
{
|
||||
timeout = timeout != null && timeout > 0 ? timeout : 0;
|
||||
|
@ -6,4 +6,5 @@ server.port = 8090
|
||||
logging.level.org.alfresco.transformer.LibreOfficeController=debug
|
||||
logging.level.org.alfresco.transformer.JodConverterSharedInstance=debug
|
||||
logging.level.org.alfresco.transformer.AlfrescoPdfRendererController=debug
|
||||
logging.level.org.alfresco.transformer.ImageMagickController=debug
|
||||
logging.level.org.alfresco.transformer.ImageMagickController=debug
|
||||
logging.level.org.alfresco.transformer.TikaController=debug
|
@ -33,12 +33,10 @@ import org.mockito.stubbing.Answer;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.mock.web.MockMultipartFile;
|
||||
import org.springframework.test.web.servlet.MockMvc;
|
||||
import org.springframework.test.web.servlet.request.MockHttpServletRequestBuilder;
|
||||
import org.springframework.test.web.servlet.request.MockMvcRequestBuilders;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.*;
|
||||
import java.net.URL;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
@ -83,7 +81,9 @@ public abstract class AbstractTransformerControllerTest
|
||||
protected AbstractTransformerController controller;
|
||||
|
||||
// Called by sub class
|
||||
public void mockTransformCommand(AbstractTransformerController controller, String sourceExtension, String targetExtension, String sourceMimetype) throws IOException
|
||||
public void mockTransformCommand(AbstractTransformerController controller, String sourceExtension,
|
||||
String targetExtension, String sourceMimetype,
|
||||
boolean readTargetFileBytes) throws IOException
|
||||
{
|
||||
this.controller = controller;
|
||||
this.sourceExtension = sourceExtension;
|
||||
@ -92,8 +92,8 @@ public abstract class AbstractTransformerControllerTest
|
||||
|
||||
expectedOptions = null;
|
||||
expectedSourceSuffix = null;
|
||||
expectedSourceFileBytes = Files.readAllBytes(getTestFile("quick."+sourceExtension, true).toPath());
|
||||
expectedTargetFileBytes = Files.readAllBytes(getTestFile("quick."+targetExtension, true).toPath());
|
||||
expectedSourceFileBytes = readTestFile(sourceExtension);
|
||||
expectedTargetFileBytes = readTargetFileBytes ? readTestFile(targetExtension) : null;
|
||||
sourceFile = new MockMultipartFile("file", "quick."+sourceExtension, sourceMimetype, expectedSourceFileBytes);
|
||||
|
||||
controller.setTransformCommand(mockTransformCommand);
|
||||
@ -159,6 +159,11 @@ public abstract class AbstractTransformerControllerTest
|
||||
when(mockExecutionResult.getStdOut()).thenReturn("STDOUT");
|
||||
}
|
||||
|
||||
protected byte[] readTestFile(String extension) throws IOException
|
||||
{
|
||||
return Files.readAllBytes(getTestFile("quick."+extension, true).toPath());
|
||||
}
|
||||
|
||||
protected File getTestFile(String testFilename, boolean required) throws IOException
|
||||
{
|
||||
ClassLoader classLoader = getClass().getClassLoader();
|
||||
@ -170,12 +175,26 @@ public abstract class AbstractTransformerControllerTest
|
||||
return testFileUrl == null ? null : new File(testFileUrl.getFile());
|
||||
}
|
||||
|
||||
protected MockHttpServletRequestBuilder mockMvcRequest(String url, MockMultipartFile sourceFile, String... params)
|
||||
{
|
||||
MockHttpServletRequestBuilder builder = MockMvcRequestBuilders.fileUpload("/transform").file(sourceFile);
|
||||
|
||||
if (params.length % 2 != 0)
|
||||
{
|
||||
throw new IllegalArgumentException("each param should have a name and value.");
|
||||
}
|
||||
for (int i=0; i<params.length; i+=2)
|
||||
{
|
||||
builder = builder.param(params[i], params[i+1]);
|
||||
}
|
||||
|
||||
return builder;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void simpleTransformTest() throws Exception
|
||||
{
|
||||
mockMvc.perform(MockMvcRequestBuilders.fileUpload("/transform")
|
||||
.file(sourceFile)
|
||||
.param("targetExtension", targetExtension))
|
||||
mockMvc.perform(mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension))
|
||||
.andExpect(status().is(200))
|
||||
.andExpect(content().bytes(expectedTargetFileBytes))
|
||||
.andExpect(header().string("Content-Disposition", "attachment; filename*= UTF-8''quick."+targetExtension));
|
||||
@ -185,10 +204,7 @@ public abstract class AbstractTransformerControllerTest
|
||||
public void testDelayTest() throws Exception
|
||||
{
|
||||
long start = System.currentTimeMillis();
|
||||
mockMvc.perform(MockMvcRequestBuilders.fileUpload("/transform")
|
||||
.file(sourceFile)
|
||||
.param("targetExtension", targetExtension)
|
||||
.param("testDelay", "400"))
|
||||
mockMvc.perform(mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension, "testDelay", "400"))
|
||||
.andExpect(status().is(200))
|
||||
.andExpect(content().bytes(expectedTargetFileBytes))
|
||||
.andExpect(header().string("Content-Disposition", "attachment; filename*= UTF-8''quick."+targetExtension));
|
||||
@ -201,9 +217,7 @@ public abstract class AbstractTransformerControllerTest
|
||||
@Test
|
||||
public void noTargetFileTest() throws Exception
|
||||
{
|
||||
mockMvc.perform(MockMvcRequestBuilders.fileUpload("/transform")
|
||||
.file(sourceFile)
|
||||
.param("targetExtension", "xxx"))
|
||||
mockMvc.perform(mockMvcRequest("/transform", sourceFile, "targetExtension", "xxx"))
|
||||
.andExpect(status().is(500));
|
||||
}
|
||||
|
||||
@ -212,9 +226,7 @@ public abstract class AbstractTransformerControllerTest
|
||||
{
|
||||
when(mockExecutionResult.getExitValue()).thenReturn(1);
|
||||
|
||||
mockMvc.perform(MockMvcRequestBuilders.fileUpload("/transform")
|
||||
.file(sourceFile)
|
||||
.param("targetExtension", "xxx"))
|
||||
mockMvc.perform(mockMvcRequest("/transform", sourceFile, "targetExtension", "xxx"))
|
||||
.andExpect(status().is(400))
|
||||
.andExpect(status().reason(containsString("Transformer exit code was not 0: \nSTDERR")));
|
||||
}
|
||||
@ -225,9 +237,7 @@ public abstract class AbstractTransformerControllerTest
|
||||
{
|
||||
sourceFile = new MockMultipartFile("file", "../quick."+sourceExtension, sourceMimetype, expectedSourceFileBytes);
|
||||
|
||||
mockMvc.perform(MockMvcRequestBuilders.fileUpload("/transform")
|
||||
.file(sourceFile)
|
||||
.param("targetExtension", targetExtension))
|
||||
mockMvc.perform(mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension))
|
||||
.andExpect(status().is(200))
|
||||
.andExpect(content().bytes(expectedTargetFileBytes))
|
||||
.andExpect(header().string("Content-Disposition", "attachment; filename*= UTF-8''quick."+targetExtension));
|
||||
@ -239,9 +249,7 @@ public abstract class AbstractTransformerControllerTest
|
||||
{
|
||||
sourceFile = new MockMultipartFile("file", "../quick", sourceMimetype, expectedSourceFileBytes);
|
||||
|
||||
mockMvc.perform(MockMvcRequestBuilders.fileUpload("/transform")
|
||||
.file(sourceFile)
|
||||
.param("targetExtension", targetExtension))
|
||||
mockMvc.perform(mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension))
|
||||
.andExpect(status().is(200))
|
||||
.andExpect(content().bytes(expectedTargetFileBytes))
|
||||
.andExpect(header().string("Content-Disposition", "attachment; filename*= UTF-8''quick."+targetExtension));
|
||||
@ -253,9 +261,7 @@ public abstract class AbstractTransformerControllerTest
|
||||
{
|
||||
sourceFile = new MockMultipartFile("file", "abc/", sourceMimetype, expectedSourceFileBytes);
|
||||
|
||||
mockMvc.perform(MockMvcRequestBuilders.fileUpload("/transform")
|
||||
.file(sourceFile)
|
||||
.param("targetExtension", targetExtension))
|
||||
mockMvc.perform(mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension))
|
||||
.andExpect(status().is(400))
|
||||
.andExpect(status().reason(containsString("The source filename was not supplied")));
|
||||
}
|
||||
@ -265,9 +271,7 @@ public abstract class AbstractTransformerControllerTest
|
||||
{
|
||||
sourceFile = new MockMultipartFile("file", "", sourceMimetype, expectedSourceFileBytes);
|
||||
|
||||
mockMvc.perform(MockMvcRequestBuilders.fileUpload("/transform")
|
||||
.file(sourceFile)
|
||||
.param("targetExtension", targetExtension))
|
||||
mockMvc.perform(mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension))
|
||||
.andExpect(status().is(400))
|
||||
.andExpect(status().reason(containsString("The source filename was not supplied")));
|
||||
}
|
||||
@ -275,8 +279,7 @@ public abstract class AbstractTransformerControllerTest
|
||||
@Test
|
||||
public void noTargetExtensionTest() throws Exception
|
||||
{
|
||||
mockMvc.perform(MockMvcRequestBuilders.fileUpload("/transform")
|
||||
.file(sourceFile))
|
||||
mockMvc.perform(mockMvcRequest("/transform", sourceFile))
|
||||
.andExpect(status().is(400))
|
||||
.andExpect(status().reason(containsString("Request parameter targetExtension is missing")));
|
||||
}
|
||||
|
9
pom.xml
9
pom.xml
@ -19,7 +19,8 @@
|
||||
<dependency.pdfbox.version>2.0.8</dependency.pdfbox.version>
|
||||
<dependency.fabric8.version>3.5.37</dependency.fabric8.version>
|
||||
<dependency.spring-boot.version>1.5.12.RELEASE</dependency.spring-boot.version>
|
||||
<dependency.alfresco-core.version>7.2</dependency.alfresco-core.version>
|
||||
<dependency.alfresco-core.version>7.3</dependency.alfresco-core.version>
|
||||
<dependency.alfresco-data-model.version>8.8</dependency.alfresco-data-model.version>
|
||||
<dependency.alfresco-jodconverter-core.version>3.0.1.1</dependency.alfresco-jodconverter-core.version>
|
||||
<dependency.ch-qos-logback.version>1.2.3</dependency.ch-qos-logback.version>
|
||||
<env.project_version>${project.version}</env.project_version>
|
||||
@ -27,6 +28,7 @@
|
||||
|
||||
<modules>
|
||||
<module>alfresco-transformer-base</module>
|
||||
<module>alfresco-docker-tika</module>
|
||||
<module>alfresco-docker-alfresco-pdf-renderer</module>
|
||||
<module>alfresco-docker-imagemagick</module>
|
||||
<module>alfresco-docker-libreoffice</module>
|
||||
@ -68,6 +70,11 @@
|
||||
<artifactId>alfresco-core</artifactId>
|
||||
<version>${dependency.alfresco-core.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.alfresco</groupId>
|
||||
<artifactId>alfresco-data-model</artifactId>
|
||||
<version>${dependency.alfresco-data-model.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.alfresco</groupId>
|
||||
<artifactId>alfresco-jodconverter-core</artifactId>
|
||||
|
39
scripts/testImages.sh
Executable file
39
scripts/testImages.sh
Executable file
@ -0,0 +1,39 @@
|
||||
#!/bin/bash
# For each transform project, check the live probe in each docker image works.
#
# The alfresco-docker-* directories are discovered in the current directory, and
# the image tag to test is read from the `tag` variable, which this script does
# not set itself.
set -e

# Fail early with a clear message rather than handing docker an invalid "repo:" reference.
: "${tag:?the image tag to test must be supplied in the 'tag' variable}"

docker images
echo

transformers=$(ls | grep alfresco-docker- | sed 's/alfresco-docker-\(.*\)/\1/')
for transformer in $transformers
do
  echo
  echo "=== $transformer ==="
  repo=$(docker images | awk '{print $1}' | grep "$transformer" | sort -u)
  echo docker run --rm -d -p 8090:8090 --name "$transformer" "$repo:$tag"
  docker run --rm -d -p 8090:8090 --name "$transformer" "$repo:$tag" >/dev/null

  WAIT_INTERVAL=1
  COUNTER=0
  TIMEOUT=30
  t0=$(date +%s)
  echo -n "Waiting for $transformer to start "
  # Poll the live probe; -ge (not -eq) so the loop still ends if WAIT_INTERVAL
  # is ever changed to a value that steps over TIMEOUT.
  until curl --output /dev/null --silent --fail http://localhost:8090/live || [ "$COUNTER" -ge "$TIMEOUT" ]; do
    printf '.'
    sleep "$WAIT_INTERVAL"
    COUNTER=$((COUNTER + WAIT_INTERVAL))
  done
  t1=$(date +%s)
  delta=$((t1 - t0))

  # Always stop the container before reporting, so a failed start does not leave it running.
  docker stop "$transformer" > /dev/null

  if (( COUNTER < TIMEOUT )); then
    echo " started in $delta seconds"
  else
    echo " did not start after $delta seconds"
    exit 1
  fi
done
echo
|
Loading…
x
Reference in New Issue
Block a user