diff --git a/alfresco-docker-alfresco-pdf-renderer/Dockerfile b/alfresco-docker-alfresco-pdf-renderer/Dockerfile index 2824b46c..c2334389 100644 --- a/alfresco-docker-alfresco-pdf-renderer/Dockerfile +++ b/alfresco-docker-alfresco-pdf-renderer/Dockerfile @@ -3,7 +3,7 @@ # The container is only intended to be used with the Alfresco Enterprise editon which is covered by https://www.alfresco.com/legal/agreements and https://www.alfresco.com/terms-use. # alfresco-pdf-renderer uses the PDFium library from Google Inc. See the license at https://pdfium.googlesource.com/pdfium/+/master/LICENSE or in /pdfium.txt. -FROM quay.io/alfresco/alfresco-base-java:9 +FROM alfresco/alfresco-base-java:8 ENV ALFRESCO_PDF_RENDERER_LIB_RPM_URL=https://nexus.alfresco.com/nexus/service/local/repositories/releases/content/org/alfresco/alfresco-pdf-renderer/1.1/alfresco-pdf-renderer-1.1-linux.tgz ENV PDFIUM_LICENSE_FILE=https://github.com/Alfresco/acs-community-packaging/blob/master/distribution/src/main/resources/licenses/3rd-party/pdfium.txt diff --git a/alfresco-docker-alfresco-pdf-renderer/src/main/java/org/alfresco/transformer/AlfrescoPdfRendererController.java b/alfresco-docker-alfresco-pdf-renderer/src/main/java/org/alfresco/transformer/AlfrescoPdfRendererController.java index 06d8d05f..b2fc1ef1 100644 --- a/alfresco-docker-alfresco-pdf-renderer/src/main/java/org/alfresco/transformer/AlfrescoPdfRendererController.java +++ b/alfresco-docker-alfresco-pdf-renderer/src/main/java/org/alfresco/transformer/AlfrescoPdfRendererController.java @@ -155,16 +155,4 @@ public class AlfrescoPdfRendererController extends AbstractTransformerController return createAttachment(targetFilename, targetFile, testDelay); } - - private void executeTransformCommand(String options, File sourceFile, File targetFile, @RequestParam(value = "timeout", required = false) Long timeout) - { - LogEntry.setOptions(options); - - Map properties = new HashMap(5); - properties.put("options", options); - 
properties.put("source", sourceFile.getAbsolutePath()); - properties.put("target", targetFile.getAbsolutePath()); - - executeTransformCommand(properties, targetFile, timeout); - } } diff --git a/alfresco-docker-alfresco-pdf-renderer/src/test/java/org/alfresco/transformer/AlfrescoPdfRendererControllerTest.java b/alfresco-docker-alfresco-pdf-renderer/src/test/java/org/alfresco/transformer/AlfrescoPdfRendererControllerTest.java index 311b4638..6b2dc12c 100644 --- a/alfresco-docker-alfresco-pdf-renderer/src/test/java/org/alfresco/transformer/AlfrescoPdfRendererControllerTest.java +++ b/alfresco-docker-alfresco-pdf-renderer/src/test/java/org/alfresco/transformer/AlfrescoPdfRendererControllerTest.java @@ -51,7 +51,7 @@ public class AlfrescoPdfRendererControllerTest extends AbstractTransformerContro @Before public void before() throws IOException { - super.mockTransformCommand(controller, "pdf", "png", "application/pdf"); + super.mockTransformCommand(controller, "pdf", "png", "application/pdf", true); } @Test diff --git a/alfresco-docker-imagemagick/Dockerfile b/alfresco-docker-imagemagick/Dockerfile index 64e644b1..db121c6e 100644 --- a/alfresco-docker-imagemagick/Dockerfile +++ b/alfresco-docker-imagemagick/Dockerfile @@ -3,7 +3,7 @@ # The container is only intended to be used with the Alfresco Enterprise editon which is covered by https://www.alfresco.com/legal/agreements and https://www.alfresco.com/terms-use. # ImageMagick is from ImageMagick Studio LLC. See the license at http://www.imagemagick.org/script/license.php or in /ImageMagick-license.txt. 
-FROM quay.io/alfresco/alfresco-base-java:9 +FROM alfresco/alfresco-base-java:8 ENV IMAGEMAGICK_RPM_URL=https://nexus.alfresco.com/nexus/service/local/repositories/thirdparty/content/org/imagemagick/imagemagick-distribution/7.0.7-27/imagemagick-distribution-7.0.7-27-linux.rpm ENV IMAGEMAGICK_LIB_RPM_URL=https://nexus.alfresco.com/nexus/service/local/repositories/thirdparty/content/org/imagemagick/imagemagick-distribution/7.0.7-27/imagemagick-distribution-7.0.7-27-libs-linux.rpm diff --git a/alfresco-docker-imagemagick/src/test/java/org/alfresco/transformer/ImageMagickControllerTest.java b/alfresco-docker-imagemagick/src/test/java/org/alfresco/transformer/ImageMagickControllerTest.java index 2d8a6029..5cf5025a 100644 --- a/alfresco-docker-imagemagick/src/test/java/org/alfresco/transformer/ImageMagickControllerTest.java +++ b/alfresco-docker-imagemagick/src/test/java/org/alfresco/transformer/ImageMagickControllerTest.java @@ -51,7 +51,7 @@ public class ImageMagickControllerTest extends AbstractTransformerControllerTest @Before public void before() throws IOException { - super.mockTransformCommand(controller, "jpg", "png", "image/jpg"); + super.mockTransformCommand(controller, "jpg", "png", "image/jpg", true); } @Test diff --git a/alfresco-docker-libreoffice/Dockerfile b/alfresco-docker-libreoffice/Dockerfile index 4a9d264b..80aa75f7 100644 --- a/alfresco-docker-libreoffice/Dockerfile +++ b/alfresco-docker-libreoffice/Dockerfile @@ -3,7 +3,7 @@ # The container is only intended to be used with the Alfresco Enterprise editon which is covered by https://www.alfresco.com/legal/agreements and https://www.alfresco.com/terms-use. # LibreOffice is from The Document Foundation. See the license at https://www.libreoffice.org/download/license/ or in /libreoffice.txt. 
-FROM quay.io/alfresco/alfresco-base-java:9 +FROM alfresco/alfresco-base-java:8 ENV LIBREOFFICE_RPM_URL=https://nexus.alfresco.com/nexus/service/local/repositories/thirdparty/content/org/libreoffice/libreoffice-dist/5.4.6/libreoffice-dist-5.4.6-linux.gz ENV LIBREOFFICE_LICENSE_FILE=https://github.com/Alfresco/acs-community-packaging/blob/master/distribution/src/main/resources/licenses/3rd-party/libreoffice.txt diff --git a/alfresco-docker-tika/.maven-dockerignore b/alfresco-docker-tika/.maven-dockerignore new file mode 100644 index 00000000..112bd182 --- /dev/null +++ b/alfresco-docker-tika/.maven-dockerignore @@ -0,0 +1 @@ +target/docker/ \ No newline at end of file diff --git a/alfresco-docker-tika/Dockerfile b/alfresco-docker-tika/Dockerfile new file mode 100644 index 00000000..fb20d99b --- /dev/null +++ b/alfresco-docker-tika/Dockerfile @@ -0,0 +1,20 @@ +# Image provides a container in which to run Tika transformations for Alfresco Enterprise Content Services. + +# The container is only intended to be used with the Alfresco Enterprise edition which is covered by https://www.alfresco.com/legal/agreements and https://www.alfresco.com/terms-use. +# Tika is from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0. 
+ +FROM alfresco/alfresco-base-java:8 + +ENV APACHE_LICENSE_FILE=https://github.com/Alfresco/acs-community-packaging/blob/master/distribution/src/main/resources/licenses/3rd-party/Apache%202.0.txt + +COPY target/alfresco-docker-tika-${env.project_version}.jar /usr/bin + +RUN ln /usr/bin/alfresco-docker-tika-${env.project_version}.jar /usr/bin/alfresco-docker-tika.jar && \ + yum install -y wget && \ + wget $APACHE_LICENSE_FILE && \ + yum remove -y wget && \ + yum clean all + +EXPOSE 8090 + +ENTRYPOINT java -jar /usr/bin/alfresco-docker-tika.jar diff --git a/alfresco-docker-tika/LICENSES.md b/alfresco-docker-tika/LICENSES.md new file mode 100644 index 00000000..dfdb7c47 --- /dev/null +++ b/alfresco-docker-tika/LICENSES.md @@ -0,0 +1,7 @@ +### Licenses + +* The code in the alfresco-docker-tika project is only intended to be used with the Alfresco Enterprise + Content Repository which is covered by [https://www.alfresco.com/legal/agreements](https://www.alfresco.com/legal/agreements) and [https://www.alfresco.com/terms-use](https://www.alfresco.com/terms-use) +* Tika is from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0 or the +[Apache 2.0.txt](https://github.com/Alfresco/acs-community-packaging/blob/master/distribution/src/main/resources/licenses/3rd-party/Apache%202.0.txt) +file placed in the root directory of the docker image. 
diff --git a/alfresco-docker-tika/pom.xml b/alfresco-docker-tika/pom.xml new file mode 100644 index 00000000..96b4c715 --- /dev/null +++ b/alfresco-docker-tika/pom.xml @@ -0,0 +1,268 @@ + + 4.0.0 + alfresco-docker-tika + Alfresco Docker Tika + jar + + + org.alfresco + alfresco-docker-transformers + 1.2-SNAPSHOT + ../pom.xml + + + + alfresco/alfresco-tika + quay.io + 3.17 + + + + + org.alfresco + alfresco-transformer-base + ${project.version} + + + org.alfresco + alfresco-transformer-base + ${project.version} + tests + test-jar + test + + + org.springframework.boot + spring-boot-starter-thymeleaf + + + org.springframework.boot + spring-boot-starter-test + test + + + org.alfresco + alfresco-core + + + org.alfresco + alfresco-data-model + + + + org.slf4j + slf4j-api + 1.7.24 + + + org.slf4j + slf4j-log4j12 + 1.7.25 + + + + + org.apache.tika + tika-core + 1.17-20180201-alfresco-patched + + + org.apache.tika + tika-parsers + 1.17-20180201-alfresco-patched + + + com.tdunning + json + + + + + + + org.apache.poi + poi + ${dependency.poi.version} + + + org.apache.poi + poi-ooxml + ${dependency.poi.version} + + + org.apache.poi + poi-scratchpad + ${dependency.poi.version} + + + + + + + + org.springframework.boot + spring-boot-maven-plugin + ${dependency.spring-boot.version} + + + + repackage + + + + + + io.fabric8 + fabric8-maven-plugin + ${dependency.fabric8.version} + + + + ${image.name}:${image.tag} + + ${project.basedir}/ + + + + + + + + + + + + enterpriseDocker + + + + io.fabric8 + fabric8-maven-plugin + ${dependency.fabric8.version} + + + build-image + install + + build + + + + + + + + + + internal + + + + io.fabric8 + fabric8-maven-plugin + ${dependency.fabric8.version} + + + + ${image.registry}/${image.name}:${image.tag} + + ${project.basedir}/ + + + + + + + build-push-image + install + + build + push + + + + + + + + + + master + + + + io.fabric8 + fabric8-maven-plugin + ${dependency.fabric8.version} + + + + ${image.registry}/${image.name} + + ${project.basedir}/ + + 
+ + ${image.name} + + ${project.basedir}/ + + + + + + + build-push-image + install + + build + push + + + + + + + + + + release + + + + io.fabric8 + fabric8-maven-plugin + ${dependency.fabric8.version} + + + + ${image.name}:${project.version} + ${image.registry} + + ${project.basedir}/ + + + + ${image.name}:${project.version} + + ${project.basedir}/ + + + + + + + build-push-image + deploy + + build + push + + + + + + + + + diff --git a/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Application.java b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Application.java new file mode 100644 index 00000000..22cbefc4 --- /dev/null +++ b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Application.java @@ -0,0 +1,27 @@ +/* + * #%L + * Alfresco Enterprise Repository + * %% + * Copyright (C) 2005 - 2018 Alfresco Software Limited + * %% + * License rights for this program may be obtained from Alfresco Software, Ltd. + * pursuant to a written agreement and any use of this program without such an + * agreement is prohibited. 
+ * #L% + */ +package org.alfresco.transformer; + +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; +import org.springframework.boot.autoconfigure.EnableAutoConfiguration; +import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration; + +@SpringBootApplication +@EnableAutoConfiguration(exclude={DataSourceAutoConfiguration.class}) +public class Application +{ + public static void main(String[] args) + { + SpringApplication.run(Application.class, args); + } +} diff --git a/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Tika.java b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Tika.java new file mode 100644 index 00000000..a7091002 --- /dev/null +++ b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/Tika.java @@ -0,0 +1,801 @@ +/* + * #%L + * Alfresco Enterprise Repository + * %% + * Copyright (C) 2005 - 2018 Alfresco Software Limited + * %% + * License rights for this program may be obtained from Alfresco Software, Ltd. + * pursuant to a written agreement and any use of this program without such an + * agreement is prohibited. 
+ * #L% + */ +package org.alfresco.transformer; + +import org.apache.tika.config.TikaConfig; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.DocumentSelector; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.EmptyParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.microsoft.OfficeParser; +import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; +import org.apache.tika.parser.pdf.PDFParser; +import org.apache.tika.parser.pkg.PackageParser; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.ExpandedTitleContentHandler; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import javax.xml.transform.OutputKeys; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.sax.SAXTransformerFactory; +import javax.xml.transform.sax.TransformerHandler; +import javax.xml.transform.stream.StreamResult; +import java.io.*; +import java.net.URL; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; + +import static org.alfresco.repo.content.MimetypeMap.*; + +/** + * Stripped down command line Tika transformers. Not actually run as a separate process, but the code fits the pattern + * used by transformers that do. + *
+ *
+ *           Archive 0 ms
+ *               1) cpio html [100] unlimited
+ *               2) cpio txt   [50] unlimited
+ *               3) cpio xhtml [100] unlimited
+ *               4) cpio xml  [100] unlimited
+ *               5) jar  html [100] unlimited
+ *               6) jar  txt   [50] unlimited
+ *               7) jar  xhtml [100] unlimited
+ *               8) jar  xml  [100] unlimited
+ *               9) tar  html [100] unlimited
+ *              10) tar  txt   [50] unlimited
+ *              11) tar  xhtml [100] unlimited
+ *              12) tar  xml  [100] unlimited
+ *              13) zip  html [100] unlimited
+ *              14) zip  txt   [50] unlimited
+ *              15) zip  xhtml [100] unlimited
+ *              16) zip  xml  [100] unlimited
+ *           PdfBox 0 ms
+ *               1) pdf  html [110] unlimited
+ *               2) pdf  txt   [50] 25 MB
+ *               3) pdf  xhtml [110] unlimited
+ *               4) pdf  xml  [110] unlimited
+ *           OutlookMsg 0 ms
+ *               1) msg  html [125] unlimited
+ *               2) msg  txt  [125] unlimited
+ *               3) msg  xhtml [125] unlimited
+ *               4) msg  xml  [125] unlimited
+ *           PdfBox 0 ms
+ *               1) pdf  html [110] unlimited
+ *               2) pdf  txt   [50] 25 MB
+ *               3) pdf  xhtml [110] unlimited
+ *               4) pdf  xml  [110] unlimited
+ *           Office 0 ms
+ *               1) doc  html [130] unlimited
+ *               2) doc  txt  [130] unlimited
+ *               3) doc  xhtml [130] unlimited
+ *               4) doc  xml  [130] unlimited
+ *               5) mpp  html [130] unlimited
+ *               6) mpp  txt  [130] unlimited
+ *               7) mpp  xhtml [130] unlimited
+ *               8) mpp  xml  [130] unlimited
+ *               9) msg  html [130] unlimited
+ *              10) msg  txt  [130] unlimited
+ *              11) msg  xhtml [130] unlimited
+ *              12) msg  xml  [130] unlimited
+ *              13) ppt  html [130] unlimited
+ *              14) ppt  txt  [130] unlimited
+ *              15) ppt  xhtml [130] unlimited
+ *              16) ppt  xml  [130] unlimited
+ *              17) vsd  html [130] unlimited
+ *              18) vsd  txt  [130] unlimited
+ *              19) vsd  xhtml [130] unlimited
+ *              20) vsd  xml  [130] unlimited
+ *           Poi 0 ms
+ *               1) xls  csv  [130] unlimited
+ *               2) xls  html [130] unlimited
+ *               3) xls  txt  [130] unlimited
+ *               4) xls  xhtml [130] unlimited
+ *               5) xls  xml  [130] unlimited
+ *               6) xlsx csv  [130] unlimited
+ *               7) xlsx html [130] unlimited
+ *               8) xlsx txt  [130] unlimited
+ *               9) xlsx xhtml [130] unlimited
+ *              10) xlsx xml  [130] unlimited
+ *           OOXML 0 ms
+ *               1) docm html [130] unlimited
+ *               2) docm txt  [130] unlimited
+ *               3) docm xhtml [130] unlimited
+ *               4) docm xml  [130] unlimited
+ *               5) docx html [130] unlimited
+ *               6) docx txt  [130] unlimited
+ *               7) docx xhtml [130] unlimited
+ *               8) docx xml  [130] unlimited
+ *               9) dotm html [130] unlimited
+ *              10) dotm txt  [130] unlimited
+ *              11) dotm xhtml [130] unlimited
+ *              12) dotm xml  [130] unlimited
+ *              13) dotx html [130] unlimited
+ *              14) dotx txt  [130] unlimited
+ *              15) dotx xhtml [130] unlimited
+ *              16) dotx xml  [130] unlimited
+ *              17) potm html [130] unlimited
+ *              18) potm txt  [130] unlimited
+ *              19) potm xhtml [130] unlimited
+ *              20) potm xml  [130] unlimited
+ *              21) potx html [130] unlimited
+ *              22) potx txt  [130] unlimited
+ *              23) potx xhtml [130] unlimited
+ *              24) potx xml  [130] unlimited
+ *              25) ppam html [130] unlimited
+ *              26) ppam txt  [130] unlimited
+ *              27) ppam xhtml [130] unlimited
+ *              28) ppam xml  [130] unlimited
+ *              29) ppsm html [130] unlimited
+ *              30) ppsm txt  [130] unlimited
+ *              31) ppsm xhtml [130] unlimited
+ *              32) ppsm xml  [130] unlimited
+ *              33) ppsx html [130] unlimited
+ *              34) ppsx txt  [130] unlimited
+ *              35) ppsx xhtml [130] unlimited
+ *              36) ppsx xml  [130] unlimited
+ *              37) pptm html [130] unlimited
+ *              38) pptm txt  [130] unlimited
+ *              39) pptm xhtml [130] unlimited
+ *              40) pptm xml  [130] unlimited
+ *              41) pptx html [130] unlimited
+ *              42) pptx txt  [130] unlimited
+ *              43) pptx xhtml [130] unlimited
+ *              44) pptx xml  [130] unlimited
+ *              45) sldm html [130] unlimited
+ *              46) sldm txt  [130] unlimited
+ *              47) sldm xhtml [130] unlimited
+ *              48) sldm xml  [130] unlimited
+ *              49) sldx html [130] unlimited
+ *              50) sldx txt  [130] unlimited
+ *              51) sldx xhtml [130] unlimited
+ *              52) sldx xml  [130] unlimited
+ *              53) xlam html [130] unlimited
+ *              54) xlam txt  [130] unlimited
+ *              55) xlam xhtml [130] unlimited
+ *              56) xlam xml  [130] unlimited
+ *              57) xlsb html [130] unlimited
+ *              58) xlsb txt  [130] unlimited
+ *              59) xlsb xhtml [130] unlimited
+ *              60) xlsb xml  [130] unlimited
+ *              61) xlsm html [130] unlimited
+ *              62) xlsm txt  [130] unlimited
+ *              63) xlsm xhtml [130] unlimited
+ *              64) xlsm xml  [130] unlimited
+ *              65) xlsx html [130] unlimited
+ *              66) xlsx txt  [130] unlimited
+ *              67) xlsx xhtml [130] unlimited
+ *              68) xlsx xml  [130] unlimited
+ *              69) xltm html [130] unlimited
+ *              70) xltm txt  [130] unlimited
+ *              71) xltm xhtml [130] unlimited
+ *              72) xltm xml  [130] unlimited
+ *              73) xltx html [130] unlimited
+ *              74) xltx txt  [130] unlimited
+ *              75) xltx xhtml [130] unlimited
+ *              76) xltx xml  [130] unlimited
+ *           TikaAuto 0 ms
+ *               1) cdf  html [120] unlimited
+ *               2) cdf  txt  [120] unlimited
+ *               3) cdf  xhtml [120] unlimited
+ *               4) cdf  xml  [120] unlimited
+ *               5) cpio html [120] unlimited
+ *               6) cpio txt  [120] unlimited
+ *               7) cpio xhtml [120] unlimited
+ *               8) cpio xml  [120] unlimited
+ *               9) doc  html [120] unlimited
+ *              10) doc  txt  [120] unlimited
+ *              11) doc  xhtml [120] unlimited
+ *              12) doc  xml  [120] unlimited
+ *              13) docm html [120] unlimited
+ *              14) docm txt  [120] unlimited
+ *              15) docm xhtml [120] unlimited
+ *              16) docm xml  [120] unlimited
+ *              17) docx html [120] unlimited
+ *              18) docx txt  [120] unlimited
+ *              19) docx xhtml [120] unlimited
+ *              20) docx xml  [120] unlimited
+ *              21) dotm html [120] unlimited
+ *              22) dotm txt  [120] unlimited
+ *              23) dotm xhtml [120] unlimited
+ *              24) dotm xml  [120] unlimited
+ *              25) dotx html [120] unlimited
+ *              26) dotx txt  [120] unlimited
+ *              27) dotx xhtml [120] unlimited
+ *              28) dotx xml  [120] unlimited
+ *              29) gzip html [120] unlimited
+ *              30) gzip txt  [120] unlimited
+ *              31) gzip xhtml [120] unlimited
+ *              32) gzip xml  [120] unlimited
+ *              33) hdf  html [120] unlimited
+ *              34) hdf  txt  [120] unlimited
+ *              35) hdf  xhtml [120] unlimited
+ *              36) hdf  xml  [120] unlimited
+ *              37) html html [120] unlimited
+ *              38) html txt  [120] unlimited
+ *              39) html xhtml [120] unlimited
+ *              40) html xml  [120] unlimited
+ *              41) jar  html [120] unlimited
+ *              42) jar  txt  [120] unlimited
+ *              43) jar  xhtml [120] unlimited
+ *              44) jar  xml  [120] unlimited
+ *              45) java html [120] unlimited
+ *              46) java txt  [120] unlimited
+ *              47) java xhtml [120] unlimited
+ *              48) java xml  [120] unlimited
+ *              49) key  html [120] unlimited
+ *              50) key  txt  [120] unlimited
+ *              51) key  xhtml [120] unlimited
+ *              52) key  xml  [120] unlimited
+ *              53) mpp  html [120] unlimited
+ *              54) mpp  txt  [120] unlimited
+ *              55) mpp  xhtml [120] unlimited
+ *              56) mpp  xml  [120] unlimited
+ *              57) numbers html [120] unlimited
+ *              58) numbers txt  [120] unlimited
+ *              59) numbers xhtml [120] unlimited
+ *              60) numbers xml  [120] unlimited
+ *              61) odc  html [120] unlimited
+ *              62) odc  txt  [120] unlimited
+ *              63) odc  xhtml [120] unlimited
+ *              64) odc  xml  [120] unlimited
+ *              65) odi  html [120] unlimited
+ *              66) odi  txt  [120] unlimited
+ *              67) odi  xhtml [120] unlimited
+ *              68) odi  xml  [120] unlimited
+ *              69) odm  html [120] unlimited
+ *              70) odm  txt  [120] unlimited
+ *              71) odm  xhtml [120] unlimited
+ *              72) odm  xml  [120] unlimited
+ *              73) odp  html [120] unlimited
+ *              74) odp  txt  [120] unlimited
+ *              75) odp  xhtml [120] unlimited
+ *              76) odp  xml  [120] unlimited
+ *              77) ods  html [120] unlimited
+ *              78) ods  txt  [120] unlimited
+ *              79) ods  xhtml [120] unlimited
+ *              80) ods  xml  [120] unlimited
+ *              81) odt  html [120] unlimited
+ *              82) odt  txt  [120] unlimited
+ *              83) odt  xhtml [120] unlimited
+ *              84) odt  xml  [120] unlimited
+ *              85) ogx  html [120] unlimited
+ *              86) ogx  txt  [120] unlimited
+ *              87) ogx  xhtml [120] unlimited
+ *              88) ogx  xml  [120] unlimited
+ *              89) oth  html [120] unlimited
+ *              90) oth  txt  [120] unlimited
+ *              91) oth  xhtml [120] unlimited
+ *              92) oth  xml  [120] unlimited
+ *              93) otp  html [120] unlimited
+ *              94) otp  txt  [120] unlimited
+ *              95) otp  xhtml [120] unlimited
+ *              96) otp  xml  [120] unlimited
+ *              97) ots  html [120] unlimited
+ *              98) ots  txt  [120] unlimited
+ *              99) ots  xhtml [120] unlimited
+ *             100) ots  xml  [120] unlimited
+ *             101) ott  html [120] unlimited
+ *             102) ott  txt  [120] unlimited
+ *             103) ott  xhtml [120] unlimited
+ *             104) ott  xml  [120] unlimited
+ *             105) pages html [120] unlimited
+ *             106) pages txt  [120] unlimited
+ *             107) pages xhtml [120] unlimited
+ *             108) pages xml  [120] unlimited
+ *             109) pdf  html [120] unlimited
+ *             110) pdf  txt  [120] 25 MB
+ *             111) pdf  xhtml [120] unlimited
+ *             112) pdf  xml  [120] unlimited
+ *             113) potm html [120] unlimited
+ *             114) potm txt  [120] unlimited
+ *             115) potm xhtml [120] unlimited
+ *             116) potm xml  [120] unlimited
+ *             117) potx html [120] unlimited
+ *             118) potx txt  [120] unlimited
+ *             119) potx xhtml [120] unlimited
+ *             120) potx xml  [120] unlimited
+ *             121) ppam html [120] unlimited
+ *             122) ppam txt  [120] unlimited
+ *             123) ppam xhtml [120] unlimited
+ *             124) ppam xml  [120] unlimited
+ *             125) ppsm html [120] unlimited
+ *             126) ppsm txt  [120] unlimited
+ *             127) ppsm xhtml [120] unlimited
+ *             128) ppsm xml  [120] unlimited
+ *             129) ppsx html [120] unlimited
+ *             130) ppsx txt  [120] unlimited
+ *             131) ppsx xhtml [120] unlimited
+ *             132) ppsx xml  [120] unlimited
+ *             133) ppt  html [120] unlimited
+ *             134) ppt  txt  [120] unlimited
+ *             135) ppt  xhtml [120] unlimited
+ *             136) ppt  xml  [120] unlimited
+ *             137) pptm html [120] unlimited
+ *             138) pptm txt  [120] unlimited
+ *             139) pptm xhtml [120] unlimited
+ *             140) pptm xml  [120] unlimited
+ *             141) pptx html [120] unlimited
+ *             142) pptx txt  [120] unlimited
+ *             143) pptx xhtml [120] unlimited
+ *             144) pptx xml  [120] unlimited
+ *             145) rar  html [120] unlimited
+ *             146) rar  txt  [120] unlimited
+ *             147) rar  xhtml [120] unlimited
+ *             148) rar  xml  [120] unlimited
+ *             149) rss  html [120] unlimited
+ *             150) rss  txt  [120] unlimited
+ *             151) rss  xhtml [120] unlimited
+ *             152) rss  xml  [120] unlimited
+ *             153) rtf  html [120] unlimited
+ *             154) rtf  txt  [120] unlimited
+ *             155) rtf  xhtml [120] unlimited
+ *             156) rtf  xml  [120] unlimited
+ *             157) sldm html [120] unlimited
+ *             158) sldm txt  [120] unlimited
+ *             159) sldm xhtml [120] unlimited
+ *             160) sldm xml  [120] unlimited
+ *             161) sldx html [120] unlimited
+ *             162) sldx txt  [120] unlimited
+ *             163) sldx xhtml [120] unlimited
+ *             164) sldx xml  [120] unlimited
+ *             165) sxw  html [120] unlimited
+ *             166) sxw  txt  [120] unlimited
+ *             167) sxw  xhtml [120] unlimited
+ *             168) sxw  xml  [120] unlimited
+ *             169) txt  html [120] unlimited
+ *             170) txt  txt  [120] unlimited
+ *             171) txt  xhtml [120] unlimited
+ *             172) txt  xml  [120] unlimited
+ *             173) vsd  html [120] unlimited
+ *             174) vsd  txt  [120] unlimited
+ *             175) vsd  xhtml [120] unlimited
+ *             176) vsd  xml  [120] unlimited
+ *             177) xhtml html [120] unlimited
+ *             178) xhtml txt  [120] unlimited
+ *             179) xhtml xhtml [120] unlimited
+ *             180) xhtml xml  [120] unlimited
+ *             181) xlam html [120] unlimited
+ *             182) xlam txt  [120] unlimited
+ *             183) xlam xhtml [120] unlimited
+ *             184) xlam xml  [120] unlimited
+ *             185) xls  html [120] unlimited
+ *             186) xls  txt  [120] unlimited
+ *             187) xls  xhtml [120] unlimited
+ *             188) xls  xml  [120] unlimited
+ *             189) xlsb html [120] unlimited
+ *             190) xlsb txt  [120] unlimited
+ *             191) xlsb xhtml [120] unlimited
+ *             192) xlsb xml  [120] unlimited
+ *             193) xlsm html [120] unlimited
+ *             194) xlsm txt  [120] unlimited
+ *             195) xlsm xhtml [120] unlimited
+ *             196) xlsm xml  [120] unlimited
+ *             197) xlsx html [120] unlimited
+ *             198) xlsx txt  [120] unlimited
+ *             199) xlsx xhtml [120] unlimited
+ *             200) xlsx xml  [120] unlimited
+ *             201) xltm html [120] unlimited
+ *             202) xltm txt  [120] unlimited
+ *             203) xltm xhtml [120] unlimited
+ *             204) xltm xml  [120] unlimited
+ *             205) xltx html [120] unlimited
+ *             206) xltx txt  [120] unlimited
+ *             207) xltx xhtml [120] unlimited
+ *             208) xltx xml  [120] unlimited
+ *             209) xml  html [120] unlimited
+ *             210) xml  txt  [120] unlimited
+ *             211) xml  xhtml [120] unlimited
+ *             212) xml  xml  [120] unlimited
+ *             213) z    html [120] unlimited
+ *             214) z    txt  [120] unlimited
+ *             215) z    xhtml [120] unlimited
+ *             216) z    xml  [120] unlimited
+ *           TextMining 0 ms
+ *               1) doc  html [130] unlimited
+ *               2) doc  txt   [50] unlimited
+ *               3) doc  xhtml [130] unlimited
+ *               4) doc  xml  [130] unlimited
+ * 
+ */
+public class Tika
+{
+    // Transform names: the first positional argument passed to transform(String[]).
+    public static final String ARCHIVE = "Archive";
+    public static final String OUTLOOK_MSG = "OutlookMsg";
+    public static final String PDF_BOX = "PdfBox";
+    public static final String POI_OFFICE = "Office";
+    public static final String POI = "Poi";
+    public static final String POI_OO_XML = "OOXML";
+    public static final String TIKA_AUTO = "TikaAuto";
+    public static final String TEXT_MINING = "TextMining";
+
+    /** Every transform name accepted by {@link #transform(String[])}. */
+    public static final List<String> TRANSFORM_NAMES = Arrays.asList(
+            ARCHIVE, OUTLOOK_MSG, PDF_BOX, POI_OFFICE, POI, POI_OO_XML, TIKA_AUTO, TEXT_MINING);
+
+    // Option prefixes recognised by transform(String[]).
+    public static final String TARGET_MIMETYPE = "--targetMimetype=";
+    public static final String TARGET_ENCODING = "--targetEncoding=";
+    public static final String INCLUDE_CONTENTS = "--includeContents";
+
+    // File extensions.
+    public static final String CSV = "csv";
+    public static final String DOC = "doc";
+    public static final String DOCX = "docx";
+    public static final String HTML = "html";
+    public static final String MSG = "msg";
+    public static final String PDF = "pdf";
+    public static final String PPTX = "pptx";
+    public static final String TXT = "txt";
+    public static final String XHTML = "xhtml";
+    // NOTE(review): "xslx" looks like a typo for "xlsx", but the test resource quick.xslx uses the
+    // same spelling, so the value is left unchanged.
+    public static final String XSLX = "xslx";
+    public static final String XML = "xml";
+    public static final String ZIP = "zip";
+
+    private final Parser packageParser = new PackageParser();
+    private final Parser pdfParser = new PDFParser();
+    private final Parser officeParser = new OfficeParser();
+    private final Parser autoDetectParser;
+    private final Parser ooXmlParser = new OOXMLParser();
+    private final Parser tikaOfficeDetectParser = new TikaOfficeDetectParser();
+
+    /** Used with the PdfBox transform: rejects embedded JPEG, TIFF and PNG documents, selects everything else. */
+    private final DocumentSelector pdfBoxEmbededDocumentSelector = new DocumentSelector()
+    {
+        private final List<String> disabledMediaTypes =
+                Arrays.asList(MIMETYPE_IMAGE_JPEG, MIMETYPE_IMAGE_TIFF, MIMETYPE_IMAGE_PNG);
+
+        @Override
+        public boolean select(Metadata metadata)
+        {
+            String contentType = metadata.get(Metadata.CONTENT_TYPE);
+            if (contentType == null || contentType.isEmpty())
+            {
+                // No content type to judge by, so the document is selected.
+                return true;
+            }
+            return !disabledMediaTypes.contains(contentType);
+        }
+    };
+
+    /**
+     * Builds the auto-detect parser from the tika-config.xml resource found on the class path.
+     */
+    public Tika() throws TikaException, IOException, SAXException
+    {
+        ClassLoader classLoader = getClass().getClassLoader();
+        URL tikaConfigXml = classLoader.getResource("tika-config.xml");
+        TikaConfig tikaConfig = new TikaConfig(tikaConfigXml);
+        autoDetectParser = new AutoDetectParser(tikaConfig);
+    }
+
+    // Method included for developer testing
+    public static void main(String[] args)
+    {
+        long start = System.currentTimeMillis();
+        try
+        {
+            new Tika().transform(args);
+        }
+        catch (IllegalArgumentException e)
+        {
+            System.err.println("ERROR "+e.getMessage());
+            System.exit(-1);
+        }
+        catch (IllegalStateException | TikaException | IOException | SAXException e)
+        {
+            System.err.println("ERROR "+e.getMessage());
+            e.printStackTrace();
+            System.exit(-2);
+        }
+        System.out.println("Finished in "+(System.currentTimeMillis()-start)+"ms");
+    }
+
+    /**
+     * Extracts parameters from args and runs the transform.
+     *
+     * Positional arguments, in order: transform name, source filename, target filename.
+     * Options: {@code --includeContents}, {@code --targetMimetype=<value>}, {@code --targetEncoding=<value>}.
+     *
+     * @throws IllegalArgumentException if an argument is unexpected, duplicated, missing or malformed,
+     *         or the transform name or target mimetype is not recognised
+     * @throws IllegalStateException if reading, parsing or writing fails
+     */
+    public void transform(String[] args)
+    {
+        String transform = null;
+        String targetMimetype = null;
+        String targetEncoding = null;
+        String sourceFilename = null;
+        String targetFilename = null;
+        Boolean includeContents = null;
+
+        for (String arg: args)
+        {
+            if (arg.startsWith("--"))
+            {
+                // The argument must start with the option name, not the other way round: the
+                // reversed test accepted any prefix of the option (e.g. "--") and then failed
+                // with a StringIndexOutOfBoundsException inside getValue.
+                if (arg.startsWith(INCLUDE_CONTENTS))
+                {
+                    getValue(arg, false, includeContents, INCLUDE_CONTENTS);
+                    includeContents = true;
+                }
+                else if (arg.startsWith(TARGET_ENCODING))
+                {
+                    targetEncoding = getValue(arg, true, targetEncoding, TARGET_ENCODING);
+                }
+                else if (arg.startsWith(TARGET_MIMETYPE))
+                {
+                    targetMimetype = getValue(arg, true, targetMimetype, TARGET_MIMETYPE);
+                }
+                else
+                {
+                    throw new IllegalArgumentException("Unexpected argument "+arg);
+                }
+            }
+            else
+            {
+                if (transform == null)
+                {
+                    transform = arg;
+                }
+                else if (sourceFilename == null)
+                {
+                    sourceFilename = arg;
+                }
+                else if (targetFilename == null)
+                {
+                    targetFilename = arg;
+                }
+                else
+                {
+                    throw new IllegalArgumentException("Unexpected argument "+arg);
+                }
+            }
+        }
+        // The positional arguments are filled in order, so a target filename implies the other two.
+        if (targetFilename == null)
+        {
+            throw new IllegalArgumentException("Missing arguments");
+        }
+        includeContents = includeContents == null ? false : includeContents;
+
+        transform(transform, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding);
+    }
+
+    /**
+     * Returns the text that follows optionName in arg, after checking the option has not already
+     * been supplied and that a value is present exactly when one is expected.
+     */
+    private String getValue(String arg, boolean valueExpected, Object value, String optionName)
+    {
+        if (value != null)
+        {
+            throw new IllegalArgumentException("Duplicate "+optionName);
+        }
+        String stringValue = arg.substring(optionName.length()).trim();
+        if (!valueExpected && stringValue.length() > 0)
+        {
+            throw new IllegalArgumentException("Unexpected value with "+optionName);
+        }
+        if (valueExpected && stringValue.length() == 0)
+        {
+            throw new IllegalArgumentException("Expected value with "+optionName);
+        }
+        return stringValue;
+    }
+
+    // Adds transform specific values such as parser and documentSelector.
+    private void transform(String transform, Boolean includeContents,
+                           String sourceFilename,
+                           String targetFilename, String targetMimetype, String targetEncoding)
+    {
+        Parser parser;
+        DocumentSelector documentSelector = null;
+
+        switch(transform)
+        {
+            case ARCHIVE:
+                parser = packageParser;
+                break;
+            case OUTLOOK_MSG:
+            case POI_OFFICE:
+            case TEXT_MINING:
+                parser = officeParser;
+                break;
+            case PDF_BOX:
+                parser = pdfParser;
+                documentSelector = pdfBoxEmbededDocumentSelector;
+                break;
+            case POI:
+                parser = tikaOfficeDetectParser;
+                break;
+            case POI_OO_XML:
+                parser = ooXmlParser;
+                break;
+            case TIKA_AUTO:
+                parser = autoDetectParser;
+                break;
+            default:
+                // Without this an unknown name left the parser null and failed later with a
+                // NullPointerException rather than being reported as a bad argument.
+                throw new IllegalArgumentException("Invalid transform value "+transform);
+        }
+
+        transform(parser, documentSelector, includeContents, sourceFilename, targetFilename, targetMimetype, targetEncoding);
+    }
+
+    /**
+     * Parses the source file with the given parser and writes the result to the target file
+     * using the target encoding.
+     *
+     * @throws IllegalStateException wrapping any SAXException, TikaException or IOException
+     */
+    private void transform(Parser parser, DocumentSelector documentSelector, Boolean includeContents,
+                           String sourceFilename,
+                           String targetFilename, String targetMimetype, String targetEncoding)
+    {
+        // Resources are closed in reverse order, so the writer is flushed and closed before the
+        // stream underneath it. Closing the stream first made the writer's final flush fail.
+        try (InputStream is = new BufferedInputStream(new FileInputStream(sourceFilename));
+             OutputStream os = new FileOutputStream(targetFilename);
+             Writer ow = new BufferedWriter(new OutputStreamWriter(os, targetEncoding)))
+        {
+            Metadata metadata = new Metadata();
+            ParseContext context = buildParseContext(documentSelector, includeContents);
+            ContentHandler handler = getContentHandler(targetMimetype, ow);
+
+            parser.parse(is, handler, metadata, context);
+        }
+        catch (SAXException | TikaException | IOException e)
+        {
+            throw new IllegalStateException(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Returns the handler that writes parser events to output in the form implied by targetMimetype:
+     * plain text, CSV, HTML, or XML for both the XHTML and XML mimetypes.
+     *
+     * @throws IllegalArgumentException if the target mimetype is none of the above
+     * @throws IllegalStateException if the XML transformer cannot be created
+     */
+    protected ContentHandler getContentHandler(String targetMimetype, Writer output)
+    {
+        if (MIMETYPE_TEXT_PLAIN.equals(targetMimetype))
+        {
+            return new BodyContentHandler(output);
+        }
+        if (MIMETYPE_TEXT_CSV.equals(targetMimetype))
+        {
+            return new CsvContentHandler(output);
+        }
+
+        boolean html = MIMETYPE_HTML.equals(targetMimetype);
+        boolean xml = MIMETYPE_XHTML.equals(targetMimetype) || MIMETYPE_XML.equals(targetMimetype);
+        if (!html && !xml)
+        {
+            throw new IllegalArgumentException("Invalid target mimetype " + targetMimetype);
+        }
+
+        try
+        {
+            SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
+            TransformerHandler transformerHandler = factory.newTransformerHandler();
+            transformerHandler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+            transformerHandler.setResult(new StreamResult(output));
+
+            if (html)
+            {
+                transformerHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, HTML);
+                return new ExpandedTitleContentHandler(transformerHandler);
+            }
+            transformerHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, XML);
+            return transformerHandler;
+        }
+        catch (TransformerConfigurationException e)
+        {
+            throw new IllegalStateException(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * A wrapper around the normal Tika BodyContentHandler that writes comma separated rather than
+     * tab separated output.
+     */
+    protected static class CsvContentHandler extends BodyContentHandler {
+        private static final char[] comma = new char[]{ ',' };
+        private static final Pattern all_nums = Pattern.compile("[\\d\\.\\-\\+]+");
+
+        private boolean inCell = false;
+        private boolean needsComma = false;
+
+        protected CsvContentHandler(Writer output) {
+            super(output);
+        }
+
+        @Override
+        public void ignorableWhitespace(char[] ch, int start, int length)
+                throws SAXException {
+            // The character to test is at 'start'; index 0 is only right when start happens to be 0.
+            if(length == 1 && ch[start] == '\t') {
+                // Ignore tabs, as they mess up the CSV output
+            } else {
+                super.ignorableWhitespace(ch, start, length);
+            }
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length)
+                throws SAXException {
+            if(inCell) {
+                String text = new String(ch, start, length);
+
+                // Quote if not all numbers
+                if(all_nums.matcher(text).matches())
+                {
+                    super.characters(ch, start, length);
+                }
+                else
+                {
+                    // Double up double quotes. String.replace handles consecutive quotes, which
+                    // the previous hand-written backwards loop skipped.
+                    char[] quoted = ('"' + text.replace("\"", "\"\"") + '"').toCharArray();
+                    super.characters(quoted, 0, quoted.length);
+                }
+            } else {
+                super.characters(ch, start, length);
+            }
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String name,
+                Attributes atts) throws SAXException {
+            if(localName.equals("td")) {
+                inCell = true;
+                // Separate this cell from the previous one in the same row.
+                if(needsComma) {
+                    super.characters(comma, 0, 1);
+                }
+            } else {
+                super.startElement(uri, localName, name, atts);
+            }
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String name)
+                throws SAXException {
+            if(localName.equals("td")) {
+                needsComma = true;
+                inCell = false;
+            } else {
+                // A new row starts without a leading comma.
+                if(localName.equals("tr")) {
+                    needsComma = false;
+                }
+                super.endElement(uri, localName, name);
+            }
+        }
+    }
+
+    /**
+     * Builds the parse context: registers the document selector when there is one and, when
+     * includeContents is not null, the parser used for embedded content (the auto-detect parser
+     * to include it, an EmptyParser to leave it out).
+     */
+    protected ParseContext buildParseContext(DocumentSelector documentSelector, Boolean includeContents)
+    {
+        ParseContext context = new ParseContext();
+        if (documentSelector != null)
+        {
+            context.set(DocumentSelector.class, documentSelector);
+        }
+
+        // pdfParserConfig is never set in the original repo code, so code removed here.
+
+        // If Archive transform
+        // NOTE(review): transform(String[]) turns a missing --includeContents into false before
+        // calling here, so on that path this branch runs for every transform, not only Archive.
+        // Confirm that is intended.
+        if (includeContents != null)
+        {
+            context.set(Parser.class, includeContents ? autoDetectParser : new EmptyParser());
+        }
+
+        return context;
+    }
+}
diff --git a/alfresco-docker-tika/src/main/java/org/alfresco/transformer/TikaController.java b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/TikaController.java
new file mode 100644
index 00000000..8c365f0c
--- /dev/null
+++ b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/TikaController.java
@@ -0,0 +1,137 @@
+/*
+ * #%L
+ * Alfresco Enterprise Repository
+ * %%
+ * Copyright (C) 2005 - 2018 Alfresco Software Limited
+ * %%
+ * License rights for this program may be obtained from Alfresco Software, Ltd.
+ * pursuant to a written agreement and any use of this program without such an
+ * agreement is prohibited.
+ * #L%
+ */
+package org.alfresco.transformer;
+
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.exception.TikaException;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.core.io.Resource;
+import org.springframework.http.ResponseEntity;
+import org.springframework.stereotype.Controller;
+import org.springframework.web.bind.annotation.PostMapping;
+import org.springframework.web.bind.annotation.RequestParam;
+import org.springframework.web.multipart.MultipartFile;
+import org.xml.sax.SAXException;
+
+import javax.servlet.http.HttpServletRequest;
+import java.io.File;
+import java.io.IOException;
+
+import static org.alfresco.repo.content.MimetypeMap.MIMETYPE_TEXT_PLAIN;
+import static org.alfresco.transformer.Tika.*;
+
+/**
+ * Controller for the Docker based Tika transformers.
+ *
+ * Status Codes:
+ *
+ *   200 Success
+ *   400 Bad Request: Invalid target mimetype <mimetype>
+ *   400 Bad Request: Invalid transform value
+ *   400 Bad Request: Request parameter <name> is missing (missing mandatory parameter)
+ *   400 Bad Request: Request parameter <name> is of the wrong type
+ *   400 Bad Request: Transformer exit code was not 0 (possible problem with the source file)
+ *   400 Bad Request: The source filename was not supplied
+ *   500 Internal Server Error: (no message with low level IO problems)
+ *   500 Internal Server Error: The target filename was not supplied (should not happen as targetExtension is checked)
+ *   500 Internal Server Error: Transformer version check exit code was not 0
+ *   500 Internal Server Error: Transformer version check failed to create any output
+ *   500 Internal Server Error: Could not read the target file
+ *   500 Internal Server Error: The target filename was malformed (should not happen because of other checks)
+ *   500 Internal Server Error: Transformer failed to create an output file (the exit code was 0, so there should be some content)
+ *   500 Internal Server Error: Filename encoding error
+ *   507 Insufficient Storage: Failed to store the source file
+ */
+@Controller
+public class TikaController extends AbstractTransformerController
+{
+    // The in-JVM transformer; no external command is run for Tika.
+    private Tika tika;
+
+    /**
+     * Sets up the logger, logs the license notices and creates the Tika transformer.
+     */
+    @Autowired
+    public TikaController() throws TikaException, IOException, SAXException
+    {
+        logger = LogFactory.getLog(TikaController.class);
+        logger.info("--------------------------------------------------------------------------------------------------------------------------------------------------------------");
+        logEnterpriseLicenseMessage();
+        logger.info("Tika is from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0. or in /Apache\\ 2.0.txt");
+        logger.info("--------------------------------------------------------------------------------------------------------------------------------------------------------------");
+
+        tika = new Tika();
+    }
+
+    @Override
+    protected String getTransformerName()
+    {
+        return "Tika";
+    }
+
+    /**
+     * Runs the transform inside this JVM by handing the arguments straight to {@link Tika#transform(String[])}.
+     */
+    @Override
+    public void callTransform(String... args)
+    {
+        tika.transform(args);
+    }
+
+    @Override
+    protected String version()
+    {
+        return "Tika available";
+    }
+
+    @Override
+    protected ProbeTestTransform getProbeTestTransform()
+    {
+        // See the Javadoc on this method and Probes.md for the choice of these values.
+        // The livenessPercentage is a little large as Tika does tend to suffer from slow transforms that clash with a GC.
+        return new ProbeTestTransform(this, "quick.pdf", "quick.txt",
+                60, 16, 400, 10240, 60*30+1, 60*15+20)
+        {
+            @Override
+            protected void executeTransformCommand(File sourceFile, File targetFile)
+            {
+                // The probe converts the PDF to UTF-8 plain text with the PdfBox transform.
+                TikaController.this.callTransform(sourceFile, targetFile, PDF_BOX,
+                        TARGET_MIMETYPE+MIMETYPE_TEXT_PLAIN, TARGET_ENCODING+"UTF-8");
+            }
+        };
+    }
+
+    /**
+     * Transforms the uploaded file with the named Tika transform and returns the result as an attachment.
+     *
+     * @throws TransformException with status 400 if transform is not one of {@link Tika#TRANSFORM_NAMES}
+     */
+    @PostMapping("/transform")
+    public ResponseEntity<Resource> transform(HttpServletRequest request,
+                                              @RequestParam("file") MultipartFile sourceMultipartFile,
+                                              @RequestParam("targetExtension") String targetExtension,
+                                              @RequestParam("targetMimetype") String targetMimetype,
+                                              @RequestParam("targetEncoding") String targetEncoding,
+
+                                              @RequestParam(value = "timeout", required = false) Long timeout,
+                                              @RequestParam(value = "testDelay", required = false) Long testDelay,
+
+                                              @RequestParam(value = "transform") String transform,
+                                              @RequestParam(value="includeContents", required = false) Boolean includeContents)
+    {
+        if (!TRANSFORM_NAMES.contains(transform))
+        {
+            throw new TransformException(400, "Invalid transform value");
+        }
+
+        String targetFilename = createTargetFileName(sourceMultipartFile, targetExtension);
+        File sourceFile = createSourceFile(request, sourceMultipartFile);
+        File targetFile = createTargetFile(request, targetFilename);
+        // Both files are deleted by TransformInterceptor.afterCompletion
+
+        // TODO Consider streaming the request and response rather than using temporary files
+        // https://www.logicbig.com/tutorials/spring-framework/spring-web-mvc/streaming-response-body.html
+
+        // A null argument stands for "option not supplied".
+        callTransform(sourceFile, targetFile, transform,
+                includeContents != null && includeContents ? INCLUDE_CONTENTS : null,
+                TARGET_MIMETYPE+targetMimetype, TARGET_ENCODING+targetEncoding);
+
+        return createAttachment(targetFilename, targetFile, testDelay);
+    }
+}
diff --git a/alfresco-docker-tika/src/main/java/org/alfresco/transformer/TikaOfficeDetectParser.java b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/TikaOfficeDetectParser.java
new file mode 100644
index 00000000..d5607bbb
--- /dev/null
+++ b/alfresco-docker-tika/src/main/java/org/alfresco/transformer/TikaOfficeDetectParser.java
@@ -0,0 +1,117 @@
+/*
+ * #%L
+ * Alfresco Repository
+ * %%
+ * Copyright (C) 2005 - 2016 Alfresco Software Limited
+ * %%
+ * This file is part of the Alfresco software.
+ * If the software was purchased under a paid Alfresco license, the terms of
+ * the paid license agreement will prevail. Otherwise, the software is
+ * provided under the following open source license terms:
+ *
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
+ * #L%
+ */
+package org.alfresco.transformer;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.poifs.common.POIFSConstants;
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+///////// THIS FILE IS A COPY OF THE CODE IN alfresco-repository /////////////
+
+/**
+ * Apache Tika assumes that
+ *  you either know exactly what your content is, or that
+ *  you'll leave it to auto-detection.
+ * Within Alfresco, we usually do know. However, from time
+ *  to time, we don't know if we have one of the old or one
+ *  of the new office files (eg .xls and .xlsx).
+ * This class automatically selects the appropriate
+ *  old (OLE2) or new (OOXML) Tika parser as required.
+ *
+ * @author Nick Burch
+ */
+public class TikaOfficeDetectParser implements Parser {
+    private Parser ole2Parser = new OfficeParser();
+    private Parser ooxmlParser = new OOXMLParser();
+
+    /**
+     * Returns the union of the types supported by the OLE2 and OOXML parsers.
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
+        Set<MediaType> types = new HashSet<MediaType>();
+        types.addAll(ole2Parser.getSupportedTypes(parseContext));
+        types.addAll(ooxmlParser.getSupportedTypes(parseContext));
+        return types;
+    }
+
+    /**
+     * Peeks at the first four bytes of the stream and delegates to the OOXML parser when they
+     * match the OOXML file header, otherwise to the OLE2 parser.
+     */
+    @Override
+    public void parse(InputStream stream,
+            ContentHandler handler, Metadata metadata,
+            ParseContext parseContext) throws IOException, SAXException,
+            TikaException
+    {
+        byte[] initial4 = new byte[4];
+        InputStream wrapped;
+        // Preserve TikaInputStreams as TikaInputStreams as they require less memory to process
+        if (stream.markSupported())
+        {
+            stream.mark(initial4.length);
+            IOUtils.readFully(stream, initial4);
+            stream.reset();
+            wrapped = stream;
+        }
+        else
+        {
+            PushbackInputStream inp = new PushbackInputStream(stream, initial4.length);
+            int read = IOUtils.readFully(inp, initial4);
+            // Push back only what was actually read: a stream shorter than four bytes must not
+            // gain padding bytes, and an empty one (read == -1) has nothing to push back.
+            if (read > 0)
+            {
+                inp.unread(initial4, 0, read);
+            }
+            wrapped = inp;
+        }
+
+        // Which is it?
+        if (hasOoxmlHeader(initial4))
+        {
+            ooxmlParser.parse(wrapped, handler, metadata, parseContext);
+        }
+        else
+        {
+            ole2Parser.parse(wrapped, handler, metadata, parseContext);
+        }
+    }
+
+    // True when the four bytes equal the first four bytes of POIFSConstants.OOXML_FILE_HEADER.
+    private static boolean hasOoxmlHeader(byte[] initial4)
+    {
+        return initial4[0] == POIFSConstants.OOXML_FILE_HEADER[0] &&
+               initial4[1] == POIFSConstants.OOXML_FILE_HEADER[1] &&
+               initial4[2] == POIFSConstants.OOXML_FILE_HEADER[2] &&
+               initial4[3] == POIFSConstants.OOXML_FILE_HEADER[3];
+    }
+
+    /**
+     * @deprecated This method will be removed in Apache Tika 1.0.
+     */
+    public void parse(InputStream stream,
+            ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException
+    {
+        parse(stream, handler, metadata, new ParseContext());
+    }
+}
diff --git a/alfresco-docker-tika/src/main/resources/quick.pdf b/alfresco-docker-tika/src/main/resources/quick.pdf
new file mode 100644
index 00000000..a1779afd
Binary files /dev/null and b/alfresco-docker-tika/src/main/resources/quick.pdf differ
diff --git a/alfresco-docker-tika/src/main/resources/templates/transformForm.html b/alfresco-docker-tika/src/main/resources/templates/transformForm.html
new file mode 100644
index 00000000..5e230dbf
--- /dev/null
+++ b/alfresco-docker-tika/src/main/resources/templates/transformForm.html
@@ -0,0 +1,39 @@
+
+
+
+

Tika Test Transformations

+
+ + + + + + + + + + + +
transform *
file *
targetExtension *
targetMimetype *
targetEncoding *
includeContents (archive)
timeout
testDelay
+
+
+ + + + + diff --git a/alfresco-docker-tika/src/main/resources/tika-config.xml b/alfresco-docker-tika/src/main/resources/tika-config.xml new file mode 100644 index 00000000..ef9f6df4 --- /dev/null +++ b/alfresco-docker-tika/src/main/resources/tika-config.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/alfresco-docker-tika/src/test/java/org/alfresco/transformer/TikaControllerTest.java b/alfresco-docker-tika/src/test/java/org/alfresco/transformer/TikaControllerTest.java new file mode 100644 index 00000000..bbf14145 --- /dev/null +++ b/alfresco-docker-tika/src/test/java/org/alfresco/transformer/TikaControllerTest.java @@ -0,0 +1,344 @@ +/* + * #%L + * Alfresco Repository + * %% + * Copyright (C) 2005 - 2018 Alfresco Software Limited + * %% + * This file is part of the Alfresco software. + * If the software was purchased under a paid Alfresco license, the terms of + * the paid license agreement will prevail. Otherwise, the software is + * provided under the following open source license terms: + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see . 
+ * #L%
+ */
+package org.alfresco.transformer;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.springframework.boot.test.autoconfigure.web.servlet.WebMvcTest;
+import org.springframework.boot.test.mock.mockito.SpyBean;
+import org.springframework.mock.web.MockMultipartFile;
+import org.springframework.test.context.junit4.SpringRunner;
+import org.springframework.test.web.servlet.MvcResult;
+import org.springframework.test.web.servlet.request.MockHttpServletRequestBuilder;
+
+import static org.alfresco.repo.content.MimetypeMap.*;
+import static org.alfresco.transformer.Tika.*;
+import static org.springframework.test.util.AssertionErrors.assertTrue;
+import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.header;
+import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
+
+/**
+ * Test the TikaController without a server.
+ * Super class includes tests for the AbstractTransformerController.
+ */
+@RunWith(SpringRunner.class)
+@WebMvcTest(TikaController.class)
+public class TikaControllerTest extends AbstractTransformerControllerTest
+{
+    public static final String EXPECTED_XHTML_CONTENT_CONTAINS = "<p>The quick brown fox jumps over the lazy dog</p>";
+    public static final String EXPECTED_TEXT_CONTENT_CONTAINS = "The quick brown fox jumps over the lazy dog";
+    public static final String EXPECTED_MSG_CONTENT_CONTAINS = "Recipients\n" +
+            "\tmark.rogers@alfresco.com; speedy@quick.com; mrquick@nowhere.com\n" +
+            "\n" +
+            "The quick brown fox jumps over the lazy dogs";
+    public static final String EXPECTED_CSV_CONTENT_CONTAINS = "\"The\",\"quick\",\"brown\",\"fox\"";
+
+    @SpyBean
+    private TikaController controller;
+
+    // Request parameters added to every request by mockMvcRequest; individual tests override them.
+    String transform = PDF_BOX;
+    String targetEncoding = "UTF-8";
+    String targetMimetype = MIMETYPE_TEXT_PLAIN;
+
+    /**
+     * Posts the quick.&lt;sourceExtension&gt; test file to /transform using the given transform and
+     * mimetypes, then checks for a 200 response whose body contains expectedContentContains.
+     */
+    private void transform(String transform, String sourceExtension, String targetExtension,
+                           String sourceMimetype, String targetMimetype,
+                           Boolean includeContents, String expectedContentContains) throws Exception
+    {
+        // We don't use targetFileBytes as some of the transforms contain different date text based on the os being used.
+        super.mockTransformCommand(controller, sourceExtension, targetExtension, sourceMimetype, false);
+        this.transform = transform;
+        this.targetMimetype = targetMimetype;
+
+        System.out.println("Test "+transform+" "+ sourceExtension +" to "+targetExtension);
+        MockHttpServletRequestBuilder requestBuilder = includeContents == null
+                ? mockMvcRequest("/transform", sourceFile, "targetExtension", this.targetExtension)
+                : mockMvcRequest("/transform", sourceFile, "targetExtension", this.targetExtension, "includeContents", includeContents.toString());
+        MvcResult result = mockMvc.perform(requestBuilder)
+                .andExpect(status().is(200))
+                .andExpect(header().string("Content-Disposition", "attachment; filename*= UTF-8''quick." + this.targetExtension)).
+                andReturn();
+        String content = result.getResponse().getContentAsString();
+        assertTrue("The content did not include \""+expectedContentContains+"\"", content.contains(expectedContentContains));
+    }
+
+    @Override
+    // Add extra required parameters to the request.
+    protected MockHttpServletRequestBuilder mockMvcRequest(String url, MockMultipartFile sourceFile, String... params)
+    {
+        return super.mockMvcRequest(url, sourceFile, params)
+                .param("transform", transform)
+                .param("targetEncoding", targetEncoding)
+                .param("targetMimetype", targetMimetype);
+    }
+
+    @Test
+    @Override
+    public void simpleTransformTest() throws Exception
+    {
+        super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
+        super.simpleTransformTest();
+    }
+
+    @Test
+    @Override
+    public void testDelayTest() throws Exception
+    {
+        super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
+        super.testDelayTest();
+    }
+
+    @Test
+    @Override
+    public void badExitCodeTest() throws Exception
+    {
+        // Ignore the test in super class as the Tika transforms are real rather than mocked up.
+        // It is the mock that returns a non zero exit code.
+    }
+
+    @Test
+    @Override
+    public void noTargetFileTest() throws Exception
+    {
+        // Ignore the test in super class as the Tika transforms are real rather than mocked up.
+        // It is the mock that returns a zero length file for other transformers, when we supply an invalid targetExtension.
+    }
+
+    // --- Super class tests (need modified setup) ---
+
+    @Test
+    @Override
+    public void dotDotSourceFilenameTest() throws Exception
+    {
+        super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
+        super.dotDotSourceFilenameTest();
+    }
+
+    @Test
+    @Override
+    public void noExtensionSourceFilenameTest() throws Exception
+    {
+        super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
+        super.noExtensionSourceFilenameTest();
+    }
+
+    @Test
+    @Override
+    public void badSourceFilenameTest() throws Exception
+    {
+        super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
+        super.badSourceFilenameTest();
+    }
+
+    @Test
+    @Override
+    public void blankSourceFilenameTest() throws Exception
+    {
+        super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
+        super.blankSourceFilenameTest();
+    }
+
+    @Test
+    @Override
+    public void noTargetExtensionTest() throws Exception
+    {
+        super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
+        super.noTargetExtensionTest();
+    }
+
+    @Test
+    @Override
+    public void calculateMaxTime() throws Exception
+    {
+        super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
+        super.calculateMaxTime();
+    }
+
+    // --- General Tika tests ---
+
+    @Test
+    public void badEncodingTest() throws Exception
+    {
+        super.mockTransformCommand(controller, PDF, TXT, MIMETYPE_PDF, true);
+        targetEncoding = "rubbish";
+        mockMvc.perform(mockMvcRequest("/transform", sourceFile, "targetExtension", targetExtension))
+                .andExpect(status().is(500));
+    }
+
+    // --- Archive ---
+
+    @Test
+    public void zipToTextArchiveTest() throws Exception
+    {
+        transform(ARCHIVE, ZIP, TXT, MIMETYPE_ZIP, MIMETYPE_TEXT_PLAIN,false,
+                "quick.html\n" +
+                "\n" +
+                "\n" +
+                "quick.pdf\n" +
+                "\n" +
+                "\n");
+    }
+
+    @Test
+    public void zipToTextIncludeArchiveTest() throws Exception
+    {
+        transform(ARCHIVE, ZIP, TXT, MIMETYPE_ZIP, MIMETYPE_TEXT_PLAIN,true,
+                "quick.html\n" +
+                "\n" +
+                "\n" +
+                "The quick brown fox jumps over the lazy dog\n" +
+                "\n" +
+                "\n" +
+                "\n" +
+                "quick.pdf\n" +
+                "\n" +
+                "\n" +
+                "The quick brown fox jumps over the lazy dog" +
+                "\n" +
+                "\n");
+    }
+
+    @Test
+    public void zipToTextExcludeArchiveTest() throws Exception
+    {
+        transform(ARCHIVE, ZIP, TXT, MIMETYPE_ZIP, MIMETYPE_TEXT_PLAIN,
+                false, "\n" +
+                "folder/subfolder/quick.jpg\n" +
+                "\n" +
+                "\n" +
+                "quick.doc\n" +
+                "\n" +
+                "\n" +
+                "quick.html\n" +
+                "\n" +
+                "\n" +
+                "quick.pdf\n" +
+                "\n" +
+                "\n" +
+                "quick.txt\n" +
+                "\n" +
+                "\n" +
+                "quick.xml\n" +
+                "\n");
+    }
+
+    // --- OutlookMsg ---
+
+    @Test
+    public void msgToTxtOutlookMsgTest() throws Exception
+    {
+        transform(OUTLOOK_MSG, MSG, TXT, MIMETYPE_OUTLOOK_MSG, MIMETYPE_TEXT_PLAIN, null, EXPECTED_MSG_CONTENT_CONTAINS);
+    }
+
+    // --- PdfBox ---
+
+    @Test
+    public void pdfToTxtPdfBoxTest() throws Exception
+    {
+        transform(PDF_BOX, PDF, TXT, MIMETYPE_PDF, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
+    }
+
+    @Test
+    public void pdfToCsvPdfBoxTest() throws Exception
+    {
+        transform(PDF_BOX, PDF, CSV, MIMETYPE_PDF, MIMETYPE_TEXT_CSV, null, EXPECTED_TEXT_CONTENT_CONTAINS); // Yes it is just text
+    }
+
+    @Test
+    public void pdfToXmlPdfBoxTest() throws Exception
+    {
+        transform(PDF_BOX, PDF, XML, MIMETYPE_PDF, MIMETYPE_XML, null, EXPECTED_XHTML_CONTENT_CONTAINS); // Yes it is just XHTML
+    }
+
+    @Test
+    public void pdfToXhtmlPdfBoxTest() throws Exception
+    {
+        transform(PDF_BOX, PDF, XHTML, MIMETYPE_PDF, MIMETYPE_XHTML, null, EXPECTED_XHTML_CONTENT_CONTAINS);
+    }
+
+    @Test
+    public void pdfToHtmlPdfBoxTest() throws Exception
+    {
+        transform(PDF_BOX, PDF, HTML, MIMETYPE_PDF, MIMETYPE_HTML, null, EXPECTED_XHTML_CONTENT_CONTAINS); // Yes it is just XHTML
+    }
+
+    // --- Office ---
+
+    @Test
+    public void msgToTxtOfficeTest() throws Exception
+    {
+        transform(POI_OFFICE, MSG, TXT, MIMETYPE_OUTLOOK_MSG, MIMETYPE_TEXT_PLAIN, null, EXPECTED_MSG_CONTENT_CONTAINS);
+    }
+
+    @Test
+    public void docToTxtOfficeTest() throws Exception
+    {
+        transform(POI_OFFICE, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
+    }
+
+    // --- Poi ---
+
+    @Test
+    public void xslxToCsvPoiTest() throws Exception
+    {
+        transform(POI, XSLX, CSV, MIMETYPE_OPENXML_SPREADSHEET, MIMETYPE_TEXT_CSV, null, EXPECTED_CSV_CONTENT_CONTAINS);
+    }
+
+    // --- OOXML ---
+
+    @Test
+    public void docxToTxtOoXmlTest() throws Exception
+    {
+        transform(POI_OO_XML, DOCX, TXT, MIMETYPE_OPENXML_WORDPROCESSING, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
+    }
+
+    @Test
+    public void pptxToTxtOoXmlTest() throws Exception
+    {
+        transform(POI_OO_XML, PPTX, TXT, MIMETYPE_OPENXML_PRESENTATION, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
+    }
+
+    // --- TikaAuto ---
+
+    @Test
+    public void ppxtToTxtTikaAutoTest() throws Exception
+    {
+        transform(TIKA_AUTO, PPTX, TXT, MIMETYPE_OPENXML_PRESENTATION, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
+    }
+
+    @Test
+    public void doctToTxtTikaAutoTest() throws Exception
+    {
+        transform(TIKA_AUTO, DOCX, TXT, MIMETYPE_OPENXML_WORDPROCESSING, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
+    }
+
+    // --- TextMining ---
+
+    @Test
+    public void docToTxtTextMiningTest() throws Exception
+    {
+        transform(TEXT_MINING, DOC, TXT, MIMETYPE_WORD, MIMETYPE_TEXT_PLAIN, null, EXPECTED_TEXT_CONTENT_CONTAINS);
+    }
+}
diff --git a/alfresco-docker-tika/src/test/java/org/alfresco/transformer/TikaHttpRequestTest.java b/alfresco-docker-tika/src/test/java/org/alfresco/transformer/TikaHttpRequestTest.java
new file mode 100644
index 00000000..25547e11
--- /dev/null
+++ b/alfresco-docker-tika/src/test/java/org/alfresco/transformer/TikaHttpRequestTest.java
@@ -0,0 +1,51 @@
+/*
+ * #%L
+ * Alfresco Repository
+ * %%
+ * Copyright (C) 2005 - 2018 Alfresco Software Limited
+ * %%
+ * This file is part of the Alfresco software.
+ * If the software was purchased under a paid Alfresco license, the terms of
+ * the paid license agreement will prevail.
Otherwise, the software is + * provided under the following open source license terms: + * + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see . + * #L% + */ +package org.alfresco.transformer; + +import org.junit.runner.RunWith; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.boot.test.context.SpringBootTest.WebEnvironment; +import org.springframework.test.context.junit4.SpringRunner; + +/** + * Tests TikaController with a server test harness. 
+ */
+@RunWith(SpringRunner.class)
+@SpringBootTest(webEnvironment = WebEnvironment.RANDOM_PORT)
+public class TikaHttpRequestTest extends AbstractHttpRequestTest
+{
+    // Name of the transformer under test, as supplied to the shared HTTP tests in the super class.
+    @Override
+    protected String getTransformerName()
+    {
+        return "Tika";
+    }
+
+    // Extension of the source file the shared HTTP tests use.
+    @Override
+    protected String getSourceExtension()
+    {
+        return "pdf";
+    }
+}
diff --git a/alfresco-docker-tika/src/test/resources/quick.doc b/alfresco-docker-tika/src/test/resources/quick.doc
new file mode 100644
index 00000000..eb307fb2
Binary files /dev/null and b/alfresco-docker-tika/src/test/resources/quick.doc differ
diff --git a/alfresco-docker-tika/src/test/resources/quick.docx b/alfresco-docker-tika/src/test/resources/quick.docx
new file mode 100644
index 00000000..64832464
Binary files /dev/null and b/alfresco-docker-tika/src/test/resources/quick.docx differ
diff --git a/alfresco-docker-tika/src/test/resources/quick.msg b/alfresco-docker-tika/src/test/resources/quick.msg
new file mode 100644
index 00000000..6ee37500
Binary files /dev/null and b/alfresco-docker-tika/src/test/resources/quick.msg differ
diff --git a/alfresco-docker-tika/src/test/resources/quick.pptx b/alfresco-docker-tika/src/test/resources/quick.pptx
new file mode 100644
index 00000000..666b5d7e
Binary files /dev/null and b/alfresco-docker-tika/src/test/resources/quick.pptx differ
diff --git a/alfresco-docker-tika/src/test/resources/quick.txt b/alfresco-docker-tika/src/test/resources/quick.txt
new file mode 100644
index 00000000..39deeeca
--- /dev/null
+++ b/alfresco-docker-tika/src/test/resources/quick.txt
@@ -0,0 +1,6 @@
+
+The quick brown fox jumps over the lazy dog
+
+
+ Blank Page
+
diff --git a/alfresco-docker-tika/src/test/resources/quick.xslx b/alfresco-docker-tika/src/test/resources/quick.xslx
new file mode 100644
index 00000000..2e1f271e
Binary files /dev/null and b/alfresco-docker-tika/src/test/resources/quick.xslx differ
diff --git a/alfresco-docker-tika/src/test/resources/quick.zip b/alfresco-docker-tika/src/test/resources/quick.zip
new file mode 100644 index 00000000..168109f4 Binary files /dev/null and b/alfresco-docker-tika/src/test/resources/quick.zip differ diff --git a/alfresco-transformer-base/src/main/java/org/alfresco/transformer/AbstractTransformerController.java b/alfresco-transformer-base/src/main/java/org/alfresco/transformer/AbstractTransformerController.java index 68dc895f..75da9fd0 100644 --- a/alfresco-transformer-base/src/main/java/org/alfresco/transformer/AbstractTransformerController.java +++ b/alfresco-transformer-base/src/main/java/org/alfresco/transformer/AbstractTransformerController.java @@ -51,8 +51,7 @@ import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.nio.file.Files; import java.nio.file.StandardCopyOption; -import java.util.Collection; -import java.util.Map; +import java.util.*; /** *

Abstract Controller, provides structure and helper methods to sub-class transformer controllers.

@@ -376,6 +375,87 @@ public abstract class AbstractTransformerController } } + public void callTransform(File sourceFile, File targetFile, String... args) throws TransformException + { + args = buildArgs(sourceFile, targetFile, args); + try + { + callTransform(args); + } + catch (IllegalArgumentException e) + { + throw new TransformException(400, getMessage(e)); + } + catch (Exception e) + { + throw new TransformException(500, getMessage(e)); + } + if (!targetFile.exists() || targetFile.length() == 0) + { + throw new TransformException(500, "Transformer failed to create an output file"); + } + } + + private String getMessage(Exception e) + { + return e.getMessage() == null ? e.getClass().getSimpleName(): e.getMessage(); + } + + protected void callTransform(String[] args) + { + // Overridden when the transform is done in the JVM rather than in an external command. + } + + protected String[] buildArgs(File sourceFile, File targetFile, String[] args) + { + ArrayList methodArgs = new ArrayList<>(args.length+2); + StringJoiner sj = new StringJoiner(" "); + for (String arg: args) + { + addArg(methodArgs, sj, arg); + } + + addFileArg(methodArgs, sj, sourceFile); + addFileArg(methodArgs, sj, targetFile); + + LogEntry.setOptions(sj.toString()); + + return methodArgs.toArray(new String[methodArgs.size()]); + } + + private void addArg(ArrayList methodArgs, StringJoiner sj, String arg) + { + if (arg != null) + { + sj.add(arg); + methodArgs.add(arg); + } + } + + private void addFileArg(ArrayList methodArgs, StringJoiner sj, File arg) + { + if (arg != null) + { + String path = arg.getAbsolutePath(); + int i = path.lastIndexOf('.'); + String ext = i == -1 ? "???" 
: path.substring(i+1); + sj.add(ext); + methodArgs.add(path); + } + } + + protected void executeTransformCommand(String options, File sourceFile, File targetFile, Long timeout) + { + LogEntry.setOptions(options); + + Map properties = new HashMap(5); + properties.put("options", options); + properties.put("source", sourceFile.getAbsolutePath()); + properties.put("target", targetFile.getAbsolutePath()); + + executeTransformCommand(properties, targetFile, timeout); + } + public void executeTransformCommand(Map properties, File targetFile, Long timeout) { timeout = timeout != null && timeout > 0 ? timeout : 0; diff --git a/alfresco-transformer-base/src/main/resources/application.properties b/alfresco-transformer-base/src/main/resources/application.properties index 995b06b0..ca637b84 100644 --- a/alfresco-transformer-base/src/main/resources/application.properties +++ b/alfresco-transformer-base/src/main/resources/application.properties @@ -6,4 +6,5 @@ server.port = 8090 logging.level.org.alfresco.transformer.LibreOfficeController=debug logging.level.org.alfresco.transformer.JodConverterSharedInstance=debug logging.level.org.alfresco.transformer.AlfrescoPdfRendererController=debug -logging.level.org.alfresco.transformer.ImageMagickController=debug \ No newline at end of file +logging.level.org.alfresco.transformer.ImageMagickController=debug +logging.level.org.alfresco.transformer.TikaController=debug \ No newline at end of file diff --git a/alfresco-transformer-base/src/test/java/org/alfresco/transformer/AbstractTransformerControllerTest.java b/alfresco-transformer-base/src/test/java/org/alfresco/transformer/AbstractTransformerControllerTest.java index a2b58814..5c619442 100644 --- a/alfresco-transformer-base/src/test/java/org/alfresco/transformer/AbstractTransformerControllerTest.java +++ b/alfresco-transformer-base/src/test/java/org/alfresco/transformer/AbstractTransformerControllerTest.java @@ -33,12 +33,10 @@ import org.mockito.stubbing.Answer; import 
org.springframework.beans.factory.annotation.Autowired; import org.springframework.mock.web.MockMultipartFile; import org.springframework.test.web.servlet.MockMvc; +import org.springframework.test.web.servlet.request.MockHttpServletRequestBuilder; import org.springframework.test.web.servlet.request.MockMvcRequestBuilders; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; +import java.io.*; import java.net.URL; import java.nio.channels.FileChannel; import java.nio.file.Files; @@ -83,7 +81,9 @@ public abstract class AbstractTransformerControllerTest protected AbstractTransformerController controller; // Called by sub class - public void mockTransformCommand(AbstractTransformerController controller, String sourceExtension, String targetExtension, String sourceMimetype) throws IOException + public void mockTransformCommand(AbstractTransformerController controller, String sourceExtension, + String targetExtension, String sourceMimetype, + boolean readTargetFileBytes) throws IOException { this.controller = controller; this.sourceExtension = sourceExtension; @@ -92,8 +92,8 @@ public abstract class AbstractTransformerControllerTest expectedOptions = null; expectedSourceSuffix = null; - expectedSourceFileBytes = Files.readAllBytes(getTestFile("quick."+sourceExtension, true).toPath()); - expectedTargetFileBytes = Files.readAllBytes(getTestFile("quick."+targetExtension, true).toPath()); + expectedSourceFileBytes = readTestFile(sourceExtension); + expectedTargetFileBytes = readTargetFileBytes ? 
readTestFile(targetExtension) : null; sourceFile = new MockMultipartFile("file", "quick."+sourceExtension, sourceMimetype, expectedSourceFileBytes); controller.setTransformCommand(mockTransformCommand); @@ -159,6 +159,11 @@ public abstract class AbstractTransformerControllerTest when(mockExecutionResult.getStdOut()).thenReturn("STDOUT"); } + protected byte[] readTestFile(String extension) throws IOException + { + return Files.readAllBytes(getTestFile("quick."+extension, true).toPath()); + } + protected File getTestFile(String testFilename, boolean required) throws IOException { ClassLoader classLoader = getClass().getClassLoader(); @@ -170,12 +175,26 @@ public abstract class AbstractTransformerControllerTest return testFileUrl == null ? null : new File(testFileUrl.getFile()); } + protected MockHttpServletRequestBuilder mockMvcRequest(String url, MockMultipartFile sourceFile, String... params) + { + MockHttpServletRequestBuilder builder = MockMvcRequestBuilders.fileUpload("/transform").file(sourceFile); + + if (params.length % 2 != 0) + { + throw new IllegalArgumentException("each param should have a name and value."); + } + for (int i=0; i2.0.8 3.5.37 1.5.12.RELEASE - 7.2 + 7.3 + 8.8 3.0.1.1 1.2.3 ${project.version} @@ -27,6 +28,7 @@ alfresco-transformer-base + alfresco-docker-tika alfresco-docker-alfresco-pdf-renderer alfresco-docker-imagemagick alfresco-docker-libreoffice @@ -68,6 +70,11 @@ alfresco-core ${dependency.alfresco-core.version} + + org.alfresco + alfresco-data-model + ${dependency.alfresco-data-model.version} + org.alfresco alfresco-jodconverter-core diff --git a/scripts/testImages.sh b/scripts/testImages.sh new file mode 100755 index 00000000..4f316f13 --- /dev/null +++ b/scripts/testImages.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# For each transform project, check the live probe in each docker image works. 
+set -e + +docker images +echo + +transformers=`ls | grep alfresco-docker- | sed 's/alfresco-docker-\(.*\)/\1/'` +for transformer in $transformers +do + echo + echo === $transformer === + repo=`docker images | awk '{print $1}' | grep $transformer | sort -u` + echo docker run --rm -d -p 8090:8090 --name $transformer $repo:$tag + docker run --rm -d -p 8090:8090 --name $transformer $repo:$tag >/dev/null + + WAIT_INTERVAL=1 + COUNTER=0 + TIMEOUT=30 + t0=`date +%s` + echo -n "Waiting for $transformer to start " + until $(curl --output /dev/null --silent --fail http://localhost:8090/live) || [ "$COUNTER" -eq "$TIMEOUT" ]; do + printf '.' + sleep $WAIT_INTERVAL + COUNTER=$(($COUNTER+$WAIT_INTERVAL)) + done + t1=`date +%s` + delta=$(($t1 - $t0)) + + docker stop $transformer > /dev/null + + if (("$COUNTER" < "$TIMEOUT")) ; then + echo " started in $delta seconds" + else + echo " did not start after $delta seconds" + exit 1 + fi +done +echo