ACS-10169 Bump tika to 3.x (#1140)

* ACS-10169 Bump tika libraries

- to fix vulnerability CVE-2025-54988

* ACS-10169 Bump tika libraries

- adjusted MailMetadataExtractor so that the e-mail sent date/time is read correctly with the new library

* ACS-10169 Bump tika libraries

- fixed PDF loading in tests

* ACS-10169 Code improvements
This commit is contained in:
Gerard Olenski
2025-09-22 10:24:57 +02:00
committed by GitHub
parent 43c1620d4d
commit 6358454423
8 changed files with 47 additions and 53 deletions

View File

@@ -44,9 +44,9 @@ import java.io.PushbackInputStream;
import java.io.Reader;
import java.net.URI;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import jakarta.annotation.PostConstruct;
import org.apache.fontbox.ttf.TrueTypeFont;
@@ -59,6 +59,7 @@ import org.apache.pdfbox.pdmodel.font.FontMapping;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.apache.pdfbox.tools.TextToPDF;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -252,41 +253,24 @@ public class TextToPdfContentTransformer implements CustomTransformerFileAdaptor
private static class PagedTextToPDF extends TextToPDF
{
// REPO-1066: duplicating the following lines from org.apache.pdfbox.tools.TextToPDF because they were made private there.
// Before the upgrade to pdfbox 2.0.8 (i.e. in pdfbox 1.8), this piece of code was public in org.apache.pdfbox.pdmodel.font.PDType1Font.
private static final PDType1Font DEFAULT_FONT = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
private static final Map<String, PDType1Font> STANDARD_14 = Standard14Fonts.getNames().stream()
.collect(Collectors.toMap(name -> name, name -> new PDType1Font(Standard14Fonts.getMappedFontName(name))));
private String fontName = null;
private String defaultFont = null;
PagedTextToPDF()
{
super();
setFont(DEFAULT_FONT);
}
static PDType1Font getStandardFont(String name)
{
return STANDARD_14.get(name);
}
private static final Map<String, PDType1Font> STANDARD_14 = new HashMap<>();
static
{
STANDARD_14.put(PDType1Font.TIMES_ROMAN.getBaseFont(), PDType1Font.TIMES_ROMAN);
STANDARD_14.put(PDType1Font.TIMES_BOLD.getBaseFont(), PDType1Font.TIMES_BOLD);
STANDARD_14.put(PDType1Font.TIMES_ITALIC.getBaseFont(), PDType1Font.TIMES_ITALIC);
STANDARD_14.put(PDType1Font.TIMES_BOLD_ITALIC.getBaseFont(),
PDType1Font.TIMES_BOLD_ITALIC);
STANDARD_14.put(PDType1Font.HELVETICA.getBaseFont(), PDType1Font.HELVETICA);
STANDARD_14.put(PDType1Font.HELVETICA_BOLD.getBaseFont(), PDType1Font.HELVETICA_BOLD);
STANDARD_14.put(PDType1Font.HELVETICA_OBLIQUE.getBaseFont(),
PDType1Font.HELVETICA_OBLIQUE);
STANDARD_14.put(PDType1Font.HELVETICA_BOLD_OBLIQUE.getBaseFont(),
PDType1Font.HELVETICA_BOLD_OBLIQUE);
STANDARD_14.put(PDType1Font.COURIER.getBaseFont(), PDType1Font.COURIER);
STANDARD_14.put(PDType1Font.COURIER_BOLD.getBaseFont(), PDType1Font.COURIER_BOLD);
STANDARD_14.put(PDType1Font.COURIER_OBLIQUE.getBaseFont(), PDType1Font.COURIER_OBLIQUE);
STANDARD_14.put(PDType1Font.COURIER_BOLD_OBLIQUE.getBaseFont(),
PDType1Font.COURIER_BOLD_OBLIQUE);
STANDARD_14.put(PDType1Font.SYMBOL.getBaseFont(), PDType1Font.SYMBOL);
STANDARD_14.put(PDType1Font.ZAPF_DINGBATS.getBaseFont(), PDType1Font.ZAPF_DINGBATS);
}
// duplicating until here
private String fontName = null;
private String defaultFont = null;
// The following code is based on the code in TextToPDF with the addition of
// checks for page limits.
// The calling code must close the PDDocument once finished with it.
@@ -369,16 +353,16 @@ public class TextToPdfContentTransformer implements CustomTransformerFileAdaptor
contentStream.setFont(font, fontSize);
contentStream.beginText();
y = page.getMediaBox().getHeight() - margin + height;
contentStream.moveTextPositionByAmount(margin, y);
contentStream.newLineAtOffset(margin, y);
}
if (contentStream == null)
{
throw new IOException("Error:Expected non-null content stream.");
}
contentStream.moveTextPositionByAmount(0, -height);
contentStream.newLineAtOffset(0, -height);
y -= height;
contentStream.drawString(nextLineToDraw.toString());
contentStream.showText(nextLineToDraw.toString());
}
}

View File

@@ -50,6 +50,7 @@ import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.nio.file.Files;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.junit.jupiter.api.BeforeEach;
@@ -388,7 +389,7 @@ public class MiscTest extends AbstractBaseTest
expected.getBytes());
// Read back in the PDF and check it
PDDocument doc = PDDocument.load(result.getResponse().getContentAsByteArray());
PDDocument doc = Loader.loadPDF(result.getResponse().getContentAsByteArray());
PDFTextStripper textStripper = new PDFTextStripper();
StringWriter textWriter = new StringWriter();
textStripper.writeText(doc, textWriter);

View File

@@ -67,6 +67,7 @@ import java.util.Map;
import java.util.Objects;
import java.util.stream.Stream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;
@@ -178,7 +179,7 @@ public class MiscTransformsIT
if (MIMETYPE_PDF.equals(targetMimetype))
{
// verify if PDF isn't corrupted
final PDDocument pdfFile = PDDocument.load(Objects.requireNonNull(response.getBody()).getInputStream());
final PDDocument pdfFile = Loader.loadPDF(Objects.requireNonNull(response.getBody()).getContentAsByteArray());
assertNotNull(pdfFile);
}
}

View File

@@ -57,6 +57,7 @@ import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import javax.imageio.stream.ImageInputStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.junit.jupiter.api.Assertions;
@@ -144,7 +145,7 @@ class ImageToPdfTransformerTest
transformer.transform(imageFile.mimetype, MIMETYPE_PDF, transformOptions.toMap(), sourceFile, targetFile, transformManager);
then(transformManager).shouldHaveNoInteractions();
try (PDDocument actualPdfDocument = PDDocument.load(targetFile))
try (PDDocument actualPdfDocument = Loader.loadPDF(targetFile))
{
int expectedNumberOfPages = calculateExpectedNumberOfPages(transformOptions, imageFile.firstPage(), imageFile.lastPage());
assertNotNull(actualPdfDocument);
@@ -245,7 +246,7 @@ class ImageToPdfTransformerTest
// when
transformer.transform(MIMETYPE_IMAGE_TIFF, MIMETYPE_PDF, transformOptions.toMap(), sourceFile, targetFile, transformManager);
try (PDDocument actualPdfDocument = PDDocument.load(targetFile))
try (PDDocument actualPdfDocument = Loader.loadPDF(targetFile))
{
PDRectangle finalExpectedPdfFormat = expectedPdfFormatRotator.apply(expectedPdfFormat.getWidth(), expectedPdfFormat.getHeight());
assertNotNull(actualPdfDocument);
@@ -262,7 +263,7 @@ class ImageToPdfTransformerTest
// when
transformer.transform(MIMETYPE_IMAGE_TIFF, MIMETYPE_PDF, transformOptions.toMap(), sourceFile, targetFile, transformManager);
try (PDDocument actualPdfDocument = PDDocument.load(targetFile))
try (PDDocument actualPdfDocument = Loader.loadPDF(targetFile))
{
BufferedImage actualImage = ImageIO.read(sourceFile);
assertNotNull(actualPdfDocument);
@@ -279,7 +280,7 @@ class ImageToPdfTransformerTest
// when
transformer.transform(MIMETYPE_IMAGE_TIFF, MIMETYPE_PDF, transformOptions.toMap(), sourceFile, targetFile, transformManager);
try (PDDocument actualPdfDocument = PDDocument.load(targetFile))
try (PDDocument actualPdfDocument = Loader.loadPDF(targetFile))
{
BufferedImage actualImage = ImageIO.read(sourceFile);
assertNotNull(actualPdfDocument);
@@ -314,7 +315,7 @@ class ImageToPdfTransformerTest
// when
transformer.transform(imageFile.mimetype, MIMETYPE_PDF, transformOptions.toMap(), source, targetFile, transformManager);
try (PDDocument actualPdfDocument = PDDocument.load(targetFile))
try (PDDocument actualPdfDocument = Loader.loadPDF(targetFile))
{
assertNotNull(actualPdfDocument);
assertEquals(expectedWidth, actualPdfDocument.getPage(0).getMediaBox().getWidth(), "Pdf width");

View File

@@ -48,8 +48,9 @@ import java.io.StringWriter;
import java.util.HashMap;
import java.util.Map;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.apache.pdfbox.text.PDFTextStripper;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -171,13 +172,13 @@ public class TextToPdfContentTransformerTest
writeToFile(sourceFile, TEXT_WITHOUT_A_BREVE, encoding, null, null);
Map<String, String> parameters = new HashMap<>();
parameters.put(PDF_FONT, PDType1Font.TIMES_BOLD.getName());
parameters.put(PDF_FONT, Standard14Fonts.FontName.TIMES_BOLD.getName());
parameters.put(PDF_FONT_SIZE, "30");
TransformCheckResult result = transformTextAndCheck(sourceFile, encoding, TEXT_WITHOUT_A_BREVE, String.valueOf(-1), true,
parameters, false);
assertEquals(result.getUsedFont(), PDType1Font.TIMES_BOLD.getName());
assertEquals(result.getUsedFont(), Standard14Fonts.FontName.TIMES_BOLD.getName());
assertNull(result.getErrorMessage());
}
@@ -200,7 +201,7 @@ public class TextToPdfContentTransformerTest
TransformCheckResult result = transformTextAndCheck(sourceFile, encoding, TEXT_WITHOUT_A_BREVE, String.valueOf(-1), true,
parameters, false);
assertEquals(result.getUsedFont(), PDType1Font.TIMES_ROMAN.getName());
assertEquals(result.getUsedFont(), Standard14Fonts.FontName.TIMES_ROMAN.getName());
assertNull(result.getErrorMessage());
}
@@ -218,14 +219,14 @@ public class TextToPdfContentTransformerTest
writeToFile(sourceFile, TEXT_WITH_A_BREVE, encoding, null, null);
Map<String, String> parameters = new HashMap<>();
parameters.put(PDF_FONT, PDType1Font.TIMES_BOLD.getName());
parameters.put(PDF_FONT, Standard14Fonts.FontName.TIMES_BOLD.getName());
TransformCheckResult result = transformTextAndCheck(sourceFile, encoding, TEXT_WITH_A_BREVE, String.valueOf(-1), true,
parameters, true);
assertEquals(result.getUsedFont(), PDType1Font.TIMES_BOLD.getName());
assertEquals(result.getUsedFont(), Standard14Fonts.FontName.TIMES_BOLD.getName());
assertNotNull(result.getErrorMessage());
assertTrue(result.getErrorMessage().contains(PDType1Font.TIMES_BOLD.getName()));
assertTrue(result.getErrorMessage().contains(Standard14Fonts.FontName.TIMES_BOLD.getName()));
}
/**
@@ -326,7 +327,7 @@ public class TextToPdfContentTransformerTest
if (!failed)
{
// Read back in the PDF and check it
PDDocument doc = PDDocument.load(targetFile);
PDDocument doc = Loader.loadPDF(targetFile);
PDFTextStripper textStripper = new PDFTextStripper();
StringWriter textWriter = new StringWriter();
textStripper.writeText(doc, textWriter);

View File

@@ -96,7 +96,7 @@ public class MailMetadataExtractor extends AbstractTikaMetadataExtractorEmbeddor
putRawValue(KEY_ORIGINATOR, metadata.get(TikaCoreProperties.CREATOR), properties);
putRawValue(KEY_SUBJECT, metadata.get(TikaCoreProperties.TITLE), properties);
putRawValue(KEY_DESCRIPTION, metadata.get(TikaCoreProperties.SUBJECT), properties);
putRawValue(KEY_SENT_DATE, metadata.get(TikaCoreProperties.MODIFIED), properties);
putRawValue(KEY_SENT_DATE, metadata.get(TikaCoreProperties.CREATED), properties);
// Store the TO, but not cc/bcc in the addressee field
putRawValue(KEY_ADDRESSEE, metadata.get(Message.MESSAGE_TO), properties);

View File

@@ -197,7 +197,7 @@ public class ExifToolParser extends ExternalParser
TemporaryResources tmp = new TemporaryResources();
try
{
TikaInputStream tis = TikaInputStream.get(stream, tmp);
TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
if (this.getSupportedTypes().contains(mediaType))
{

10
pom.xml
View File

@@ -21,11 +21,11 @@
<acs-compatible.java.version>11</acs-compatible.java.version>
<image.tag>latest</image.tag>
<image.registry>quay.io</image.registry>
<dependency.pdfbox.version>2.0.30</dependency.pdfbox.version>
<dependency.pdfbox.version>3.0.5</dependency.pdfbox.version>
<dependency.alfresco-jodconverter-core.version>3.0.1.20</dependency.alfresco-jodconverter-core.version>
<env.project_version>${project.version}</env.project_version>
<dependency.jackson.version>2.19.2</dependency.jackson.version>
<dependency.tika.version>2.9.2</dependency.tika.version>
<dependency.tika.version>3.2.3</dependency.tika.version>
<dependency.poi.version>5.4.1</dependency.poi.version>
<dependency.commons-io.version>2.20.0</dependency.commons-io.version>
<dependency.imaging.version>1.0.0-alpha6</dependency.imaging.version>
@@ -135,6 +135,12 @@
<dependencyManagement>
<dependencies>
<!-- xz is pinned to 1.9: v1.10 is released under the 0BSD license, which must be reviewed by Legal before upgrading -->
<dependency>
<groupId>org.tukaani</groupId>
<artifactId>xz</artifactId>
<version>1.9</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>