initial commit

This commit is contained in:
Brian Long 2022-02-24 14:49:20 -05:00
commit e9f53ab5f7
21 changed files with 1410 additions and 0 deletions

9
.gitignore vendored Normal file
View File

@ -0,0 +1,9 @@
# Eclipse
.project
.classpath
.settings
# Maven
target
pom.xml.versionsBackup

103
pom.xml Normal file
View File

@ -0,0 +1,103 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.inteligr8.alfresco</groupId>
<artifactId>pdfmeta-tengine</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<name>pdfmeta Alfresco T-Engine</name>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<ats.version>2.3.6</ats.version>
<spring-boot.version>2.3.5.RELEASE</spring-boot.version>
<docker.image.registry>docker.inteligr8.com</docker.image.registry>
<docker.image.name>inteligr8/${project.artifactId}</docker.image.name>
<docker.image.tag>${project.version}</docker.image.tag>
<ate.app.className>com.inteligr8.alfresco.pdfmeta.Application</ate.app.className>
</properties>
<dependencies>
<dependency>
<groupId>com.inteligr8.ootbee</groupId>
<artifactId>beedk-ate-springboot</artifactId>
<version>[1.0.0,2.0.0)</version>
<type>pom</type>
</dependency>
<dependency>
<groupId>org.alfresco</groupId>
<artifactId>alfresco-transform-tika</artifactId>
<version>${ats.version}</version>
</dependency>
<dependency>
<groupId>com.inteligr8.ootbee</groupId>
<artifactId>beedk-ate-springboot-test</artifactId>
<version>[1.0.0,2.0.0)</version>
<type>pom</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>io.repaint.maven</groupId>
<artifactId>tiles-maven-plugin</artifactId>
<version>2.26</version>
<extensions>true</extensions>
<configuration>
<tiles>
<!-- Documentation: https://bitbucket.org/inteligr8/ootbee-beedk/src/stable/beedk-ate-springboot-tile -->
<tile>com.inteligr8.ootbee:beedk-ate-springboot-tile:[1.0.0,2.0.0)</tile>
<!-- Documentation: https://bitbucket.org/inteligr8/ootbee-beedk/src/stable/beedk-ate-docker-tile -->
<tile>com.inteligr8.ootbee:beedk-ate-docker-tile:[1.0.0,2.0.0)</tile>
<!-- Enables rapid application development support -->
<tile>com.inteligr8.ootbee:beedk-ate-self-rad-tile:[1.0.0,2.0.0)</tile>
</tiles>
</configuration>
</plugin>
</plugins>
</build>
<!--
Artifact repositories used to resolve the com.inteligr8.ootbee dependencies/tiles
and the org.alfresco transform artifacts declared above.
NOTE(review): the inteligr8 repositories below use plain http:// URLs. Maven 3.8.1
and later block external HTTP repositories by default, so resolution may fail on
newer Maven versions. Confirm whether these hosts are reachable over https://.
-->
<repositories>
<repository>
<id>inteligr8-releases</id>
<url>http://repos.inteligr8.com/nexus/repository/inteligr8-public</url>
<!-- release artifacts only; snapshots are resolved from the repository below -->
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
<repository>
<id>inteligr8-snapshots</id>
<url>http://repos.inteligr8.com/nexus/repository/inteligr8-snapshots</url>
<!-- snapshot artifacts only -->
<releases>
<enabled>false</enabled>
</releases>
</repository>
<repository>
<id>alfresco-public</id>
<url>https://artifacts.alfresco.com/nexus/content/groups/public</url>
</repository>
</repositories>
<!-- Plugin resolution (e.g. the tiles referenced by tiles-maven-plugin) also uses the inteligr8 public repository -->
<pluginRepositories>
<pluginRepository>
<id>inteligr8-releases</id>
<url>http://repos.inteligr8.com/nexus/repository/inteligr8-public</url>
</pluginRepository>
</pluginRepositories>
</project>

View File

@ -0,0 +1,28 @@
FROM docker.inteligr8.com/inteligr8/ubuntu-jdk:20.04-11

# Set default user information
ARG JAR_FILE
ARG APPGROUPNAME=alfresco
ARG APPGROUPID=1000
ARG APPUSERNAME=atengine
ARG APPUSERID=33001

# JVM options; may be overridden at container start (e.g. `docker run -e JAVA_OPTS=...`)
ENV JAVA_OPTS="-Xmx128m"
# NOTE(review): ${project.artifactId} is not a Docker variable; presumably it is
# substituted by the Maven build before the image is built - confirm.
ENV JAR_PATH=/usr/local/bin/${project.artifactId}.jar

COPY ${JAR_FILE} ${JAR_PATH}

# Install your engine's dependencies here
#RUN apt update && \
# apt -y install {dependency names in APT repository}

# Create the unprivileged account that runs the engine and give it ownership of the JAR
RUN groupadd -g ${APPGROUPID} ${APPGROUPNAME} && \
    useradd -u ${APPUSERID} -G ${APPGROUPNAME} ${APPUSERNAME} && \
    chown ${APPUSERNAME}:${APPGROUPNAME} ${JAR_PATH}

EXPOSE 8090

USER ${APPUSERNAME}

# The shell form is needed so that ${JAVA_OPTS} and ${JAR_PATH} are expanded at
# start-up. `exec` replaces the intermediate /bin/sh with the JVM so that java
# runs as PID 1 and receives SIGTERM from `docker stop` for a graceful shutdown.
ENTRYPOINT exec java ${JAVA_OPTS} -jar ${JAR_PATH}

View File

@ -0,0 +1,67 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package com.inteligr8.alfresco.pdfmeta;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.actuate.autoconfigure.metrics.MeterRegistryCustomizer;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
import org.springframework.boot.context.event.ApplicationReadyEvent;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ImportResource;
import org.springframework.context.event.EventListener;
import io.micrometer.core.instrument.MeterRegistry;
@SpringBootApplication
@EnableAutoConfiguration(exclude = { DataSourceAutoConfiguration.class })
@ImportResource({"classpath*:application-context.xml"})
public class Application {

    private final Logger logger = LoggerFactory.getLogger(Application.class);

    /** Value of the {@code container.name} property; attached to every metric as a common tag. */
    @Value("${container.name}")
    private String containerName;

    /**
     * Boots the Spring application.
     *
     * @param args command line arguments, passed through to Spring Boot unchanged
     */
    public static void main(String[] args) {
        SpringApplication.run(Application.class, args);
    }

    /**
     * Registers a customizer that adds a {@code containerName} common tag to
     * the meter registry configuration.
     *
     * @return the meter registry customizer bean
     */
    @Bean
    public MeterRegistryCustomizer<MeterRegistry> metricsCommonTags() {
        return registry -> {
            registry.config().commonTags("containerName", this.containerName);
        };
    }

    /**
     * Logs a message once the application reports that it is ready.
     */
    @EventListener(ApplicationReadyEvent.class)
    public void startup() {
        this.logger.info("Starting application components... Done");
    }
}

View File

@ -0,0 +1,37 @@
package com.inteligr8.alfresco.pdfmeta;
import java.util.Collections;
import java.util.Map;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.inteligr8.alfresco.pdfmeta.util.DynamicDiscoveryMap;
/**
 * A mapping wrapper that treats any looked-up key of the form
 * {@code prefix:name} (exactly one colon) as mapping to itself, in addition
 * to whatever the wrapped map already contains.
 */
public class AutoPropertyMappingWrapper extends DynamicDiscoveryMap<String, Set<String>> {

    private final Logger logger = LoggerFactory.getLogger(this.getClass());

    /**
     * @param map the explicit mapping to wrap
     */
    public AutoPropertyMappingWrapper(Map<String, Set<String>> map) {
        super(map);
    }

    /**
     * Registers {@code key -> {key}} as an extra entry when the key contains
     * exactly one colon; otherwise records the key as having no mapping.
     *
     * @param key the key being looked up; may be {@code null}
     */
    @Override
    protected void discoverExtraEntry(String key) {
        this.logger.trace("Looking up key as ACS property: {}", key);

        int firstColon = key == null ? -1 : key.indexOf(':');
        boolean exactlyOneColon = firstColon >= 0 && key.indexOf(':', firstColon + 1) < 0;

        if (!exactlyOneColon) {
            // zero colons or more than one; not formatted like an ACS property
            this.nullKeys.add(key);
            return;
        }

        this.logger.debug("Adding key/property to mapping: {}", key);
        this.extMap.put(key, Collections.singleton(key));
    }
}

View File

@ -0,0 +1,47 @@
package com.inteligr8.alfresco.pdfmeta;
import java.io.File;
import java.util.Map;
import javax.annotation.PostConstruct;
import org.alfresco.transformer.executors.Transformer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.MediaType;
import org.springframework.stereotype.Component;
/**
 * Transformer component that extracts metadata from PDF source files by
 * delegating to a {@link PdfTikaMetadataExtractor}.
 */
@Component
public class PdfMetaTransformer implements Transformer {

    private final Logger logger = LoggerFactory.getLogger(PdfMetaTransformer.class);
    private final String id = "pdfmeta";

    private PdfTikaMetadataExtractor extractor;

    /**
     * Creates the extractor once the component has been constructed.
     */
    @PostConstruct
    public void init() throws Exception {
        // parameter-less message; no level guard is needed
        this.logger.debug("init()");
        this.extractor = new PdfTikaMetadataExtractor();
    }

    /**
     * @return the identifier of this transformer ({@code "pdfmeta"})
     */
    @Override
    public String getTransformerId() {
        return this.id;
    }

    /**
     * Extracts metadata from a PDF source file and writes it to the target file.
     *
     * @param transformName    the requested transform name; only logged here
     * @param sourceMimetype   must be {@code application/pdf}
     * @param targetMimetype   the requested target mimetype; only logged here
     * @param transformOptions the request options; forwarded to the extractor
     * @param sourceFile       the PDF to read
     * @param targetFile       the file the extractor writes to
     * @throws IllegalArgumentException if the source mimetype is not {@code application/pdf}
     * @throws Exception                if the extractor fails
     */
    @Override
    public void extractMetadata(String transformName, String sourceMimetype, String targetMimetype, Map<String, String> transformOptions, File sourceFile, File targetFile) throws Exception {
        this.logger.trace("extractMetadata({}, {}, {}, {}, '{}', '{}')", transformName, sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile);

        if (!MediaType.APPLICATION_PDF_VALUE.equals(sourceMimetype)) {
            // say what was rejected so the failure can be diagnosed from the logs
            throw new IllegalArgumentException("Unsupported source mimetype '" + sourceMimetype
                    + "'; only '" + MediaType.APPLICATION_PDF_VALUE + "' is supported");
        }

        if (transformOptions != null && !transformOptions.isEmpty())
            this.logger.debug("Transform options were specified, but they will be ignored: {}", transformOptions);

        this.extractor.extractMetadata(sourceMimetype, transformOptions, sourceFile, targetFile);
    }
}

View File

@ -0,0 +1,207 @@
package com.inteligr8.alfresco.pdfmeta;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import javax.xml.namespace.NamespaceContext;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.dom4j.io.DOMSAXContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
* This is a custom implementation that extracts more meta-data from a PDF
* than the built-in PDF extractor provided OOTB by Alfresco. Namely, it
* extracts the bookmarks, in addition to the property and text extraction
* provided OOTB.
*
* @author brian@inteligr8.com
*/
public class PdfTikaContentHandler extends DOMSAXContentHandler {

    private final Logger logger = LoggerFactory.getLogger(this.getClass());

    // NOTE(review): the JAXP documentation states that XPathExpression objects
    // are not thread-safe; these are shared by every handler instance. Confirm
    // whether handlers can be evaluated concurrently in this application.
    private static final XPathExpression xpathExprBodyPageText;
    private static final XPathExpression xpathExprBodyUl;
    private static final XPathExpression xpathExprLiText;
    private static final XPathExpression xpathExprText;

    /** Guards the lazy initialization of {@link #texts}. */
    private final Object parseTextSync = new Object();
    private List<String> texts;

    /** Guards the lazy initialization of {@link #bookmarks}. */
    private final Object parseBookmarkSync = new Object();
    private Map<String, Serializable> bookmarks;

    static {
        XPath xpath = XPathFactory.newInstance().newXPath();
        xpath.setNamespaceContext(new TikaNamespaceContext());

        try {
            xpathExprBodyPageText = xpath.compile("/ns:html/ns:body/ns:div[@name=\"page\"]/text()");
            xpathExprBodyUl = xpath.compile("/ns:html/ns:body/ns:ul");
            xpathExprLiText = xpath.compile("ns:li/text()");
            xpathExprText = xpath.compile("text()");
        } catch (XPathExpressionException xpee) {
            throw new ExceptionInInitializerError(xpee);
        }
    }

    /**
     * Returns the text found for a single page.
     *
     * @param page the 1-based index into the list of page texts
     * @return the text, or {@code null} when the document yielded no page text at all
     * @throws IndexOutOfBoundsException if page text exists but {@code page} is out of range
     */
    public String getTextByPage(int page) {
        this.parseText();
        // parseBodyForTexts() yields null when there are no page text nodes;
        // report "no text" instead of failing with a NullPointerException
        if (this.texts == null)
            return null;
        return this.texts.get(page - 1);
    }

    /**
     * @return the page texts in document order, or {@code null} when the
     *         document yielded no page text
     */
    public List<String> getTextPerPages() {
        this.parseText();
        return this.texts;
    }

    /**
     * Lazily extracts the page texts from the parsed document.
     *
     * @throws IllegalStateException if the XPath evaluation fails
     */
    private void parseText() {
        synchronized (this.parseTextSync) {
            if (this.texts == null) {
                try {
                    this.texts = this.parseBodyForTexts(this.getDocument());
                } catch (XPathExpressionException xpee) {
                    throw new IllegalStateException(xpee);
                }
            }
        }
    }

    /**
     * Lazily extracts the bookmarks from the parsed document.
     *
     * @return a map of bookmark name to either a single {@code String} value or
     *         a {@code List} of values; {@code null} when the document body has
     *         no {@code ul} element
     * @throws IllegalStateException if the XPath evaluation fails
     */
    public Map<String, Serializable> getBookmarks() {
        synchronized (this.parseBookmarkSync) {
            if (this.bookmarks == null) {
                try {
                    this.bookmarks = this.parseBodyForBookmarks(this.getDocument());
                } catch (XPathExpressionException xpee) {
                    throw new IllegalStateException(xpee);
                }
            }
        }

        return this.bookmarks;
    }

    /**
     * Selects the text nodes of the {@code div name="page"} elements in the body.
     *
     * @return the texts, or {@code null} when none were selected
     */
    private List<String> parseBodyForTexts(Document document) throws XPathExpressionException {
        NodeList pageTexts = (NodeList)xpathExprBodyPageText.evaluate(document, XPathConstants.NODESET);
        if (pageTexts == null || pageTexts.getLength() == 0)
            return null;
        return this.parseTexts(pageTexts);
    }

    /**
     * Copies the text content of each node into a list, preserving order.
     */
    private List<String> parseTexts(NodeList pageTexts) throws XPathExpressionException {
        List<String> texts = new LinkedList<>();
        for (int n = 0; n < pageTexts.getLength(); n++) {
            Node node = pageTexts.item(n);
            texts.add(node.getTextContent());
        }
        return texts;
    }

    /**
     * Locates the {@code ul} element directly under the body and parses it
     * for bookmarks.
     *
     * @return the bookmarks, or {@code null} when there is no such element
     */
    private Map<String, Serializable> parseBodyForBookmarks(Document document) throws XPathExpressionException {
        this.logger.debug("Extracting bookmarks from the XML embedded in a PDF");

        Element element = (Element)xpathExprBodyUl.evaluate(document, XPathConstants.NODE);
        if (element == null)
            return null;
        return this.parseBookmarks(element);
    }

    /**
     * Walks the children of the given {@code ul}; each {@code li} names a
     * bookmark and the following {@code ul} holds its values.
     */
    private Map<String, Serializable> parseBookmarks(Element ulElement) throws XPathExpressionException {
        Map<String, Serializable> bookmarks = new LinkedHashMap<>();
        Element lastBookmarkKey = null;
        Element lastBookmarkValues = null;

        NodeList nodes = ulElement.getChildNodes();
        this.logger.debug("Found {} XML nodes; filtering down to just bookmarks ...", nodes.getLength());

        for (int n = 0; n < nodes.getLength(); n++) {
            Node node = nodes.item(n);
            if (node.getNodeType() != Node.ELEMENT_NODE)
                continue;

            // constant-first comparison; getLocalName() may legitimately be null
            String localName = node.getLocalName();
            if ("li".equals(localName)) {
                lastBookmarkKey = (Element)node;
            } else if ("ul".equals(localName)) {
                lastBookmarkValues = (Element)node;
            }

            if (lastBookmarkKey != null && lastBookmarkValues != null) {
                this.parseBookmark(lastBookmarkKey, lastBookmarkValues, bookmarks);
                lastBookmarkKey = null;
                lastBookmarkValues = null;
            }
        }

        return bookmarks;
    }

    /**
     * Adds every {@code li} text under {@code ulElement} as a value of the
     * bookmark named by the text of {@code liElement}. The first value is
     * stored as a plain string; further values turn the entry into a list.
     */
    @SuppressWarnings("unchecked")
    private void parseBookmark(Element liElement, Element ulElement, Map<String, Serializable> bookmarks)
            throws XPathExpressionException {
        String bookmarkKey = (String)xpathExprText.evaluate(liElement, XPathConstants.STRING);

        NodeList nodes = (NodeList)xpathExprLiText.evaluate(ulElement, XPathConstants.NODESET);
        for (int n = 0; n < nodes.getLength(); n++) {
            String value = nodes.item(n).getNodeValue();
            this.logger.trace("Found bookmark value: {} => {}", bookmarkKey, value);

            // re-read on every iteration: the entry changes as values are added,
            // and a stale snapshot would make later values overwrite earlier ones
            Serializable existing = bookmarks.get(bookmarkKey);
            if (existing == null) {
                bookmarks.put(bookmarkKey, value);
            } else if (existing instanceof List) {
                ((List<Serializable>)existing).add(value);
            } else {
                LinkedList<Serializable> bookmarkValues = new LinkedList<>();
                bookmarkValues.add(existing);
                bookmarkValues.add(value);
                bookmarks.put(bookmarkKey, bookmarkValues);
            }
        }
    }

    /**
     * Namespace context binding the {@code ns} prefix to the XHTML namespace,
     * as used by the XPath expressions of the enclosing class.
     */
    private static class TikaNamespaceContext implements NamespaceContext {

        private final Map<String, String> prefix2uri = new HashMap<>();
        private final Map<String, String> uri2prefix = new HashMap<>();

        public TikaNamespaceContext() {
            this.bind("ns", "http://www.w3.org/1999/xhtml");
        }

        /** Records the binding in both directions so reverse look-ups work too. */
        private void bind(String prefix, String uri) {
            this.prefix2uri.put(prefix, uri);
            this.uri2prefix.put(uri, prefix);
        }

        @Override
        public String getNamespaceURI(String prefix) {
            return this.prefix2uri.get(prefix);
        }

        @Override
        public String getPrefix(String namespaceURI) {
            return this.uri2prefix.get(namespaceURI);
        }

        @Override
        public Iterator<String> getPrefixes(String namespaceURI) {
            String prefix = this.uri2prefix.get(namespaceURI);
            // an unbound namespace has no prefixes: return an empty iterator
            // rather than an iterator over a single null element
            List<String> prefixes = prefix == null ? Arrays.<String>asList() : Arrays.asList(prefix);
            return prefixes.iterator();
        }

    }

}

View File

@ -0,0 +1,174 @@
package com.inteligr8.alfresco.pdfmeta;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import org.alfresco.transformer.metadataExtractors.PdfBoxMetadataExtractor;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.PDF;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * A PDF metadata extractor that, in addition to the Tika metadata, contributes
 * the bookmarks collected by a {@link PdfTikaContentHandler} to the raw
 * properties, and wraps the extract mapping in an
 * {@link AutoPropertyMappingWrapper}.
 */
public class PdfTikaMetadataExtractor extends PdfBoxMetadataExtractor {

    private final Logger logger = LoggerFactory.getLogger(this.getClass());

    /**
     * @return a new content handler to receive the parser output
     */
    protected PdfTikaContentHandler newContentHandler() {
        return new PdfTikaContentHandler();
    }

    /**
     * Delegates to the superclass after wrapping the mapping in an
     * {@link AutoPropertyMappingWrapper}.
     */
    @Override
    public void mapMetadataAndWrite(File targetFile, Map<String, Serializable> metadata, Map<String, Set<String>> extractMapping) throws IOException {
        super.mapMetadataAndWrite(targetFile, metadata, new AutoPropertyMappingWrapper(extractMapping));
    }

    /**
     * Parses the source file with Tika and collects the raw properties from
     * both the Tika metadata and the content handler.
     * <p>
     * This is a copy of the AbstractTikaMetadataExtractor from ATS v2.3.6.
     *
     * @param sourceMimetype   the mimetype registered as the content type for the parser
     * @param transformOptions not used by this method
     * @param sourceFile       the file to parse
     * @return the raw properties found
     * @throws Exception if the file cannot be read or parsed
     * @see <a href="https://github.com/Alfresco/alfresco-transform-core/blob/2.3.6/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractTikaMetadataExtractor.java">AbstractTikaMetadataExtractor (ATS 2.3.6)</a>
     */
    @Override
    public Map<String, Serializable> extractMetadata(String sourceMimetype, Map<String, String> transformOptions, File sourceFile) throws Exception {
        this.logger.trace("extractRaw({}, '{}')", sourceMimetype, sourceFile);

        Map<String, Serializable> rawProperties = new HashMap<>();

        // try-with-resources closes the stream on every exit path
        try (InputStream istream = new FileInputStream(sourceFile)) {
            Parser parser = this.getParser();

            Metadata metadata = new Metadata();
            metadata.add(Metadata.CONTENT_TYPE, sourceMimetype);

            ParseContext context = this.buildParseContext(metadata, sourceMimetype);

            PdfTikaContentHandler handler = this.newContentHandler();

            this.logger.debug("Parsing {}b PDF using Apache Tika: {}", sourceFile.length(), sourceFile);
            parser.parse(istream, handler, metadata, context);
            this.logger.trace("Parsed {}b PDF using Apache Tika: {}", sourceFile.length(), sourceFile);
            this.logger.debug("Parsed PDF has meta-data: {}", Arrays.asList(metadata.names()));

            this.processMetadata(metadata, rawProperties);

            // this is the processing of that major difference
            this.processHandler(handler, rawProperties);

            // full properties at TRACE; only the property names at DEBUG
            if (this.logger.isTraceEnabled()) {
                this.logger.trace("Parsed PDF has properties: {}", rawProperties);
            } else if (this.logger.isDebugEnabled()) {
                this.logger.debug("Parsed PDF has properties: {}", rawProperties.keySet());
            }
        }

        return rawProperties;
    }

    /**
     * Builds the parse context, registering a document selector when one is
     * available.
     * <p>
     * This is a copy of the AbstractTikaMetadataExtractor from ATS v2.3.6.
     *
     * @see <a href="https://github.com/Alfresco/alfresco-transform-core/blob/2.3.6/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractTikaMetadataExtractor.java">AbstractTikaMetadataExtractor (ATS 2.3.6)</a>
     */
    private ParseContext buildParseContext(Metadata metadata, String sourceMimeType) {
        ParseContext context = new ParseContext();
        DocumentSelector selector = this.getDocumentSelector(metadata, sourceMimeType);
        if (selector != null)
            context.set(DocumentSelector.class, selector);
        return context;
    }

    /**
     * Copies every Tika metadata entry into the raw properties and then adds
     * the well-known keys (title, comments, tags, subject, description,
     * created, author).
     * <p>
     * This is a copy of the AbstractTikaMetadataExtractor from ATS v2.3.6.
     * One slight difference is we are now using the Property object instead of
     * the deprecated String object.
     *
     * @param metadata      the metadata populated by the parser
     * @param rawProperties the map to add the values to
     * @return the map returned by {@code extractSpecific}
     * @see <a href="https://github.com/Alfresco/alfresco-transform-core/blob/2.3.6/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/metadataExtractors/AbstractTikaMetadataExtractor.java">AbstractTikaMetadataExtractor (ATS 2.3.6)</a>
     */
    protected Map<String, Serializable> processMetadata(Metadata metadata, Map<String, Serializable> rawProperties) {
        for (String tikaKey : metadata.names())
            this.putRawValue(tikaKey, this.getMetadataValue(metadata, tikaKey), rawProperties);

        this.putRawValue(KEY_TITLE, this.getMetadataValue(metadata, TikaCoreProperties.TITLE), rawProperties);
        this.putRawValue(KEY_COMMENTS, this.getMetadataValue(metadata, TikaCoreProperties.COMMENTS), rawProperties);
        this.putRawValue(KEY_TAGS, this.getMetadataValue(metadata, KEY_TAGS), rawProperties);
        this.putRawValue(KEY_SUBJECT, this.getMetadataValue(metadata, PDF.DOC_INFO_SUBJECT), rawProperties);
        this.putRawValue(KEY_DESCRIPTION, this.getMetadataValue(metadata, TikaCoreProperties.DESCRIPTION), rawProperties);
        this.putRawValue(KEY_CREATED, this.getMetadataValue(metadata, Property.composite(PDF.DOC_INFO_CREATED, new Property[] {DublinCore.CREATED, Office.CREATION_DATE, TikaCoreProperties.METADATA_DATE})), rawProperties);
        this.putRawValue(KEY_AUTHOR, this.getMetadataValue(metadata, Property.composite(PDF.DOC_INFO_CREATOR, new Property[] {DublinCore.CREATOR, Office.AUTHOR})), rawProperties);

        rawProperties = extractSpecific(metadata, rawProperties, null);
        return rawProperties;
    }

    /**
     * Adds the bookmarks collected by the handler, if any, to the raw properties.
     *
     * @param handler       the handler that received the parser output
     * @param rawProperties the map to add the bookmarks to
     */
    protected void processHandler(PdfTikaContentHandler handler, Map<String, Serializable> rawProperties) {
        // TODO this should be more flexible than just a two-layer name/value pair
        Map<String, Serializable> bookmarks = handler.getBookmarks();
        if (bookmarks == null) {
            this.logger.debug("Found no bookmarks in PDF");
            return;
        }

        this.logger.debug("Found {} bookmarks in PDF", bookmarks.size());
        this.logger.trace("Found bookmarks in PDF: {}", bookmarks);
        rawProperties.putAll(bookmarks);
    }

    /**
     * Returns the trimmed value for a key, joining multiple values; blank
     * results become {@code null}.
     * <p>
     * This method is all effectively copied/translated from Alfresco Community v6.2.
     */
    private String getMetadataValue(Metadata metadata, String key) {
        if (metadata.isMultiValued(key)) {
            String[] parts = metadata.getValues(key);
            return StringUtils.trimToNull(this.getMetadataMultiValue(parts));
        } else {
            return StringUtils.trimToNull(metadata.get(key));
        }
    }

    /**
     * This method is a mirror of the one above, but with a Property instead of
     * String parameter.
     */
    private String getMetadataValue(Metadata metadata, Property prop) {
        if (metadata.isMultiValued(prop)) {
            String[] parts = metadata.getValues(prop);
            return StringUtils.trimToNull(this.getMetadataMultiValue(parts));
        } else {
            return StringUtils.trimToNull(metadata.get(prop));
        }
    }

    /**
     * Joins the distinct values, in first-seen order, separated by {@code ", "}.
     * <p>
     * This method is all effectively copied/translated from Alfresco Community v6.2.
     */
    private String getMetadataMultiValue(String[] parts) {
        // use Set to prevent duplicates
        Set<String> value = new LinkedHashSet<>(Arrays.asList(parts));
        return String.join(", ", value);
    }

}

View File

@ -0,0 +1,191 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2020 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*
* Copyright (C) 2020 - 2021 Inteligr8
*/
package com.inteligr8.alfresco.pdfmeta;
import static org.alfresco.transformer.util.RequestParamMap.FILE;
import static org.alfresco.transformer.util.RequestParamMap.SOURCE_MIMETYPE;
import static org.alfresco.transformer.util.RequestParamMap.TARGET_EXTENSION;
import static org.alfresco.transformer.util.RequestParamMap.TARGET_MIMETYPE;
import static org.alfresco.transformer.util.RequestParamMap.TEST_DELAY;
import static org.alfresco.transformer.util.RequestParamMap.TRANSFORM_NAME_PROPERTY;
import static org.springframework.http.MediaType.MULTIPART_FORM_DATA_VALUE;
import java.io.File;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.PostConstruct;
import javax.servlet.http.HttpServletRequest;
import org.alfresco.transformer.AbstractTransformerController;
import org.alfresco.transformer.probes.ProbeTestTransform;
import org.alfresco.transformer.util.MimetypeMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
import org.springframework.http.HttpStatus;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.client.HttpClientErrorException;
import org.springframework.web.client.HttpServerErrorException;
import org.springframework.web.multipart.MultipartFile;
/**
* Controller for the Spring Boot transformer.
*
* Status Codes:
*
* 200 Success
* 400 Bad Request: Request parameter <name> is missing (missing mandatory parameter)
* 400 Bad Request: Request parameter <name> is of the wrong type
* 400 Bad Request: Transformer exit code was not 0 (possible problem with the source file)
* 400 Bad Request: The source filename was not supplied
* 500 Internal Server Error: (no message with low level IO problems)
* 500 Internal Server Error: The target filename was not supplied (should not happen as targetExtension is checked)
* 500 Internal Server Error: Transformer version check exit code was not 0
* 500 Internal Server Error: Transformer version check failed to create any output
* 500 Internal Server Error: Could not read the target file
* 500 Internal Server Error: The target filename was malformed (should not happen because of other checks)
* 500 Internal Server Error: Transformer failed to create an output file (the exit code was 0, so there should be some content)
* 500 Internal Server Error: Filename encoding error
* 507 Insufficient Storage: Failed to store the source file
*/
@Controller
public class TransformerController extends AbstractTransformerController {

    private final Logger logger = LoggerFactory.getLogger(TransformerController.class);

    /** Captures the text after the last dot of a name as group 1. */
    private final Pattern fileext = Pattern.compile("\\.([^\\.]+)$");

    @Autowired
    private PdfMetaTransformer transformer;

    @Value("${transform.pdfmeta.version}")
    private String version;

    private ProbeTestTransform probe;

    @Override
    public String getTransformerName() {
        return "pdfmeta";
    }

    /**
     * @return the value of the {@code transform.pdfmeta.version} property
     */
    @Override
    public String version() {
        return this.version;
    }

    /**
     * Creates the probe test transform once the controller has been constructed.
     */
    @PostConstruct
    public void initProbe() {
        this.probe = new ProbeTestTransform(this, "quick.pdf", "quick",
                7455L, 1024L, 150, 10240L, 60L * 20L + 1L, 60L * 15L - 15L) {
            @Override
            protected void executeTransformCommand(File sourceFile, File targetFile) {
                logger.trace("getProbeTestTransform().executeTransformCommand('{}', '{}')", sourceFile, targetFile);
                // FIXME the probe does not execute any transform yet
            }
        };
    }

    @Override
    public ProbeTestTransform getProbeTestTransform() {
        this.logger.trace("getProbeTestTransform()");
        return this.probe;
    }

    @Override
    protected String getTransformerName(final File sourceFile, final String sourceMimetype, final String targetMimetype, final Map<String, String> transformOptions) {
        this.logger.trace("getTransformerName('{}', {}, {}, {})", sourceFile, sourceMimetype, targetMimetype, transformOptions);

        // does not matter what value is returned, as it is not used because there is only one.
        return this.getTransformerName();
    }

    /**
     * This override simply makes targetExtension optional; it defaults to
     * {@code json} when not supplied.
     */
    @Override
    @SuppressWarnings("deprecation")
    @PostMapping(value = "/transform", consumes = MULTIPART_FORM_DATA_VALUE)
    public ResponseEntity<Resource> transform(HttpServletRequest request,
            @RequestParam(FILE) MultipartFile sourceMultipartFile,
            @RequestParam(value = TARGET_EXTENSION, required = false) String targetExtension,
            @RequestParam(value = SOURCE_MIMETYPE, required = false) String sourceMimetype,
            @RequestParam(value = TARGET_MIMETYPE, required = false) String targetMimetype,
            @RequestParam Map<String, String> requestParameters,
            @RequestParam (value = TEST_DELAY, required = false) Long testDelay,

            // The TRANSFORM_NAME_PROPERTY param allows ACS legacy transformers to specify which transform to use,
            // It can be removed once legacy transformers are removed from ACS.
            @RequestParam (value = TRANSFORM_NAME_PROPERTY, required = false) String requestTransformName) {
        if (targetExtension == null)
            targetExtension = "json";
        return super.transform(request, sourceMultipartFile, targetExtension, sourceMimetype, targetMimetype, requestParameters, testDelay, requestTransformName);
    }

    /**
     * Dispatches to metadata extraction or to a regular transform, deriving
     * missing mimetypes from the file extensions.
     *
     * @param transformName    the name of the requested transform
     * @param sourceMimetype   the source mimetype; derived from the source file name when {@code null}
     * @param targetMimetype   the target mimetype; derived from the target file name when {@code null},
     *                         defaulting to metadata extraction when that name has no extension
     * @param transformOptions the request options
     * @param sourceFile       the file to read
     * @param targetFile       the file to write
     * @throws HttpClientErrorException with 400 when the transformer rejects its arguments or the
     *                                  target mimetype cannot be determined
     * @throws HttpServerErrorException with 500 on any other failure
     */
    @Override
    public void transformImpl(String transformName, String sourceMimetype, String targetMimetype, Map<String, String> transformOptions, File sourceFile, File targetFile) {
        this.logger.trace("transformImpl({}, {}, {}, {}, '{}', '{}')", transformName, sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile);

        if (sourceMimetype == null)
            sourceMimetype = this.mimetypeFromExtension(sourceFile, null);
        if (targetMimetype == null)
            targetMimetype = this.mimetypeFromExtension(targetFile, MimetypeMap.MIMETYPE_METADATA_EXTRACT);

        try {
            // an unrecognized target extension leaves the mimetype unresolved;
            // reject it explicitly instead of failing on a null dereference
            if (targetMimetype == null)
                throw new IllegalArgumentException("The target mimetype could not be determined for: " + targetFile);

            if (MimetypeMap.MIMETYPE_METADATA_EXTRACT.equals(targetMimetype)) {
                this.transformer.extractMetadata(transformName, sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile);
            } else {
                this.transformer.transform(transformName, sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile);
            }
        } catch (IllegalArgumentException iae) {
            // the HTTP exception carries no detail, so record the reason here
            this.logger.debug("Rejecting the transform request as a bad request", iae);
            throw new HttpClientErrorException(HttpStatus.BAD_REQUEST);
        } catch (Exception e) {
            this.logger.error("Transform failed: " + sourceFile + " => " + targetFile, e);
            throw new HttpServerErrorException(HttpStatus.INTERNAL_SERVER_ERROR);
        }
    }

    /**
     * Maps the extension of a file name to a mimetype.
     *
     * @param file               the file whose name is inspected
     * @param noExtensionDefault the value to return when the name has no extension
     * @return the mimetype; {@code null} when the extension is not recognized
     */
    private String mimetypeFromExtension(File file, String noExtensionDefault) {
        // match against the name only; a dot in a parent directory is not an extension
        Matcher matcher = this.fileext.matcher(file.getName());
        return matcher.find() ? this.ext2mime(matcher.group(1)) : noExtensionDefault;
    }

    /**
     * @param ext a file extension without the leading dot; case-insensitive
     * @return the matching mimetype, or {@code null} when the extension is not recognized
     */
    private String ext2mime(String ext) {
        // NOTE(review): transform() defaults the target extension to "json", which is
        // not mapped here; confirm whether it should map to the metadata-extract mimetype.
        switch (ext.toLowerCase()) {
            // add applicable extensions here
            case "pdf":
            case "pdfa": return MediaType.APPLICATION_PDF_VALUE;
            default: return null;
        }
    }

}

View File

@ -0,0 +1,132 @@
package com.inteligr8.alfresco.pdfmeta.util;
import java.lang.reflect.Array;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * A map that wraps another map and lets subclasses contribute extra entries
 * on demand: when a key is looked up that the wrapped map does not contain,
 * {@link #discoverExtraEntry(Object)} is given the chance to add it to
 * {@link #extMap}, or to record it in {@link #nullKeys} as having no entry.
 *
 * @param <K> the key type
 * @param <V> the value type
 */
public abstract class DynamicDiscoveryMap<K, V> implements Map<K, V> {

    private final Logger logger = LoggerFactory.getLogger(this.getClass());

    private final Map<K, V> underlyingMap;

    /** Entries contributed by {@link #discoverExtraEntry(Object)}. */
    protected final Map<K, V> extMap = new HashMap<>();

    /** Keys for which discovery found nothing; they are not discovered again. */
    protected final Set<K> nullKeys = new HashSet<>();

    /**
     * @param map the map to wrap; writes through {@link #put} go to this map
     */
    public DynamicDiscoveryMap(Map<K, V> map) {
        this.underlyingMap = map;
    }

    @SuppressWarnings("unchecked")
    private K castKey(Object key) {
        return (K)key;
    }

    /**
     * Checks both maps and, for a key not seen before, attempts discovery.
     */
    @Override
    public boolean containsKey(Object key) {
        if (this.underlyingMap.containsKey(key) ||
                this.extMap.containsKey(key))
            return true;
        if (this.nullKeys.contains(key))
            return false;

        this.discoverExtraEntry(this.castKey(key));
        return this.extMap.containsKey(key);
    }

    @Override
    public boolean containsValue(Object value) {
        return this.underlyingMap.containsValue(value) ||
                this.extMap.containsValue(value);
    }

    @Override
    public boolean isEmpty() {
        return this.underlyingMap.isEmpty() && this.extMap.isEmpty();
    }

    @Override
    public int size() {
        return this.underlyingMap.size() + this.extMap.size();
    }

    /**
     * @return a combined view of the wrapped and the discovered keys
     */
    @SuppressWarnings("unchecked")
    @Override
    public Set<K> keySet() {
        // The array must have the interface as its component type: the two key
        // sets may be of different concrete classes, and an array typed after
        // one of them would reject the other with an ArrayStoreException.
        Set<K>[] sets = (Set<K>[])new Set<?>[] {
            this.underlyingMap.keySet(),
            this.extMap.keySet()
        };
        return new MultiSet<>(sets);
    }

    /**
     * @return a combined view of the wrapped and the discovered entries
     */
    @SuppressWarnings("unchecked")
    @Override
    public Set<Entry<K, V>> entrySet() {
        // interface-typed array for the same reason as in keySet()
        Set<Entry<K, V>>[] sets = (Set<Entry<K, V>>[])new Set<?>[] {
            this.underlyingMap.entrySet(),
            this.extMap.entrySet()
        };
        return new MultiSet<>(sets);
    }

    /**
     * @return a combined view of the wrapped and the discovered values
     */
    @SuppressWarnings("unchecked")
    @Override
    public Collection<V> values() {
        // interface-typed array for the same reason as in keySet()
        Collection<V>[] cs = (Collection<V>[])new Collection<?>[] {
            this.underlyingMap.values(),
            this.extMap.values()
        };
        return new MultiCollection<>(cs);
    }

    /**
     * Returns the wrapped map's value when it has the key; otherwise the
     * discovered value, attempting discovery first for a key not seen before.
     */
    @Override
    public V get(Object key) {
        if (this.underlyingMap.containsKey(key))
            return this.underlyingMap.get(key);

        // honour the negative cache, as containsKey() does, so that keys
        // already known to have no entry are not discovered again
        if (!this.extMap.containsKey(key) && !this.nullKeys.contains(key)) {
            this.logger.debug("Discover possible extra key: {}", key);
            this.discoverExtraEntry(this.castKey(key));
        }

        return this.extMap.get(key);
    }

    @Override
    public V put(K key, V value) {
        return this.underlyingMap.put(key, value);
    }

    @Override
    public void putAll(Map<? extends K, ? extends V> m) {
        this.underlyingMap.putAll(m);
    }

    /**
     * Removes the key from the wrapped map, the discovered entries and the
     * negative cache.
     *
     * @return the wrapped map's value when it had one; otherwise the discovered value
     */
    @Override
    public V remove(Object key) {
        this.nullKeys.remove(key);
        V values1 = this.extMap.remove(key);
        V values2 = this.underlyingMap.remove(key);
        return values2 != null ? values2 : values1;
    }

    @Override
    public boolean remove(Object key, Object value) {
        boolean b1 = this.underlyingMap.remove(key, value);
        boolean b2 = this.extMap.remove(key, value);
        return b1 || b2;
    }

    @Override
    public void clear() {
        this.underlyingMap.clear();
        this.extMap.clear();
        this.nullKeys.clear();
    }

    /**
     * Called for a key that neither map contains. Implementations either add
     * an entry for it to {@link #extMap} or add the key to {@link #nullKeys}.
     *
     * @param key the key being looked up; may be {@code null}
     */
    protected abstract void discoverExtraEntry(K key);

}

View File

@ -0,0 +1,143 @@
package com.inteligr8.alfresco.pdfmeta.util;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
/**
 * A {@link Collection} view that concatenates several backing collections.
 *
 * <p>Reads span every backing collection in array order. {@code add}/{@code addAll} write to
 * the first backing collection; removals are forwarded to the backing collections. The view
 * does not de-duplicate: an element present in two backing collections is seen twice.
 *
 * @param <E> the element type
 */
public class MultiCollection<E> implements Collection<E> {

    private final Collection<E>[] cs;

    /**
     * @param cs the backing collections, in iteration order; the array is used directly
     */
    public MultiCollection(Collection<E>[] cs) {
        this.cs = cs;
    }

    @Override
    public boolean contains(Object o) {
        for (Collection<E> list : this.cs)
            if (list.contains(o))
                return true;
        return false;
    }

    @Override
    public boolean containsAll(Collection<?> c) {
        for (Object e : c)
            if (!this.contains(e))
                return false;
        return true;
    }

    @Override
    public boolean isEmpty() {
        for (Collection<E> list : this.cs)
            if (!list.isEmpty())
                return false;
        return true;
    }

    @Override
    public int size() {
        int count = 0;
        for (Collection<E> list : this.cs)
            count += list.size();
        return count;
    }

    /**
     * Iterates each backing collection once, in array order.
     */
    @Override
    public Iterator<E> iterator() {
        return new Iterator<E>() {

            /** Index of the next backing collection to open. */
            private int nextIndex = 0;
            /** Iterator currently being drained; null until the first collection is opened. */
            private Iterator<E> current = null;
            /** Iterator that produced the most recent element; the target of remove(). */
            private Iterator<E> lastUsed = null;

            @Override
            public boolean hasNext() {
                // Open backing collections lazily and exactly once each, skipping empty ones.
                while (this.current == null || !this.current.hasNext()) {
                    if (this.nextIndex >= cs.length)
                        return false;
                    this.current = cs[this.nextIndex++].iterator();
                }
                return true;
            }

            @Override
            public E next() {
                // Advance across collection boundaries even if the caller skipped hasNext().
                if (!this.hasNext())
                    throw new java.util.NoSuchElementException();
                this.lastUsed = this.current;
                return this.current.next();
            }

            @Override
            public void remove() {
                // hasNext() may already have moved 'current' to the next collection, so remove
                // through the iterator that actually returned the last element.
                if (this.lastUsed == null)
                    throw new IllegalStateException();
                this.lastUsed.remove();
            }

        };
    }

    @Override
    public Object[] toArray() {
        Object[] array = new Object[this.size()];
        int i = 0;
        for (E o : this)
            array[i++] = o;
        return array;
    }

    /**
     * @throws NullPointerException if {@code a} is null
     */
    @SuppressWarnings("unchecked")
    @Override
    public <T> T[] toArray(T[] a) {
        int size = this.size();
        if (a.length < size)
            a = Arrays.copyOf(a, size);

        int i = 0;
        for (E o : this)
            a[i++] = (T)o;
        // Collection contract: mark the end when the supplied array has room to spare.
        if (i < a.length)
            a[i] = null;
        return a;
    }

    /** Adds to the first backing collection. */
    @Override
    public boolean add(E e) {
        return this.cs[0].add(e);
    }

    /** Adds to the first backing collection. */
    @Override
    public boolean addAll(Collection<? extends E> c) {
        return this.cs[0].addAll(c);
    }

    /** Removes a single occurrence from the first backing collection that holds it. */
    @Override
    public boolean remove(Object o) {
        for (Collection<E> set : this.cs)
            if (set.remove(o))
                return true;
        return false;
    }

    /** Removes every matching element from every backing collection. */
    @Override
    public boolean removeAll(Collection<?> c) {
        boolean changed = false;
        for (Collection<E> set : this.cs)
            changed = set.removeAll(c) || changed;
        return changed;
    }

    @Override
    public boolean retainAll(Collection<?> c) {
        boolean changed = false;
        for (Collection<E> set : this.cs)
            changed = set.retainAll(c) || changed;
        return changed;
    }

    @Override
    public void clear() {
        for (Collection<E> set : this.cs)
            set.clear();
    }

}

View File

@ -0,0 +1,11 @@
package com.inteligr8.alfresco.pdfmeta.util;
import java.util.Set;
/**
 * A {@link Set}-typed variant of {@link MultiCollection}: it exposes several backing sets as
 * one. All behaviour is inherited from {@link MultiCollection}; nothing here de-duplicates
 * elements across the backing sets.
 *
 * @param <K> the element type
 */
public class MultiSet<K>
        extends MultiCollection<K>
        implements Set<K> {

    /**
     * @param sets the backing sets, passed straight through to {@link MultiCollection}
     */
    public MultiSet(Set<K>[] sets) {
        super(sets);
    }

}

View File

@ -0,0 +1,7 @@
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
author=cm:author
title=cm:title
subject=cm:description
created=cm:created

View File

@ -0,0 +1,11 @@
<?xml version='1.0' encoding='UTF-8'?>
<beans xmlns="http://www.springframework.org/schema/beans" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd">
<bean autowire-candidate="true" class="org.alfresco.transformer.clients.AlfrescoSharedFileStoreClient" />
<bean autowire-candidate="true" class="org.springframework.web.client.RestTemplate" />
<bean autowire-candidate="true" class="org.alfresco.transform.client.model.TransformRequestValidator" />
<bean autowire-candidate="true" class="org.alfresco.transformer.TransformRegistryImpl" />
</beans>

View File

@ -0,0 +1,13 @@
queue:
engineRequestQueue: ${TRANSFORM_ENGINE_REQUEST_QUEUE:com.inteligr8.alfresco.pdfmeta.acs}
transform:
core:
config:
location: classpath:this_engine_config.json
pdfmeta:
version: ${project.version}
logging:
level:
com.inteligr8.alfresco.pdfmeta: ${LOG_LEVEL:trace}

View File

@ -0,0 +1,21 @@
<html xmlns:th="http://www.thymeleaf.org">
<body>
<div>
<h2>pdfmeta Test Transformation</h2>
<form method="POST" enctype="multipart/form-data" action="/transform">
<input type="hidden" name="targetMimetype" value="alfresco-metadata-extract" />
<table>
<tr><td><div style="text-align:right">file *</div></td><td><input type="file" name="file" /></td></tr>
<!-- Add a row for each of your transform options -->
<tr><td></td><td><input type="submit" value="Transform" /></td></tr>
</table>
</form>
</div>
<div>
<a href="/log">Log entries</a>
</div>
</body>
</html>

View File

@ -0,0 +1,10 @@
{
"transformers": [
{
"transformerName": "pdfmeta",
"supportedSourceAndTargetList": [
{"sourceMediaType": "application/pdf", "priority": 5, "targetMediaType": "alfresco-metadata-extract" }
]
}
]
}

View File

@ -0,0 +1,164 @@
/*
* #%L
* Alfresco Transform Core
* %%
* Copyright (C) 2005 - 2019 Alfresco Software Limited
* %%
* This file is part of the Alfresco software.
* -
* If the software was purchased under a paid Alfresco license, the terms of
* the paid license agreement will prevail. Otherwise, the software is
* provided under the following open source license terms:
* -
* Alfresco is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* -
* Alfresco is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
* -
* You should have received a copy of the GNU Lesser General Public License
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
package com.inteligr8.alfresco.pdfmeta;
import java.util.HashSet;
import java.util.Set;
import javax.annotation.PostConstruct;
import org.alfresco.transformer.util.MimetypeMap;
import org.alfresco.transformer.util.RequestParamMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.context.SpringBootTest.WebEnvironment;
import org.springframework.boot.test.web.client.TestRestTemplate;
import org.springframework.boot.web.server.LocalServerPort;
import org.springframework.core.io.ClassPathResource;
import org.springframework.http.HttpEntity;
import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.test.context.junit4.SpringRunner;
import org.springframework.util.LinkedMultiValueMap;
import org.springframework.web.client.HttpStatusCodeException;
@RunWith(SpringRunner.class)
@SpringBootTest(webEnvironment = WebEnvironment.RANDOM_PORT)
public class HttpRequestIT {
private final Logger logger = LoggerFactory.getLogger(HttpRequestIT.class);
@LocalServerPort
private int port;
@Autowired
private TestRestTemplate restTemplate;
protected String baseUrl;
@PostConstruct
public void init() {
this.baseUrl = "http://localhost:" + this.port;
}
@Test
public void testRootPath() {
String result = this.restTemplate.getForObject(this.baseUrl, String.class);
if (this.logger.isDebugEnabled())
this.logger.debug("testRootPath(): result: " + result);
Assert.assertNotNull("A result from the HTTP GET was expected", result);
Document htmldoc = Jsoup.parse(result);
Assert.assertNotNull("An HTML compliant result was expected: " + result.substring(0, 50), htmldoc);
Elements elements = htmldoc.select("html body h2");
Assert.assertFalse("The HTML body is expected to have an h2 element: html: " + htmldoc.toString(), elements.isEmpty());
Assert.assertEquals("The HTML body is expected to have just one h2 element", 1, elements.size());
Assert.assertEquals("The HTML body header is not what was expected", "pdfmeta Test Transformation", elements.html());
elements = htmldoc.select("html input");
Set<String> inputs = new HashSet<String>();
for (Element element : elements)
inputs.add(element.attr("name"));
Assert.assertTrue("The HTML is expected to have a form input for 'file': " + inputs.toString(), inputs.contains("file"));
Assert.assertTrue("The HTML is expected to have a form input for 'targetExtension': " + inputs.toString(), inputs.contains("targetExtension"));
}
@Test
public void testLogPath() {
String result = this.restTemplate.getForObject(this.baseUrl + "/log", String.class);
if (this.logger.isDebugEnabled())
this.logger.debug("testLogPath(): result: " + result);
Assert.assertNotNull("A result from the HTTP GET was expected", result);
Document htmldoc = Jsoup.parse(result);
Assert.assertNotNull("An HTML compliant result was expected: " + result.substring(0, 50), htmldoc);
Elements elements = htmldoc.select("html body div h2");
Assert.assertFalse("The HTML is expected to have an html/body/div/h2 element: html: " + htmldoc.select("html").toString(), elements.isEmpty());
Assert.assertEquals("The HTML is expected to have just one html/body/div/h2 element", 1, elements.size());
Assert.assertEquals("The HTML body header is not what was expected", "pdfmeta Log Entries", elements.html());
}
@Test
public void testNoPath() {
try {
ResponseEntity<String> response = this.restTemplate.getForEntity(this.baseUrl + "/doesnotexist", String.class);
Assert.assertEquals("An unexpected path must return a 404 error", 404, response.getStatusCodeValue());
} catch (HttpStatusCodeException hsce) {
Assert.assertEquals("An unexpected path must return a 404 error", 404, hsce.getRawStatusCode());
}
}
@Test
public void testServiceGet() {
try {
ResponseEntity<String> response = this.restTemplate.getForEntity(this.baseUrl + "/transform", String.class);
Assert.assertEquals("An unexpected path must return a 405 error", 405, response.getStatusCodeValue());
} catch (HttpStatusCodeException hsce) {
Assert.assertEquals("An unexpected path must return a 405 error", 405, hsce.getRawStatusCode());
}
}
@Test
public void testServiceNoFile() {
try {
ResponseEntity<String> response = this._testService(null);
Assert.assertEquals("An unexpected path must return a 400 error", 400, response.getStatusCodeValue());
} catch (HttpStatusCodeException hsce) {
Assert.assertEquals("An unexpected path must return a 400 error", 400, hsce.getRawStatusCode());
}
}
@Test
public void testServiceQuick() {
this._testService("pdf-bookmarked.pdf");
}
protected ResponseEntity<String> _testService(String filename) {
LinkedMultiValueMap<String, Object> parameters = new LinkedMultiValueMap<>();
if (filename != null)
parameters.add(RequestParamMap.FILE, new ClassPathResource(filename));
parameters.add(RequestParamMap.TARGET_MIMETYPE, MimetypeMap.MIMETYPE_METADATA_EXTRACT);
HttpHeaders headers = new HttpHeaders();
headers.setContentType(MediaType.MULTIPART_FORM_DATA);
HttpEntity<LinkedMultiValueMap<String, Object>> entity = new HttpEntity<>(parameters, headers);
return this.restTemplate.postForEntity(this.baseUrl + "/transform", entity, String.class);
}
}

View File

@ -0,0 +1,22 @@
package com.inteligr8.alfresco.pdfmeta;
import org.alfresco.transformer.AbstractTransformerController;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.autoconfigure.web.servlet.WebMvcTest;
import org.springframework.test.context.junit4.SpringRunner;
/**
 * Web-MVC slice test scaffold for {@link TransformerController}. The only test method is
 * empty and ignored, so nothing is asserted here yet.
 */
@RunWith(SpringRunner.class)
@WebMvcTest(controllers = TransformerController.class)
public class TransformerControllerTest {

    /** The controller under test, injected by the Spring test context. */
    @Autowired
    protected AbstractTransformerController controller;

    /** Placeholder; ignored until real assertions are written. */
    @Test
    @Ignore
    public void test() {
    }

}

View File

@ -0,0 +1,13 @@
queue:
engineRequestQueue: ${TRANSFORM_ENGINE_REQUEST_QUEUE:com.inteligr8.alfresco.pdfmeta.acs}
transform:
core:
config:
location: classpath:this_engine_config.json
pdfmeta:
version: ${project.version}
logging:
level:
com.inteligr8.alfresco.pdfmeta: ${LOG_LEVEL:trace}

Binary file not shown.