mirror of
https://github.com/Alfresco/alfresco-transform-core.git
synced 2025-10-01 14:41:17 +00:00
REPO-4334 Move metadata extraction into T-Engines (#247)
* Metadata extract code added to T-Engines * Required a refactor of duplicate code to avoid 3x more duplication: - try catches used to return exit codes - calls to java libraries or commands to external processes - building of transform options in controllers, adaptors * integration tests based on current extracts performed in the repo * included extract code for libreoffice, and embed code even though not used out of the box any more. There may well be custom extracts using them that move to T-Engines * removal of unused imports * minor autoOrient / allowEnlargement bug fixes that were not included in Paddington on the T-Engine side.
This commit is contained in:
@@ -26,6 +26,7 @@
|
||||
*/
|
||||
package org.alfresco.transformer;
|
||||
|
||||
import io.micrometer.core.instrument.MeterRegistry;
|
||||
import org.alfresco.transformer.transformers.SelectingTransformer;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@@ -39,8 +40,6 @@ import org.springframework.boot.context.event.ApplicationReadyEvent;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.event.EventListener;
|
||||
|
||||
import io.micrometer.core.instrument.MeterRegistry;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
import static org.alfresco.transformer.logging.StandardMessages.LICENCE;
|
||||
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -26,34 +26,20 @@
|
||||
*/
|
||||
package org.alfresco.transformer;
|
||||
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_HTML;
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
import static org.alfresco.transformer.fs.FileManager.createAttachment;
|
||||
import static org.alfresco.transformer.fs.FileManager.createSourceFile;
|
||||
import static org.alfresco.transformer.fs.FileManager.createTargetFile;
|
||||
import static org.alfresco.transformer.fs.FileManager.createTargetFileName;
|
||||
import static org.alfresco.transformer.transformers.HtmlParserContentTransformer.SOURCE_ENCODING;
|
||||
import static org.springframework.http.HttpStatus.OK;
|
||||
import static org.springframework.http.MediaType.MULTIPART_FORM_DATA_VALUE;
|
||||
import org.alfresco.transformer.probes.ProbeTestTransform;
|
||||
import org.alfresco.transformer.transformers.SelectingTransformer;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Controller;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import javax.servlet.http.HttpServletRequest;
|
||||
|
||||
import org.alfresco.transformer.logging.LogEntry;
|
||||
import org.alfresco.transformer.probes.ProbeTestTransform;
|
||||
import org.alfresco.transformer.transformers.SelectingTransformer;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.Resource;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.stereotype.Controller;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestParam;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_HTML;
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
import static org.alfresco.transformer.transformers.HtmlParserContentTransformer.SOURCE_ENCODING;
|
||||
import static org.alfresco.transformer.util.RequestParamMap.TRANSFORM_NAME_PARAMETER;
|
||||
|
||||
@Controller
|
||||
public class MiscController extends AbstractTransformerController
|
||||
@@ -88,71 +74,16 @@ public class MiscController extends AbstractTransformerController
|
||||
{
|
||||
Map<String, String> parameters = new HashMap<>();
|
||||
parameters.put(SOURCE_ENCODING, "UTF-8");
|
||||
transformer.transform("html", sourceFile, targetFile, MIMETYPE_HTML,
|
||||
MIMETYPE_TEXT_PLAIN, parameters);
|
||||
transform("html", MIMETYPE_HTML, MIMETYPE_TEXT_PLAIN, parameters, sourceFile, targetFile);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processTransform(final File sourceFile, final File targetFile,
|
||||
final String sourceMimetype, final String targetMimetype,
|
||||
final Map<String, String> transformOptions, final Long timeout)
|
||||
protected void transform(String transformName, String sourceMimetype, String targetMimetype,
|
||||
Map<String, String> transformOptions, File sourceFile, File targetFile)
|
||||
{
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug(
|
||||
"Processing request with: sourceFile '{}', targetFile '{}', transformOptions" +
|
||||
" '{}', timeout {} ms", sourceFile, targetFile, transformOptions, timeout);
|
||||
}
|
||||
|
||||
final String transform = getTransformerName(sourceFile, sourceMimetype, targetMimetype,
|
||||
transformOptions);
|
||||
transformer.transform(transform, sourceFile, targetFile, sourceMimetype, targetMimetype,
|
||||
transformOptions);
|
||||
}
|
||||
|
||||
@PostMapping(value = "/transform", consumes = MULTIPART_FORM_DATA_VALUE)
|
||||
public ResponseEntity<Resource> transform(HttpServletRequest request,
|
||||
@RequestParam("file") MultipartFile sourceMultipartFile,
|
||||
@RequestParam("targetExtension") String targetExtension,
|
||||
@RequestParam("targetMimetype") String targetMimetype,
|
||||
@RequestParam(value = "targetEncoding", required = false) String targetEncoding,
|
||||
@RequestParam("sourceMimetype") String sourceMimetype,
|
||||
@RequestParam(value = "sourceEncoding", required = false) String sourceEncoding,
|
||||
@RequestParam(value = "pageLimit", required = false) String pageLimit,
|
||||
@RequestParam(value = "testDelay", required = false) Long testDelay)
|
||||
{
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
logger.debug(
|
||||
"Processing request with: sourceMimetype '{}', sourceEncoding '{}', " +
|
||||
"targetMimetype '{}', targetExtension '{}', targetEncoding '{}', pageLimit '{}'",
|
||||
sourceMimetype, sourceEncoding, targetMimetype, targetExtension, targetEncoding,
|
||||
pageLimit);
|
||||
}
|
||||
|
||||
final String targetFilename = createTargetFileName(
|
||||
sourceMultipartFile.getOriginalFilename(), targetExtension);
|
||||
getProbeTestTransform().incrementTransformerCount();
|
||||
final File sourceFile = createSourceFile(request, sourceMultipartFile);
|
||||
final File targetFile = createTargetFile(request, targetFilename);
|
||||
|
||||
final Map<String, String> transformOptions = createTransformOptions(
|
||||
"sourceEncoding", sourceEncoding,
|
||||
"targetEncoding", targetEncoding,
|
||||
"pageLimit", pageLimit);
|
||||
|
||||
final String transform = getTransformerName(sourceFile, sourceMimetype, targetMimetype,
|
||||
transformOptions);
|
||||
transformer.transform(transform, sourceFile, targetFile, sourceMimetype, targetMimetype,
|
||||
transformOptions);
|
||||
|
||||
final ResponseEntity<Resource> body = createAttachment(targetFilename, targetFile);
|
||||
LogEntry.setTargetSize(targetFile.length());
|
||||
long time = LogEntry.setStatusCodeAndMessage(OK.value(), "Success");
|
||||
time += LogEntry.addDelay(testDelay);
|
||||
getProbeTestTransform().recordTransformTime(time);
|
||||
return body;
|
||||
transformOptions.put(TRANSFORM_NAME_PARAMETER, transformName);
|
||||
transformer.transform(sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile);
|
||||
}
|
||||
}
|
||||
|
@@ -0,0 +1,72 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer;
|
||||
|
||||
import org.junit.runner.RunWith;
|
||||
import org.junit.runners.Parameterized;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static java.util.stream.Collectors.toList;
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_HTML;
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_RFC822;
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_XHTML;
|
||||
import static org.alfresco.transformer.TestFileInfo.testFile;
|
||||
|
||||
/**
|
||||
* Metadata integration tests in the Misc T-Engine.
|
||||
*
|
||||
* @author adavis
|
||||
*/
|
||||
@RunWith(Parameterized.class)
|
||||
public class MiscMetadataExtractsIT extends AbstractMetadataExtractsIT
|
||||
{
|
||||
public MiscMetadataExtractsIT(TestFileInfo testFileInfo)
|
||||
{
|
||||
super(testFileInfo);
|
||||
}
|
||||
|
||||
@Parameterized.Parameters
|
||||
public static List<TestFileInfo> engineTransformations()
|
||||
{
|
||||
return Stream.of(
|
||||
// HtmlMetadataExtractor
|
||||
testFile(MIMETYPE_HTML, "html", "quick.html"),
|
||||
testFile(MIMETYPE_XHTML, "xhtml", "quick.xhtml.alf"), // avoid the license header check on xhtml
|
||||
|
||||
// RFC822MetadataExtractor
|
||||
testFile(MIMETYPE_RFC822, "eml", "quick.eml"),
|
||||
|
||||
// Special test cases from the repo tests
|
||||
// ======================================
|
||||
testFile(MIMETYPE_RFC822, "eml", "quick.spanish.eml"),
|
||||
testFile(MIMETYPE_HTML, "html", "quick.japanese.html")
|
||||
|
||||
).collect(toList());
|
||||
}
|
||||
}
|
@@ -69,6 +69,23 @@
|
||||
],
|
||||
"transformOptions": [
|
||||
]
|
||||
},
|
||||
{
|
||||
"transformerName": "HtmlMetadataExtractor",
|
||||
"supportedSourceAndTargetList": [
|
||||
{"sourceMediaType": "text/html", "targetMediaType": "alfresco-metadata-extract"},
|
||||
{"sourceMediaType": "application/xhtml+xml", "targetMediaType": "alfresco-metadata-extract"}
|
||||
],
|
||||
"transformOptions": [
|
||||
]
|
||||
},
|
||||
{
|
||||
"transformerName": "RFC822MetadataExtractor",
|
||||
"supportedSourceAndTargetList": [
|
||||
{"sourceMediaType": "message/rfc822", "targetMediaType": "alfresco-metadata-extract"}
|
||||
],
|
||||
"transformOptions": [
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
@@ -1,5 +1,5 @@
|
||||
From: Nevin Nollop <nevin.nollop@alfresco.com>
|
||||
To: Nevin Nollop <nevin.nollop@alfresco.com>
|
||||
To: Nevin Nollop <nevin.nollop@gmail.com>
|
||||
Cc: Nevin Nollop <nevinn@alfresco.com>
|
||||
Message-ID: <20040604122322.GV1905@phoenix.home>
|
||||
Date: Fri, 4 Jun 2004 14:23:22 +0200
|
||||
|
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/content/1.0}addressee" : "Nevin Nollop <nevin.nollop@gmail.com>",
|
||||
"{http://www.alfresco.org/model/content/1.0}description" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://www.alfresco.org/model/content/1.0}addressees" : "Nevin Nollop <nevinn@alfresco.com>",
|
||||
"{http://www.alfresco.org/model/imap/1.0}dateSent" : 1086351802000,
|
||||
"{http://www.alfresco.org/model/imap/1.0}messageTo" : "Nevin Nollop <nevin.nollop@gmail.com>",
|
||||
"{http://www.alfresco.org/model/imap/1.0}messageId" : "<20040604122322.GV1905@phoenix.home>",
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://www.alfresco.org/model/imap/1.0}messageSubject" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://www.alfresco.org/model/imap/1.0}messageCc" : "Nevin Nollop <nevinn@alfresco.com>",
|
||||
"{http://www.alfresco.org/model/content/1.0}sentdate" : 1086351802000,
|
||||
"{http://www.alfresco.org/model/content/1.0}subjectline" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://www.alfresco.org/model/imap/1.0}messageFrom" : "Nevin Nollop <nevin.nollop@alfresco.com>",
|
||||
"{http://www.alfresco.org/model/content/1.0}originator" : "Nevin Nollop <nevin.nollop@alfresco.com>"
|
||||
}
|
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/content/1.0}author": "Nevin Nollop",
|
||||
"{http://www.alfresco.org/model/content/1.0}description": "Gym class featuring a brown fox and lazy dog",
|
||||
"{http://www.alfresco.org/model/content/1.0}title": "The quick brown fox jumps over the lazy dog"
|
||||
}
|
@@ -0,0 +1,12 @@
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=Shift_JIS">
|
||||
<title>確認した結果を添付しますので、確認してください</title>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
@@ -0,0 +1,3 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : "確認した結果を添付しますので、確認してください"
|
||||
}
|
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/imap/1.0}dateReceived" : "Thu, 16 Aug 2012 08:13:29 -0700 (PDT)",
|
||||
"{http://www.alfresco.org/model/content/1.0}addressee" : "jane.doe@alfresco.com",
|
||||
"{http://www.alfresco.org/model/content/1.0}description" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://www.alfresco.org/model/content/1.0}addressees" : null,
|
||||
"{http://www.alfresco.org/model/imap/1.0}dateSent" : 1345130009000,
|
||||
"{http://www.alfresco.org/model/imap/1.0}messageTo" : "jane.doe@alfresco.com",
|
||||
"{http://www.alfresco.org/model/imap/1.0}messageId" : "<CAL0uq1f9vPczLRinL3xB5U_oSSd5U0ob=408nBgosCY0OVFyBw@mail.alfresco.com>",
|
||||
"{http://www.alfresco.org/model/content/1.0}title" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://www.alfresco.org/model/imap/1.0}messageSubject" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://www.alfresco.org/model/imap/1.0}messageCc" : null,
|
||||
"{http://www.alfresco.org/model/content/1.0}sentdate" : 1345130009000,
|
||||
"{http://www.alfresco.org/model/content/1.0}subjectline" : "The quick brown fox jumps over the lazy dog",
|
||||
"{http://www.alfresco.org/model/imap/1.0}messageFrom" : "john.doe@alfresco.com",
|
||||
"{http://www.alfresco.org/model/content/1.0}originator" : "john.doe@alfresco.com"
|
||||
}
|
@@ -0,0 +1,17 @@
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<meta http-equiv="content-type" content="text/html; charset=windows-1252"/>
|
||||
<title>The quick brown fox jumps over the lazy dog</title>
|
||||
<meta name="author" content="Nevin Nollop"/>
|
||||
<meta name="keywords" content="Pangram, fox, dog"/>
|
||||
<meta name="description" content="Gym class featuring a brown fox and lazy dog"/>
|
||||
</head>
|
||||
|
||||
<body lang="EN-US">
|
||||
|
||||
The quick brown fox jumps over the lazy dog
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"{http://www.alfresco.org/model/content/1.0}author": "Nevin Nollop",
|
||||
"{http://www.alfresco.org/model/content/1.0}description": "Gym class featuring a brown fox and lazy dog",
|
||||
"{http://www.alfresco.org/model/content/1.0}title": "The quick brown fox jumps over the lazy dog"
|
||||
}
|
@@ -0,0 +1,203 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005-2020 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.metadataExtractors;
|
||||
|
||||
import org.alfresco.transformer.transformers.SelectableTransformer;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.swing.text.ChangedCharSetException;
|
||||
import javax.swing.text.MutableAttributeSet;
|
||||
import javax.swing.text.html.HTML;
|
||||
import javax.swing.text.html.HTMLEditorKit;
|
||||
import javax.swing.text.html.parser.ParserDelegator;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Metadata extractor for HTML and XHTML.
|
||||
*
|
||||
* Configuration: (see HtmlMetadataExtractor_metadata_extract.properties and misc_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>author:</b> -- cm:author
|
||||
* <b>title:</b> -- cm:title
|
||||
* <b>description:</b> -- cm:description
|
||||
* </pre>
|
||||
*
|
||||
* Based on HtmlMetadataExtracter from the content repository.
|
||||
*
|
||||
* @author Jesper Steen Møller
|
||||
* @author Derek Hulley
|
||||
* @author adavis
|
||||
*/
|
||||
public class HtmlMetadataExtractor extends AbstractMetadataExtractor implements SelectableTransformer
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(HtmlMetadataExtractor.class);
|
||||
|
||||
private static final String KEY_AUTHOR = "author";
|
||||
private static final String KEY_TITLE = "title";
|
||||
private static final String KEY_DESCRIPTION= "description";
|
||||
|
||||
public HtmlMetadataExtractor()
|
||||
{
|
||||
super(logger);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void extractMetadata(String sourceMimetype, String targetMimetype, Map<String, String> transformOptions,
|
||||
File sourceFile, File targetFile) throws Exception
|
||||
{
|
||||
Map<String, Serializable> metadata = extractMetadata(sourceMimetype, transformOptions, sourceFile);
|
||||
mapMetadataAndWrite(targetFile, metadata);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Serializable> extractMetadata(String sourceMimetype, Map<String, String> transformOptions,
|
||||
File sourceFile) throws Exception
|
||||
{
|
||||
final Map<String, Serializable> rawProperties = new HashMap<>();
|
||||
|
||||
HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback()
|
||||
{
|
||||
StringBuffer title = null;
|
||||
boolean inHead = false;
|
||||
|
||||
public void handleText(char[] data, int pos)
|
||||
{
|
||||
if (title != null)
|
||||
{
|
||||
title.append(data);
|
||||
}
|
||||
}
|
||||
|
||||
public void handleComment(char[] data, int pos)
|
||||
{
|
||||
// Perhaps sniff for Office 9+ metadata in here?
|
||||
}
|
||||
|
||||
public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
|
||||
{
|
||||
if (HTML.Tag.HEAD.equals(t))
|
||||
{
|
||||
inHead = true;
|
||||
}
|
||||
else if (HTML.Tag.TITLE.equals(t) && inHead)
|
||||
{
|
||||
title = new StringBuffer();
|
||||
}
|
||||
else
|
||||
{
|
||||
handleSimpleTag(t, a, pos);
|
||||
}
|
||||
}
|
||||
|
||||
public void handleEndTag(HTML.Tag t, int pos)
|
||||
{
|
||||
if (HTML.Tag.HEAD.equals(t))
|
||||
{
|
||||
inHead = false;
|
||||
}
|
||||
else if (HTML.Tag.TITLE.equals(t) && title != null)
|
||||
{
|
||||
putRawValue(KEY_TITLE, title.toString(), rawProperties);
|
||||
title = null;
|
||||
}
|
||||
}
|
||||
|
||||
public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
|
||||
{
|
||||
if (HTML.Tag.META.equals(t))
|
||||
{
|
||||
Object nameO = a.getAttribute(HTML.Attribute.NAME);
|
||||
Object valueO = a.getAttribute(HTML.Attribute.CONTENT);
|
||||
if (nameO == null || valueO == null)
|
||||
{
|
||||
return;
|
||||
}
|
||||
String name = nameO.toString();
|
||||
|
||||
if (name.equalsIgnoreCase("creator") || name.equalsIgnoreCase("author")
|
||||
|| name.equalsIgnoreCase("dc.creator"))
|
||||
{
|
||||
putRawValue(KEY_AUTHOR, valueO.toString(), rawProperties);
|
||||
}
|
||||
else if (name.equalsIgnoreCase("description") || name.equalsIgnoreCase("dc.description"))
|
||||
{
|
||||
putRawValue(KEY_DESCRIPTION, valueO.toString(), rawProperties);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void handleError(String errorMsg, int pos)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
String charsetGuess = "UTF-8";
|
||||
int tries = 0;
|
||||
while (tries < 3)
|
||||
{
|
||||
rawProperties.clear();
|
||||
Reader r = null;
|
||||
|
||||
try (InputStream cis = new FileInputStream(sourceFile))
|
||||
{
|
||||
// TODO: for now, use default charset; we should attempt to map from html meta-data
|
||||
r = new InputStreamReader(cis, charsetGuess);
|
||||
HTMLEditorKit.Parser parser = new ParserDelegator();
|
||||
parser.parse(r, callback, tries > 0);
|
||||
break;
|
||||
}
|
||||
catch (ChangedCharSetException ccse)
|
||||
{
|
||||
tries++;
|
||||
charsetGuess = ccse.getCharSetSpec();
|
||||
int begin = charsetGuess.indexOf("charset=");
|
||||
if (begin > 0)
|
||||
{
|
||||
charsetGuess = charsetGuess.substring(begin + 8, charsetGuess.length());
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (r != null)
|
||||
{
|
||||
r.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return rawProperties;
|
||||
}
|
||||
}
|
@@ -0,0 +1,196 @@
|
||||
/*
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005-2020 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
* If the software was purchased under a paid Alfresco license, the terms of
|
||||
* the paid license agreement will prevail. Otherwise, the software is
|
||||
* provided under the following open source license terms:
|
||||
* -
|
||||
* Alfresco is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Lesser General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
* -
|
||||
* Alfresco is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Lesser General Public License for more details.
|
||||
* -
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with Alfresco. If not, see <http://www.gnu.org/licenses/>.
|
||||
* #L%
|
||||
*/
|
||||
package org.alfresco.transformer.metadataExtractors;
|
||||
|
||||
import org.alfresco.transformer.transformers.SelectableTransformer;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.mail.Header;
|
||||
import javax.mail.internet.InternetAddress;
|
||||
import javax.mail.internet.MimeMessage;
|
||||
import javax.mail.internet.MimeMessage.RecipientType;
|
||||
import javax.mail.internet.MimeUtility;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.Serializable;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.Date;
|
||||
import java.util.Enumeration;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Metadata extractor for RFC822 mime emails.
|
||||
*
|
||||
* Configuration: (see RFC822MetadataExtractor_metadata_extract.properties and misc_engine_config.json)
|
||||
*
|
||||
* <pre>
|
||||
* <b>messageFrom:</b> -- imap:messageFrom, cm:originator
|
||||
* <b>messageTo:</b> -- imap:messageTo
|
||||
* <b>messageCc:</b> -- imap:messageCc
|
||||
* <b>messageSubject:</b> -- imap:messageSubject, cm:title, cm:description, cm:subjectline
|
||||
* <b>messageSent:</b> -- imap:dateSent, cm:sentdate
|
||||
* <b>messageReceived:</b> -- imap:dateReceived
|
||||
* <b>All {@link Header#getName() header names}:</b>
|
||||
* <b>Thread-Index:</b> -- imap:threadIndex
|
||||
* <b>Message-ID:</b> -- imap:messageId
|
||||
* </pre>
|
||||
*
|
||||
* @author Derek Hulley
|
||||
* @author adavis
|
||||
*/
|
||||
public class RFC822MetadataExtractor extends AbstractMetadataExtractor implements SelectableTransformer
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(HtmlMetadataExtractor.class);
|
||||
|
||||
protected static final String KEY_MESSAGE_FROM = "messageFrom";
|
||||
protected static final String KEY_MESSAGE_TO = "messageTo";
|
||||
protected static final String KEY_MESSAGE_CC = "messageCc";
|
||||
protected static final String KEY_MESSAGE_SUBJECT = "messageSubject";
|
||||
protected static final String KEY_MESSAGE_SENT = "messageSent";
|
||||
protected static final String KEY_MESSAGE_RECEIVED = "messageReceived";
|
||||
|
||||
public RFC822MetadataExtractor()
|
||||
{
|
||||
super(logger);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void extractMetadata(String sourceMimetype, String targetMimetype, Map<String, String> transformOptions,
|
||||
File sourceFile, File targetFile) throws Exception
|
||||
{
|
||||
Map<String, Serializable> metadata = extractMetadata(sourceMimetype, transformOptions, sourceFile);
|
||||
mapMetadataAndWrite(targetFile, metadata);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Serializable> extractMetadata(String sourceMimetype, Map<String, String> transformOptions,
|
||||
File sourceFile) throws Exception
|
||||
{
|
||||
final Map<String, Serializable> rawProperties = new HashMap<>();
|
||||
|
||||
try (InputStream is = new FileInputStream(sourceFile))
|
||||
{
|
||||
MimeMessage mimeMessage = new MimeMessage(null, is);
|
||||
|
||||
if (mimeMessage != null)
|
||||
{
|
||||
/**
|
||||
* Extract RFC822 values that doesn't match to headers and need to be encoded.
|
||||
* Or those special fields that require some code to extract data
|
||||
*/
|
||||
String tmp = InternetAddress.toString(mimeMessage.getFrom());
|
||||
tmp = tmp != null ? MimeUtility.decodeText(tmp) : null;
|
||||
putRawValue(KEY_MESSAGE_FROM, tmp, rawProperties);
|
||||
|
||||
tmp = InternetAddress.toString(mimeMessage.getRecipients(RecipientType.TO));
|
||||
tmp = tmp != null ? MimeUtility.decodeText(tmp) : null;
|
||||
putRawValue(KEY_MESSAGE_TO, tmp, rawProperties);
|
||||
|
||||
tmp = InternetAddress.toString(mimeMessage.getRecipients(RecipientType.CC));
|
||||
tmp = tmp != null ? MimeUtility.decodeText(tmp) : null;
|
||||
putRawValue(KEY_MESSAGE_CC, tmp, rawProperties);
|
||||
|
||||
putRawValue(KEY_MESSAGE_SENT, mimeMessage.getSentDate(), rawProperties);
|
||||
|
||||
/**
|
||||
* Received field from RFC 822
|
||||
*
|
||||
* "Received" ":" ; one per relay
|
||||
* ["from" domain] ; sending host
|
||||
* ["by" domain] ; receiving host
|
||||
* ["via" atom] ; physical path
|
||||
* ("with" atom) ; link/mail protocol
|
||||
* ["id" msg-id] ; receiver msg id
|
||||
* ["for" addr-spec] ; initial form
|
||||
* ";" date-time ; time received
|
||||
*/
|
||||
Date rxDate = mimeMessage.getReceivedDate();
|
||||
|
||||
if(rxDate != null)
|
||||
{
|
||||
// The email implementation extracted the received date for us.
|
||||
putRawValue(KEY_MESSAGE_RECEIVED, rxDate, rawProperties);
|
||||
}
|
||||
else
|
||||
{
|
||||
// the email implementation did not parse the received date for us.
|
||||
String[] rx = mimeMessage.getHeader("received");
|
||||
if(rx != null && rx.length > 0)
|
||||
{
|
||||
String lastReceived = rx[0];
|
||||
lastReceived = MimeUtility.unfold(lastReceived);
|
||||
int x = lastReceived.lastIndexOf(';');
|
||||
if(x > 0)
|
||||
{
|
||||
String dateStr = lastReceived.substring(x + 1).trim();
|
||||
putRawValue(KEY_MESSAGE_RECEIVED, dateStr, rawProperties);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
String[] subj = mimeMessage.getHeader("Subject");
|
||||
if (subj != null && subj.length > 0)
|
||||
{
|
||||
String decodedSubject = subj[0];
|
||||
try
|
||||
{
|
||||
decodedSubject = MimeUtility.decodeText(decodedSubject);
|
||||
}
|
||||
catch (UnsupportedEncodingException e)
|
||||
{
|
||||
logger.warn(e.toString());
|
||||
}
|
||||
putRawValue(KEY_MESSAGE_SUBJECT, decodedSubject, rawProperties);
|
||||
}
|
||||
|
||||
/*
|
||||
* Extract values from all header fields, including extension fields "X-"
|
||||
*/
|
||||
Set<String> keys = getExtractMapping().keySet();
|
||||
@SuppressWarnings("unchecked")
|
||||
Enumeration<Header> headers = mimeMessage.getAllHeaders();
|
||||
while (headers.hasMoreElements())
|
||||
{
|
||||
Header header = (Header) headers.nextElement();
|
||||
if (keys.contains(header.getName()))
|
||||
{
|
||||
tmp = header.getValue();
|
||||
tmp = tmp != null ? MimeUtility.decodeText(tmp) : null;
|
||||
|
||||
putRawValue(header.getName(), tmp, rawProperties);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return rawProperties;
|
||||
}
|
||||
}
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -26,7 +26,11 @@
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_IMAGE_JPEG;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.File;
|
||||
@@ -37,12 +41,7 @@ import java.nio.file.StandardCopyOption;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_IMAGE_JPEG;
|
||||
|
||||
/**
|
||||
* Converts Apple iWorks files to JPEGs for thumbnailing and previewing.
|
||||
@@ -74,8 +73,8 @@ public class AppleIWorksContentTransformer implements SelectableTransformer
|
||||
// (53 x 41) preview-micro.jpg
|
||||
|
||||
@Override
|
||||
public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
|
||||
final String targetMimetype, final Map<String, String> parameters)
|
||||
public void transform(final String sourceMimetype, final String targetMimetype, final Map<String, String> parameters,
|
||||
final File sourceFile, final File targetFile)
|
||||
{
|
||||
logger.debug("Performing IWorks to jpeg transform with sourceMimetype={} targetMimetype={}",
|
||||
sourceMimetype, targetMimetype);
|
||||
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -26,10 +26,15 @@
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_HTML;
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_MULTIPART_ALTERNATIVE;
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
import org.alfresco.transformer.fs.FileManager;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.mail.MessagingException;
|
||||
import javax.mail.Multipart;
|
||||
import javax.mail.Part;
|
||||
import javax.mail.Session;
|
||||
import javax.mail.internet.MimeMessage;
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.File;
|
||||
@@ -43,15 +48,9 @@ import java.io.Writer;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
|
||||
import javax.mail.MessagingException;
|
||||
import javax.mail.Multipart;
|
||||
import javax.mail.Part;
|
||||
import javax.mail.Session;
|
||||
import javax.mail.internet.MimeMessage;
|
||||
|
||||
import org.alfresco.transformer.fs.FileManager;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_HTML;
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_MULTIPART_ALTERNATIVE;
|
||||
import static org.alfresco.transform.client.model.Mimetype.MIMETYPE_TEXT_PLAIN;
|
||||
|
||||
/**
|
||||
* Uses javax.mail.MimeMessage to generate plain text versions of RFC822 email
|
||||
@@ -74,8 +73,8 @@ public class EMLTransformer implements SelectableTransformer
|
||||
private static final String DEFAULT_ENCODING = "UTF-8";
|
||||
|
||||
@Override
|
||||
public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
|
||||
final String targetMimetype, final Map<String, String> parameters) throws Exception
|
||||
public void transform(final String sourceMimetype, final String targetMimetype, final Map<String, String> parameters,
|
||||
final File sourceFile, final File targetFile) throws Exception
|
||||
{
|
||||
logger.debug("Performing RFC822 to text transform.");
|
||||
// Use try-with-resources
|
||||
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -26,6 +26,12 @@
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import org.htmlparser.Parser;
|
||||
import org.htmlparser.beans.StringBean;
|
||||
import org.htmlparser.util.ParserException;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
@@ -36,12 +42,6 @@ import java.nio.charset.Charset;
|
||||
import java.nio.charset.IllegalCharsetNameException;
|
||||
import java.util.Map;
|
||||
|
||||
import org.htmlparser.Parser;
|
||||
import org.htmlparser.beans.StringBean;
|
||||
import org.htmlparser.util.ParserException;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Content transformer which wraps the HTML Parser library for
|
||||
* parsing HTML content.
|
||||
@@ -75,8 +75,8 @@ public class HtmlParserContentTransformer implements SelectableTransformer
|
||||
HtmlParserContentTransformer.class);
|
||||
|
||||
@Override
|
||||
public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
|
||||
final String targetMimetype, final Map<String, String> parameters) throws Exception
|
||||
public void transform(final String sourceMimetype, final String targetMimetype, final Map<String, String> parameters,
|
||||
final File sourceFile, final File targetFile) throws Exception
|
||||
{
|
||||
String sourceEncoding = parameters.get(SOURCE_ENCODING);
|
||||
checkEncodingParameter(sourceEncoding, SOURCE_ENCODING);
|
||||
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -26,13 +26,6 @@
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.openxml4j.opc.PackagePart;
|
||||
import org.apache.poi.openxml4j.opc.PackageRelationship;
|
||||
@@ -41,6 +34,13 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Extracts out Thumbnail JPEGs from OOXML files for thumbnailing and previewing.
|
||||
* This transformer will only work for OOXML files where thumbnailing was enabled,
|
||||
@@ -59,8 +59,8 @@ public class OOXMLThumbnailContentTransformer implements SelectableTransformer
|
||||
OOXMLThumbnailContentTransformer.class);
|
||||
|
||||
@Override
|
||||
public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
|
||||
final String targetMimetype, final Map<String, String> parameters) throws Exception
|
||||
public void transform(final String sourceMimetype, final String targetMimetype, final Map<String, String> parameters,
|
||||
final File sourceFile, final File targetFile) throws Exception
|
||||
{
|
||||
if (logger.isDebugEnabled())
|
||||
{
|
||||
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -39,14 +39,18 @@ public interface SelectableTransformer
|
||||
String SOURCE_ENCODING = "sourceEncoding";
|
||||
String TARGET_ENCODING = "targetEncoding";
|
||||
|
||||
/**
|
||||
* Implementation of the actual transformation.
|
||||
*
|
||||
* @param sourceFile
|
||||
* @param targetFile
|
||||
* @param parameters
|
||||
* @throws Exception
|
||||
*/
|
||||
void transform(File sourceFile, File targetFile, String sourceMimetype,
|
||||
String targetMimetype, Map<String, String> parameters) throws Exception;
|
||||
default void transform(String sourceMimetype, String targetMimetype, Map<String, String> parameters,
|
||||
File sourceFile, File targetFile) throws Exception
|
||||
{
|
||||
}
|
||||
|
||||
default void extractMetadata(String sourceMimetype, String targetMimetype, Map<String, String> transformOptions,
|
||||
File sourceFile, File targetFile) throws Exception
|
||||
{
|
||||
}
|
||||
|
||||
default void embedMetadata(String sourceMimetype, String targetMimetype, Map<String, String> transformOptions,
|
||||
File sourceFile, File targetFile) throws Exception
|
||||
{
|
||||
}
|
||||
}
|
||||
|
@@ -26,19 +26,17 @@
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import static org.springframework.http.HttpStatus.BAD_REQUEST;
|
||||
import static org.springframework.http.HttpStatus.INTERNAL_SERVER_ERROR;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import org.alfresco.transformer.executors.Transformer;
|
||||
import org.alfresco.transformer.logging.LogEntry;
|
||||
import org.alfresco.transformer.metadataExtractors.HtmlMetadataExtractor;
|
||||
import org.alfresco.transformer.metadataExtractors.RFC822MetadataExtractor;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Map;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
import org.alfresco.transform.exceptions.TransformException;
|
||||
import org.alfresco.transformer.logging.LogEntry;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import static org.alfresco.transformer.util.RequestParamMap.TRANSFORM_NAME_PARAMETER;
|
||||
|
||||
/**
|
||||
* The SelectingTransformer selects a registered {@link SelectableTransformer}
|
||||
@@ -46,9 +44,9 @@ import com.google.common.collect.ImmutableMap;
|
||||
*
|
||||
* @author eknizat
|
||||
*/
|
||||
public class SelectingTransformer
|
||||
public class SelectingTransformer implements Transformer
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(SelectingTransformer.class);
|
||||
private static final String ID = "misc";
|
||||
|
||||
public static final String LICENCE =
|
||||
"This transformer uses libraries from Apache. See the license at http://www.apache.org/licenses/LICENSE-2.0. or in /Apache\\\\ 2.0.txt\\n" +
|
||||
@@ -63,57 +61,45 @@ public class SelectingTransformer
|
||||
.put("textToPdf", new TextToPdfContentTransformer())
|
||||
.put("rfc822", new EMLTransformer())
|
||||
.put("ooXmlThumbnail", new OOXMLThumbnailContentTransformer())
|
||||
.put("HtmlMetadataExtractor", new HtmlMetadataExtractor())
|
||||
.put("RFC822MetadataExtractor", new RFC822MetadataExtractor())
|
||||
.build();
|
||||
|
||||
/**
|
||||
* Performs a transform using a transformer selected based on the provided sourceMimetype and targetMimetype
|
||||
*
|
||||
* @param transform the name of the transformer
|
||||
* @param sourceFile File to transform from
|
||||
* @param targetFile File to transform to
|
||||
* @param sourceMimetype Mimetype of the source file
|
||||
* @throws TransformException if there was a problem internally
|
||||
*/
|
||||
public void transform(String transform, File sourceFile, File targetFile, String sourceMimetype,
|
||||
String targetMimetype, Map<String, String> parameters) throws TransformException
|
||||
@Override
|
||||
public String getTransformerId()
|
||||
{
|
||||
try
|
||||
{
|
||||
final SelectableTransformer transformer = transformers.get(transform);
|
||||
logOptions(sourceFile, targetFile, parameters);
|
||||
transformer.transform(sourceFile, targetFile, sourceMimetype, targetMimetype,
|
||||
parameters);
|
||||
}
|
||||
catch (IllegalArgumentException e)
|
||||
{
|
||||
throw new TransformException(BAD_REQUEST.value(), getMessage(e));
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
throw new TransformException(INTERNAL_SERVER_ERROR.value(), getMessage(e));
|
||||
}
|
||||
if (!targetFile.exists())
|
||||
{
|
||||
throw new TransformException(INTERNAL_SERVER_ERROR.value(),
|
||||
"Transformer failed to create an output file. Target file does not exist.");
|
||||
}
|
||||
if (sourceFile.length() > 0 && targetFile.length() == 0)
|
||||
{
|
||||
throw new TransformException(INTERNAL_SERVER_ERROR.value(),
|
||||
"Transformer failed to create an output file. Target file is empty but source file was not empty.");
|
||||
}
|
||||
return ID;
|
||||
}
|
||||
|
||||
private static String getMessage(Exception e)
|
||||
@Override
|
||||
public void transform(String transformName, String sourceMimetype, String targetMimetype,
|
||||
Map<String, String> transformOptions,
|
||||
File sourceFile, File targetFile) throws Exception
|
||||
{
|
||||
return e.getMessage() == null || e.getMessage().isEmpty() ? e.getClass().getSimpleName() : e.getMessage();
|
||||
final SelectableTransformer transformer = transformers.get(transformName);
|
||||
logOptions(sourceFile, targetFile, transformOptions);
|
||||
transformer.transform(sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile);
|
||||
}
|
||||
|
||||
public void extractMetadata(String transformName, String sourceMimetype, String targetMimetype,
|
||||
Map<String, String> transformOptions,
|
||||
File sourceFile, File targetFile) throws Exception
|
||||
{
|
||||
final SelectableTransformer transformer = transformers.get(transformName);
|
||||
logOptions(sourceFile, targetFile, transformOptions);
|
||||
transformer.extractMetadata(sourceMimetype, targetMimetype, transformOptions, sourceFile, targetFile);
|
||||
}
|
||||
|
||||
private static void logOptions(File sourceFile, File targetFile, Map<String, String> parameters)
|
||||
{
|
||||
StringJoiner sj = new StringJoiner(" ");
|
||||
parameters.forEach((k, v) -> sj.add(
|
||||
"--" + k + "=" + v)); // keeping the existing style used in other T-Engines
|
||||
parameters.forEach((k, v) ->
|
||||
{
|
||||
if (!TRANSFORM_NAME_PARAMETER.equals(k))
|
||||
{
|
||||
sj.add("--" + k + "=" + v);
|
||||
}
|
||||
}); // keeping the existing style used in other T-Engines
|
||||
sj.add(getExtension(sourceFile));
|
||||
sj.add(getExtension(targetFile));
|
||||
LogEntry.setOptions(sj.toString());
|
||||
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -26,6 +26,9 @@
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.File;
|
||||
@@ -39,9 +42,6 @@ import java.nio.charset.Charset;
|
||||
import java.nio.charset.IllegalCharsetNameException;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
* Converts any textual format to plain text.
|
||||
* <p>
|
||||
@@ -58,7 +58,7 @@ import org.apache.commons.logging.LogFactory;
|
||||
public class StringExtractingContentTransformer implements SelectableTransformer
|
||||
{
|
||||
|
||||
private static final Log logger = LogFactory.getLog(StringExtractingContentTransformer.class);
|
||||
private static final Logger logger = LoggerFactory.getLogger(StringExtractingContentTransformer.class);
|
||||
|
||||
/**
|
||||
* Text to text conversions are done directly using the content reader and writer string
|
||||
@@ -69,8 +69,8 @@ public class StringExtractingContentTransformer implements SelectableTransformer
|
||||
* be unformatted but valid.
|
||||
*/
|
||||
@Override
|
||||
public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
|
||||
final String targetMimetype, final Map<String, String> parameters) throws Exception
|
||||
public void transform(final String sourceMimetype, final String targetMimetype, final Map<String, String> parameters,
|
||||
final File sourceFile, final File targetFile) throws Exception
|
||||
{
|
||||
String sourceEncoding = parameters.get(SOURCE_ENCODING);
|
||||
String targetEncoding = parameters.get(TARGET_ENCODING);
|
||||
@@ -126,11 +126,11 @@ public class StringExtractingContentTransformer implements SelectableTransformer
|
||||
{
|
||||
if (charReader != null)
|
||||
{
|
||||
try { charReader.close(); } catch (Throwable e) { logger.error(e); }
|
||||
try { charReader.close(); } catch (Throwable e) { logger.error("Failed to close charReader", e); }
|
||||
}
|
||||
if (charWriter != null)
|
||||
{
|
||||
try { charWriter.close(); } catch (Throwable e) { logger.error(e); }
|
||||
try { charWriter.close(); } catch (Throwable e) { logger.error("Failed to close charWriter", e); }
|
||||
}
|
||||
}
|
||||
// done
|
||||
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -26,6 +26,15 @@
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import org.alfresco.transformer.util.RequestParamMap;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.apache.pdfbox.tools.TextToPDF;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
@@ -40,14 +49,6 @@ import java.nio.charset.Charset;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.apache.pdfbox.tools.TextToPDF;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* This code is based on a class of the same name originally implemented in alfresco-repository.
|
||||
@@ -62,7 +63,7 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(TextToPdfContentTransformer.class);
|
||||
|
||||
public static final String PAGE_LIMIT = "pageLimit";
|
||||
public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT;
|
||||
|
||||
private final PagedTextToPDF transformer;
|
||||
|
||||
@@ -98,8 +99,8 @@ public class TextToPdfContentTransformer implements SelectableTransformer
|
||||
}
|
||||
|
||||
@Override
|
||||
public void transform(final File sourceFile, final File targetFile, final String sourceMimetype,
|
||||
final String targetMimetype, final Map<String, String> parameters) throws Exception
|
||||
public void transform(final String sourceMimetype, final String targetMimetype, final Map<String, String> parameters,
|
||||
final File sourceFile, final File targetFile) throws Exception
|
||||
{
|
||||
String sourceEncoding = parameters.get(SOURCE_ENCODING);
|
||||
String stringPageLimit = parameters.get(PAGE_LIMIT);
|
||||
|
@@ -0,0 +1,12 @@
|
||||
#
|
||||
# HtmlMetadataExtractor - default mapping
|
||||
#
|
||||
# author: Derek Hulley
|
||||
|
||||
# Namespaces
|
||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
|
||||
# Mappings
|
||||
author=cm:author
|
||||
title=cm:title
|
||||
description=cm:description
|
@@ -0,0 +1,22 @@
|
||||
#
|
||||
# RFC822MetadataExtractor - default mapping
|
||||
#
|
||||
|
||||
# Namespaces
|
||||
namespace.prefix.imap=http://www.alfresco.org/model/imap/1.0
|
||||
namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
|
||||
|
||||
# Mappings
|
||||
|
||||
# Default values that don't exactly match a Header name
|
||||
messageFrom=imap:messageFrom, cm:originator
|
||||
messageTo=imap:messageTo, cm:addressee
|
||||
messageCc=imap:messageCc, cm:addressees
|
||||
messageSubject=imap:messageSubject, cm:title, cm:description, cm:subjectline
|
||||
messageSent=imap:dateSent, cm:sentdate
|
||||
messageReceived=imap:dateReceived
|
||||
|
||||
#Add here any values you want to extract.
|
||||
# Use the Header name as the key. The RHS is a comma-separated list of the destination properties.
|
||||
Thread-Index=imap:threadIndex
|
||||
Message-ID=imap:messageId
|
@@ -69,6 +69,23 @@
|
||||
],
|
||||
"transformOptions": [
|
||||
]
|
||||
},
|
||||
{
|
||||
"transformerName": "HtmlMetadataExtractor",
|
||||
"supportedSourceAndTargetList": [
|
||||
{"sourceMediaType": "text/html", "targetMediaType": "alfresco-metadata-extract"},
|
||||
{"sourceMediaType": "application/xhtml+xml", "targetMediaType": "alfresco-metadata-extract"}
|
||||
],
|
||||
"transformOptions": [
|
||||
]
|
||||
},
|
||||
{
|
||||
"transformerName": "RFC822MetadataExtractor",
|
||||
"supportedSourceAndTargetList": [
|
||||
{"sourceMediaType": "message/rfc822", "targetMediaType": "alfresco-metadata-extract"}
|
||||
],
|
||||
"transformOptions": [
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -26,8 +26,7 @@
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import static org.alfresco.transformer.transformers.StringExtractingContentTransformer.SOURCE_ENCODING;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
@@ -36,7 +35,8 @@ import java.nio.file.Files;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.junit.Test;
|
||||
import static org.alfresco.transformer.transformers.StringExtractingContentTransformer.SOURCE_ENCODING;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
public class HtmlParserContentTransformerTest
|
||||
{
|
||||
@@ -81,7 +81,7 @@ public class HtmlParserContentTransformerTest
|
||||
|
||||
Map<String, String> parameters = new HashMap<>();
|
||||
parameters.put(SOURCE_ENCODING, "ISO-8859-1");
|
||||
transformer.transform(tmpS, tmpD, SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters);
|
||||
transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD);
|
||||
|
||||
assertEquals(expected, readFromFile(tmpD, "UTF-8"));
|
||||
tmpS.delete();
|
||||
@@ -94,7 +94,7 @@ public class HtmlParserContentTransformerTest
|
||||
tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
|
||||
parameters = new HashMap<>();
|
||||
parameters.put(SOURCE_ENCODING, "UTF-8");
|
||||
transformer.transform(tmpS, tmpD, SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters);
|
||||
transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD);
|
||||
assertEquals(expected, readFromFile(tmpD, "UTF-8"));
|
||||
tmpS.delete();
|
||||
tmpD.delete();
|
||||
@@ -106,7 +106,7 @@ public class HtmlParserContentTransformerTest
|
||||
tmpD = File.createTempFile("AlfrescoTestTarget_", ".txt");
|
||||
parameters = new HashMap<>();
|
||||
parameters.put(SOURCE_ENCODING, "UTF-16");
|
||||
transformer.transform(tmpS, tmpD, SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters);
|
||||
transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD);
|
||||
assertEquals(expected, readFromFile(tmpD, "UTF-8"));
|
||||
tmpS.delete();
|
||||
tmpD.delete();
|
||||
@@ -131,7 +131,7 @@ public class HtmlParserContentTransformerTest
|
||||
|
||||
parameters = new HashMap<>();
|
||||
parameters.put(SOURCE_ENCODING, "ISO-8859-1");
|
||||
transformer.transform(tmpS, tmpD, SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters);
|
||||
transformer.transform(SOURCE_MIMETYPE, TARGET_MIMETYPE, parameters, tmpS, tmpD);
|
||||
assertEquals(expected, readFromFile(tmpD, "UTF-8"));
|
||||
tmpS.delete();
|
||||
tmpD.delete();
|
||||
|
@@ -2,7 +2,7 @@
|
||||
* #%L
|
||||
* Alfresco Transform Core
|
||||
* %%
|
||||
* Copyright (C) 2005 - 2019 Alfresco Software Limited
|
||||
* Copyright (C) 2005 - 2020 Alfresco Software Limited
|
||||
* %%
|
||||
* This file is part of the Alfresco software.
|
||||
* -
|
||||
@@ -26,8 +26,10 @@
|
||||
*/
|
||||
package org.alfresco.transformer.transformers;
|
||||
|
||||
import static org.alfresco.transformer.transformers.TextToPdfContentTransformer.PAGE_LIMIT;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
@@ -36,10 +38,8 @@ import java.io.StringWriter;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import static org.alfresco.transformer.util.RequestParamMap.PAGE_LIMIT;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
public class TextToPdfContentTransformerTest
|
||||
{
|
||||
@@ -109,7 +109,7 @@ public class TextToPdfContentTransformerTest
|
||||
// Transform to PDF
|
||||
Map<String, String> parameters = new HashMap<>();
|
||||
parameters.put(PAGE_LIMIT, pageLimit);
|
||||
transformer.transform(sourceFile, targetFile, "text/plain", "application/pdf", parameters);
|
||||
transformer.transform("text/plain", "application/pdf", parameters, sourceFile, targetFile);
|
||||
|
||||
// Read back in the PDF and check it
|
||||
PDDocument doc = PDDocument.load(targetFile);
|
||||
|
Reference in New Issue
Block a user