" + TEXT_P1 + "
" + NEWLINE + - "" + TEXT_P2 + "
" + NEWLINE + - "" + TEXT_P3 + "
" + NEWLINE; - String partC = ""; - final String expected = TITLE + NEWLINE + TEXT_P1 + NEWLINE + TEXT_P2 + NEWLINE + TEXT_P3 + NEWLINE; - - MvcResult result = sendRequest("html", - "UTF-8", - MIMETYPE_HTML, - "txt", - MIMETYPE_TEXT_PLAIN, - null, - null, - null, - expected.getBytes()); - - String contentResult = new String(result.getResponse().getContentAsByteArray(), - targetEncoding); - assertTrue(contentResult.contains(expected), "The content did not include \"" + expected); - } - - @Test - public void testStringToString() throws Exception - { - String expected; - byte[] content; - try - { - content = "azAz10!�$%^&*()\t\r\n".getBytes(UTF_8); - expected = new String(content, "MacDingbat"); - } - catch (UnsupportedEncodingException e) - { - throw new RuntimeException("Encoding not recognised", e); - } - - MvcResult result = sendRequest("txt", - "MacDingbat", - MIMETYPE_TEXT_PLAIN, - "txt", - MIMETYPE_TEXT_PLAIN, - "UTF-8", - null, - null, - content); - - String contentResult = new String(result.getResponse().getContentAsByteArray(), - targetEncoding); - assertTrue(contentResult.contains(expected), "The content did not include \"" + expected); - } - - @Test - public void testEmptyTextFileReturnsEmptyFile() throws Exception - { - // Use empty content to create an empty source file - byte[] content = new byte[0]; - - MvcResult result = sendRequest("txt", - "UTF-8", - MIMETYPE_TEXT_PLAIN, - "txt", - MIMETYPE_TEXT_PLAIN, - "UTF-8", - null, - null, - content); - - assertEquals(0, result.getResponse().getContentLength(), - "Returned content should be empty for an empty source file"); - } - - @Test - public void textToPdf() throws Exception - { - StringBuilder sb = new StringBuilder(); - for (int i = 1; i <= 5; i++) - { - sb.append(Integer.toString(i)); - sb.append(" I must not talk in class or feed my homework to my cat.\n"); - } - sb.append("\nBart\n"); - String expected = sb.toString(); - - MvcResult result = sendRequest("txt", - "UTF-8", - MIMETYPE_TEXT_PLAIN, - "pdf", - MIMETYPE_PDF, - null, - "1", - null, - expected.getBytes()); - - // Read back in the PDF and check it - PDDocument doc = PDDocument.load(result.getResponse().getContentAsByteArray()); - PDFTextStripper textStripper = new PDFTextStripper(); - StringWriter textWriter = new StringWriter(); - textStripper.writeText(doc, textWriter); - doc.close(); - - expected = clean(expected); - String actual = clean(textWriter.toString()); - - assertEquals(expected, actual, "The content did not match."); - } - - @Test - public void testAppleIWorksPages() throws Exception - { - MvcResult result = sendRequest("numbers", null, MIMETYPE_IWORK_NUMBERS, - "jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("pages")); - assertTrue(result.getResponse().getContentLengthLong() > 0L, - "Expected image content but content is empty."); - } - - @Test - public void testAppleIWorksNumbers() throws Exception - { - MvcResult result = sendRequest("numbers", null, MIMETYPE_IWORK_NUMBERS, - "jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("numbers")); - assertTrue(result.getResponse().getContentLengthLong() > 0L, - "Expected image content but content is empty."); - } - - @Test - public void testAppleIWorksKey() throws Exception - { - MvcResult result = sendRequest("key", null, MIMETYPE_IWORK_KEYNOTE, - "jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("key")); - assertTrue(result.getResponse().getContentLengthLong() > 0L, - "Expected image content but content is empty."); - } - - // @Test -// TODO Doesn't work with java 11, enable when fixed - public void testOOXML() throws Exception - { - MvcResult result = sendRequest("docx", null, MIMETYPE_OPENXML_WORDPROCESSING, - "jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("docx")); - assertTrue(result.getResponse().getContentLengthLong() > 0L, - "Expected image content but content is empty."); - } - - private MvcResult sendRequest(String sourceExtension, - String sourceEncoding, - String sourceMimetype, - String targetExtension, - String targetMimetype, - String targetEncoding, - String pageLimit, - String extractMapping, - byte[] content) throws Exception - { - final MockMultipartFile sourceFile = new MockMultipartFile("file", - "test_file." + sourceExtension, sourceMimetype, content); - - final MockHttpServletRequestBuilder requestBuilder = super - .mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile) - .param("targetExtension", targetExtension) - .param("targetMimetype", targetMimetype) - .param("sourceMimetype", sourceMimetype); - - // SourceEncoding is available in the options but is not used to select the transformer as it is a known - // like the source mimetype. - if (sourceEncoding != null) - { - requestBuilder.param("sourceEncoding", sourceEncoding); - } - if (targetEncoding != null) - { - requestBuilder.param("targetEncoding", targetEncoding); - } - if (pageLimit != null) - { - requestBuilder.param("pageLimit", pageLimit); - } - if (extractMapping != null) - { - requestBuilder.param("extractMapping", extractMapping); - } - - return mockMvc.perform(requestBuilder) - .andExpect(status().is(OK.value())) - .andExpect(header().string("Content-Disposition", - "attachment; filename*= " + - (targetEncoding == null ? "UTF-8" : targetEncoding) + - "''test_file." + targetExtension)) - .andReturn(); - } - - private String clean(String text) - { - text = text.replaceAll("\\s+\\r", ""); - text = text.replaceAll("\\s+\\n", ""); - text = text.replaceAll("\\r", ""); - text = text.replaceAll("\\n", ""); - return text; - } - - @Test - @Override - public void queueTransformRequestUsingDirectAccessUrlTest() throws Exception - { - super.targetMimetype = this.targetMimetype; - super.queueTransformRequestUsingDirectAccessUrlTest(); - } -} \ No newline at end of file +/* + * #%L + * Alfresco Transform Core + * %% + * Copyright (C) 2005 - 2022 Alfresco Software Limited + * %% + * This file is part of the Alfresco software. + * - + * If the software was purchased under a paid Alfresco license, the terms of + * the paid license agreement will prevail. Otherwise, the software is + * provided under the following open source license terms: + * - + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * - + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * - + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see" + TEXT_P1 + "
" + NEWLINE + + "" + TEXT_P2 + "
" + NEWLINE + + "" + TEXT_P3 + "
" + NEWLINE; + String partC = ""; + final String expected = TITLE + NEWLINE + TEXT_P1 + NEWLINE + TEXT_P2 + NEWLINE + TEXT_P3 + NEWLINE; + + MvcResult result = sendRequest("html", + "UTF-8", + MIMETYPE_HTML, + "txt", + MIMETYPE_TEXT_PLAIN, + null, + null, + null, + expected.getBytes()); + + String contentResult = new String(result.getResponse().getContentAsByteArray(), + targetEncoding); + assertTrue(contentResult.contains(expected), "The content did not include \"" + expected); + } + + @Test + public void testStringToString() throws Exception + { + String expected; + byte[] content; + try + { + content = "azAz10!�$%^&*()\t\r\n".getBytes(UTF_8); + expected = new String(content, "MacDingbat"); + } + catch (UnsupportedEncodingException e) + { + throw new RuntimeException("Encoding not recognised", e); + } + + MvcResult result = sendRequest("txt", + "MacDingbat", + MIMETYPE_TEXT_PLAIN, + "txt", + MIMETYPE_TEXT_PLAIN, + "UTF-8", + null, + null, + content); + + String contentResult = new String(result.getResponse().getContentAsByteArray(), + targetEncoding); + assertTrue(contentResult.contains(expected), "The content did not include \"" + expected); + } + + @Test + public void testEmptyTextFileReturnsEmptyFile() throws Exception + { + // Use empty content to create an empty source file + byte[] content = new byte[0]; + + MvcResult result = sendRequest("txt", + "UTF-8", + MIMETYPE_TEXT_PLAIN, + "txt", + MIMETYPE_TEXT_PLAIN, + "UTF-8", + null, + null, + content); + + assertEquals(0, result.getResponse().getContentLength(), + "Returned content should be empty for an empty source file"); + } + + @Test + public void textToPdf() throws Exception + { + StringBuilder sb = new StringBuilder(); + for (int i = 1; i <= 5; i++) + { + sb.append(Integer.toString(i)); + sb.append(" I must not talk in class or feed my homework to my cat.\n"); + } + sb.append("\nBart\n"); + String expected = sb.toString(); + + MvcResult result = sendRequest("txt", + "UTF-8", + MIMETYPE_TEXT_PLAIN, + "pdf", + MIMETYPE_PDF, + null, + "1", + null, + expected.getBytes()); + + // Read back in the PDF and check it + PDDocument doc = PDDocument.load(result.getResponse().getContentAsByteArray()); + PDFTextStripper textStripper = new PDFTextStripper(); + StringWriter textWriter = new StringWriter(); + textStripper.writeText(doc, textWriter); + doc.close(); + + expected = clean(expected); + String actual = clean(textWriter.toString()); + + assertEquals(expected, actual, "The content did not match."); + } + + @Test + public void testAppleIWorksPages() throws Exception + { + MvcResult result = sendRequest("numbers", null, MIMETYPE_IWORK_NUMBERS, + "jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("pages")); + assertTrue(result.getResponse().getContentLengthLong() > 0L, + "Expected image content but content is empty."); + } + + @Test + public void testAppleIWorksNumbers() throws Exception + { + MvcResult result = sendRequest("numbers", null, MIMETYPE_IWORK_NUMBERS, + "jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("numbers")); + assertTrue(result.getResponse().getContentLengthLong() > 0L, + "Expected image content but content is empty."); + } + + @Test + public void testAppleIWorksKey() throws Exception + { + MvcResult result = sendRequest("key", null, MIMETYPE_IWORK_KEYNOTE, + "jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("key")); + assertTrue(result.getResponse().getContentLengthLong() > 0L, + "Expected image content but content is empty."); + } + + // @Test +// TODO Doesn't work with java 11, enable when fixed + public void testOOXML() throws Exception + { + MvcResult result = sendRequest("docx", null, MIMETYPE_OPENXML_WORDPROCESSING, + "jpeg", MIMETYPE_IMAGE_JPEG, null, null, null, readTestFile("docx")); + assertTrue(result.getResponse().getContentLengthLong() > 0L, + "Expected image content but content is empty."); + } + + private MvcResult sendRequest(String sourceExtension, + String sourceEncoding, + String sourceMimetype, + String targetExtension, + String targetMimetype, + String targetEncoding, + String pageLimit, + String extractMapping, + byte[] content) throws Exception + { + final MockMultipartFile sourceFile = new MockMultipartFile("file", + "test_file." + sourceExtension, sourceMimetype, content); + + final MockHttpServletRequestBuilder requestBuilder = super + .mockMvcRequest(ENDPOINT_TRANSFORM, sourceFile) + .param("targetExtension", targetExtension) + .param("targetMimetype", targetMimetype) + .param("sourceMimetype", sourceMimetype); + + // SourceEncoding is available in the options but is not used to select the transformer as it is a known + // like the source mimetype. + if (sourceEncoding != null) + { + requestBuilder.param("sourceEncoding", sourceEncoding); + } + if (targetEncoding != null) + { + requestBuilder.param("targetEncoding", targetEncoding); + } + if (pageLimit != null) + { + requestBuilder.param("pageLimit", pageLimit); + } + if (extractMapping != null) + { + requestBuilder.param("extractMapping", extractMapping); + } + + return mockMvc.perform(requestBuilder) + .andExpect(status().is(OK.value())) + .andExpect(header().string("Content-Disposition", + "attachment; filename*= " + + (targetEncoding == null ? "UTF-8" : targetEncoding) + + "''test_file." + targetExtension)) + .andReturn(); + } + + private String clean(String text) + { + text = text.replaceAll("\\s+\\r", ""); + text = text.replaceAll("\\s+\\n", ""); + text = text.replaceAll("\\r", ""); + text = text.replaceAll("\\n", ""); + return text; + } + + @Test + @Override + public void queueTransformRequestUsingDirectAccessUrlTest() throws Exception + { + super.targetMimetype = this.targetMimetype; + super.queueTransformRequestUsingDirectAccessUrlTest(); + } +} diff --git a/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/java/org/alfresco/transformer/MiscQueueTransformServiceIT.java b/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/java/org/alfresco/transformer/MiscQueueTransformServiceIT.java index 6e4479a6..20e90e53 100644 --- a/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/java/org/alfresco/transformer/MiscQueueTransformServiceIT.java +++ b/alfresco-transform-misc/alfresco-transform-misc-boot/src/test/java/org/alfresco/transformer/MiscQueueTransformServiceIT.java @@ -1,55 +1,55 @@ -/* - * #%L - * Alfresco Transform Core - * %% - * Copyright (C) 2005 - 2021 Alfresco Software Limited - * %% - * This file is part of the Alfresco software. - * - - * If the software was purchased under a paid Alfresco license, the terms of - * the paid license agreement will prevail. Otherwise, the software is - * provided under the following open source license terms: - * - - * Alfresco is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - - * Alfresco is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - - * You should have received a copy of the GNU Lesser General Public License - * along with Alfresco. If not, see- * This code is based on a class of the same name originally implemented in alfresco-repository. - *
- * - * @author Neil Mc Erlean - * @author eknizat - * @since 4.0 - */ -public class AppleIWorksContentTransformer implements SelectableTransformer -{ - private static final Logger logger = LoggerFactory.getLogger( - AppleIWorksContentTransformer.class); - - // Apple's zip entry names for previews in iWorks have changed over time. - private static final List+ * This code is based on a class of the same name originally implemented in alfresco-repository. + *
+ * + * @author Neil Mc Erlean + * @author eknizat + * @since 4.0 + */ +public class AppleIWorksContentTransformer implements SelectableTransformer +{ + private static final Logger logger = LoggerFactory.getLogger( + AppleIWorksContentTransformer.class); + + // Apple's zip entry names for previews in iWorks have changed over time. + private static final List- * This code is based on a class of the same name originally implemented in alfresco-repository. - *
- */ -public class EMLTransformer implements SelectableTransformer - -{ - private static final Logger logger = LoggerFactory.getLogger(EMLTransformer.class); - - private static final String CHARSET = "charset"; - private static final String DEFAULT_ENCODING = "UTF-8"; - - @Override - public void transform(final String sourceMimetype, final String targetMimetype, final Map+ * This code is based on a class of the same name originally implemented in alfresco-repository. + *
+ */ +public class EMLTransformer implements SelectableTransformer + +{ + private static final Logger logger = LoggerFactory.getLogger(EMLTransformer.class); + + private static final String CHARSET = "charset"; + private static final String DEFAULT_ENCODING = "UTF-8"; + + @Override + public void transform(final String sourceMimetype, final String targetMimetype, final Map- * This code is based on a class of the same name originally implemented in alfresco-repository. - *
- * - *- * Since HTML Parser was updated from v1.6 to v2.1, META tags - * defining an encoding for the content via http-equiv=Content-Type - * will ONLY be respected if the encoding of the content item - * itself is set to ISO-8859-1. - *
- * - *- * Tika Note - could be converted to use the Tika HTML parser, - * but we'd potentially need a custom text handler to replicate - * the current settings around links and non-breaking spaces. - *
- * - * @author Derek Hulley - * @author eknizat - * @see http://htmlparser.sourceforge.net - * @see org.htmlparser.beans.StringBean - * @see HTML Parser - */ -public class HtmlParserContentTransformer implements SelectableTransformer -{ - private static final Logger logger = LoggerFactory.getLogger( - HtmlParserContentTransformer.class); - - @Override - public void transform(final String sourceMimetype, final String targetMimetype, final Map- * This code is based on a class of the same name, originally implemented in alfresco-repository. - *
- * - * A version of {@link StringBean} which allows control of the - * encoding in the underlying HTML Parser. - * Unfortunately, StringBean doesn't allow easy over-riding of - * this, so we have to duplicate some code to control this. - * This allows us to correctly handle HTML files where the encoding - * is specified against the content property (rather than in the - * HTML Head Meta), see ALF-10466 for details. - */ - public static class EncodingAwareStringBean extends StringBean - { - private static final long serialVersionUID = -9033414360428669553L; - - /** - * Sets the File to extract strings from, and the encoding - * it's in (if known to Alfresco) - * - * @param file The File that text should be fetched from. - * @param encoding The encoding of the input - */ - public void setURL(File file, String encoding) - { - String previousURL = getURL(); - String newURL = file.getAbsolutePath(); - - if (previousURL == null || !newURL.equals(previousURL)) - { - try - { - URLConnection conn = getConnection(); - - if (null == mParser) - { - mParser = new Parser(newURL); - } - else - { - mParser.setURL(newURL); - } - - if (encoding != null) - { - mParser.setEncoding(encoding); - } - - mPropertySupport.firePropertyChange(StringBean.PROP_URL_PROPERTY, previousURL, - getURL()); - mPropertySupport.firePropertyChange(StringBean.PROP_CONNECTION_PROPERTY, conn, - mParser.getConnection()); - setStrings(); - } - catch (ParserException pe) - { - updateStrings(pe.toString()); - } - } - } - - public String getEncoding() - { - return mParser.getEncoding(); - } - } -} +/* + * #%L + * Alfresco Transform Core + * %% + * Copyright (C) 2005 - 2022 Alfresco Software Limited + * %% + * This file is part of the Alfresco software. + * - + * If the software was purchased under a paid Alfresco license, the terms of + * the paid license agreement will prevail. Otherwise, the software is + * provided under the following open source license terms: + * - + * Alfresco is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * - + * Alfresco is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * - + * You should have received a copy of the GNU Lesser General Public License + * along with Alfresco. If not, see+ * This code is based on a class of the same name originally implemented in alfresco-repository. + *
+ * + *+ * Since HTML Parser was updated from v1.6 to v2.1, META tags + * defining an encoding for the content via http-equiv=Content-Type + * will ONLY be respected if the encoding of the content item + * itself is set to ISO-8859-1. + *
+ * + *+ * Tika Note - could be converted to use the Tika HTML parser, + * but we'd potentially need a custom text handler to replicate + * the current settings around links and non-breaking spaces. + *
+ * + * @author Derek Hulley + * @author eknizat + * @see http://htmlparser.sourceforge.net + * @see org.htmlparser.beans.StringBean + * @see HTML Parser + */ +public class HtmlParserContentTransformer implements SelectableTransformer +{ + private static final Logger logger = LoggerFactory.getLogger( + HtmlParserContentTransformer.class); + + @Override + public void transform(final String sourceMimetype, final String targetMimetype, final Map+ * This code is based on a class of the same name, originally implemented in alfresco-repository. + *
+ * + * A version of {@link StringBean} which allows control of the + * encoding in the underlying HTML Parser. + * Unfortunately, StringBean doesn't allow easy over-riding of + * this, so we have to duplicate some code to control this. + * This allows us to correctly handle HTML files where the encoding + * is specified against the content property (rather than in the + * HTML Head Meta), see ALF-10466 for details. + */ + public static class EncodingAwareStringBean extends StringBean + { + private static final long serialVersionUID = -9033414360428669553L; + + /** + * Sets the File to extract strings from, and the encoding + * it's in (if known to Alfresco) + * + * @param file The File that text should be fetched from. + * @param encoding The encoding of the input + */ + public void setURL(File file, String encoding) + { + String previousURL = getURL(); + String newURL = file.getAbsolutePath(); + + if (previousURL == null || !newURL.equals(previousURL)) + { + try + { + URLConnection conn = getConnection(); + + if (null == mParser) + { + mParser = new Parser(newURL); + } + else + { + mParser.setURL(newURL); + } + + if (encoding != null) + { + mParser.setEncoding(encoding); + } + + mPropertySupport.firePropertyChange(StringBean.PROP_URL_PROPERTY, previousURL, + getURL()); + mPropertySupport.firePropertyChange(StringBean.PROP_CONNECTION_PROPERTY, conn, + mParser.getConnection()); + setStrings(); + } + catch (ParserException pe) + { + updateStrings(pe.toString()); + } + } + } + + public String getEncoding() + { + return mParser.getEncoding(); + } + } +} diff --git a/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/transformers/OOXMLThumbnailContentTransformer.java b/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/transformers/OOXMLThumbnailContentTransformer.java index baed9694..29e092c9 100644 --- a/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/transformers/OOXMLThumbnailContentTransformer.java +++ b/alfresco-transform-misc/alfresco-transform-misc/src/main/java/org/alfresco/transformer/transformers/OOXMLThumbnailContentTransformer.java @@ -1,130 +1,130 @@ -/* - * #%L - * Alfresco Transform Core - * %% - * Copyright (C) 2005 - 2020 Alfresco Software Limited - * %% - * This file is part of the Alfresco software. - * - - * If the software was purchased under a paid Alfresco license, the terms of - * the paid license agreement will prevail. Otherwise, the software is - * provided under the following open source license terms: - * - - * Alfresco is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - - * Alfresco is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - - * You should have received a copy of the GNU Lesser General Public License - * along with Alfresco. If not, see- * This code is based on a class of the same name originally implemented in alfresco-repository. - *
- * - * @author Nick Burch - * @author eknizat - */ -public class OOXMLThumbnailContentTransformer implements SelectableTransformer -{ - private static final Logger logger = LoggerFactory.getLogger( - OOXMLThumbnailContentTransformer.class); - - @Override - public void transform(final String sourceMimetype, final String targetMimetype, final Map+ * This code is based on a class of the same name originally implemented in alfresco-repository. + *
+ * + * @author Nick Burch + * @author eknizat + */ +public class OOXMLThumbnailContentTransformer implements SelectableTransformer +{ + private static final Logger logger = LoggerFactory.getLogger( + OOXMLThumbnailContentTransformer.class); + + @Override + public void transform(final String sourceMimetype, final String targetMimetype, final Map- * The transformation is sensitive to the source and target string encodings. - * - * - *
- * This code is based on a class of the same name originally implemented in alfresco-repository. - *
- * - * @author Derek Hulley - * @author eknizat - */ -public class StringExtractingContentTransformer implements SelectableTransformer -{ - - private static final Logger logger = LoggerFactory.getLogger(StringExtractingContentTransformer.class); - - /** - * Text to text conversions are done directly using the content reader and writer string - * manipulation methods. - *
- * Extraction of text from binary content attempts to take the possible character
- * encoding into account. The text produced from this will, if the encoding was correct,
- * be unformatted but valid.
- */
- @Override
- public void transform(final String sourceMimetype, final String targetMimetype, final Map
+ * The transformation is sensitive to the source and target string encodings.
+ *
+ *
+ *
+ * This code is based on a class of the same name originally implemented in alfresco-repository.
+ *
+ * Extraction of text from binary content attempts to take the possible character
+ * encoding into account. The text produced from this will, if the encoding was correct,
+ * be unformatted but valid.
+ */
+ @Override
+ public void transform(final String sourceMimetype, final String targetMimetype, final Map
- * This code is based on a class of the same name originally implemented in alfresco-repository.
- *
+ * This code is based on a class of the same name originally implemented in alfresco-repository.
+ * " + TEXT_P1 + " " + TEXT_P2 + " " + TEXT_P3 + " " + TEXT_P1 + " " + TEXT_P2 + " " + TEXT_P3 + "
- *
- * @param dateStrings
- * @return dateStrings in Iso8601 format
- * @see #iptcToIso8601DateString
- */
- protected String[] iptcToIso8601DateStrings(String[] dateStrings)
- {
- for (int i = 0; i < dateStrings.length; i++)
- {
- dateStrings[i] = iptcToIso8601DateString(dateStrings[i]);
- }
- return dateStrings;
- }
-
- /**
- * Converts a date or date time string into Iso8601 format
- * Converts any ':' in the year portion of a date string characters to '-'.
- * Expects the year in the format YYYY:MM:DD or YYYY-MM-DD
- * Will add the correct delimiter, 'T', to any dateTime strings, where | can be any char other than ,'T':
- * YYYY:MM:DD|HH:mm:ss.... or YYYY-MM-DD|HH:mm:ss....
- *
- * Examples:
+ *
+ * @param dateStrings
+ * @return dateStrings in Iso8601 format
+ * @see #iptcToIso8601DateString
+ */
+ protected String[] iptcToIso8601DateStrings(String[] dateStrings)
+ {
+ for (int i = 0; i < dateStrings.length; i++)
+ {
+ dateStrings[i] = iptcToIso8601DateString(dateStrings[i]);
+ }
+ return dateStrings;
+ }
+
+ /**
+ * Converts a date or date time string into Iso8601 format
+ * Converts any ':' in the year portion of a date string characters to '-'.
+ * Expects the year in the format YYYY:MM:DD or YYYY-MM-DD
+ * Will add the correct delimiter, 'T', to any dateTime strings, where | can be any char other than ,'T':
+ * YYYY:MM:DD|HH:mm:ss.... or YYYY-MM-DD|HH:mm:ss....
+ *
+ * Examples:
- * Executes the configured external command and passes the given document
- * stream as a simple XHTML document to the given SAX content handler.
- * Metadata is only extracted if {@link #setMetadataExtractionPatterns(Map)}
- * has been called to set patterns.
- */
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-
- MediaType mediaType = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
- TemporaryResources tmp = new TemporaryResources();
- try {
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
- if (this.getSupportedTypes().contains(mediaType)) {
- parse(tis, xhtml, metadata, tmp);
- }
- switch (mediaType.getType()+"/"+mediaType.getSubtype()) {
- case MIMETYPE_IMAGE_JPEG:
- parseAdditional(new JpegParser(), tis, handler, metadata, context, mediaType);
- break;
- case MIMETYPE_IMAGE_TIFF:
- parseAdditional(new TiffParser(), tis, handler, metadata, context, mediaType);
- break;
- default:
- parseAdditional(new ImageParser(), tis, handler, metadata, context, mediaType);
- }
- } finally {
- tmp.dispose();
- }
- }
-
- private void parseAdditional(Parser parser, TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext context,
- MediaType mediaType) throws IOException, SAXException, TikaException {
- if (parser.getSupportedTypes(context).contains(mediaType)) {
- parser.parse(tis, handler, metadata, context);
- }
- }
-
- private void parse(TikaInputStream stream, XHTMLContentHandler xhtml, Metadata metadata, TemporaryResources tmp)
- throws IOException, SAXException, TikaException {
- boolean inputToStdIn = true;
- boolean outputFromStdOut = true;
- boolean hasPatterns = (getMetadataExtractionPatterns() != null && !getMetadataExtractionPatterns().isEmpty());
-
- File output = null;
-
- // Build our getCommand()
- String[] cmd;
- if (getCommand().length == 1) {
- cmd = getCommand()[0].split(" ");
- } else {
- cmd = new String[getCommand().length];
- System.arraycopy(getCommand(), 0, cmd, 0, getCommand().length);
- }
- for (int i = 0; i < cmd.length; i++) {
- if (cmd[i].indexOf(INPUT_FILE_TOKEN) != -1) {
- cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN, stream.getFile().getPath());
- inputToStdIn = false;
- }
- if (cmd[i].indexOf(OUTPUT_FILE_TOKEN) != -1) {
- output = tmp.createTemporaryFile();
- outputFromStdOut = false;
- cmd[i] = cmd[i].replace(OUTPUT_FILE_TOKEN, output.getPath());
- }
- }
-
- // Execute
- Process process = null;
- try {
- if (cmd.length == 1) {
- process = Runtime.getRuntime().exec(cmd[0]);
- } else {
- process = Runtime.getRuntime().exec(cmd);
- }
- } catch (Exception e) {
- e.printStackTrace();
- }
-
- try {
- if (inputToStdIn) {
- sendInput(process, stream);
- } else {
- process.getOutputStream().close();
- }
-
- InputStream out = process.getInputStream();
- InputStream err = process.getErrorStream();
-
- if (hasPatterns) {
-
- if (outputFromStdOut) {
- extractOutput(out, xhtml);
- } else {
- extractMetadata(out, metadata);
- }
- } else {
- ignoreStream(err);
-
- if (outputFromStdOut) {
- extractOutput(out, xhtml);
- } else {
- ignoreStream(out);
- }
- }
- } finally {
- try {
- process.waitFor();
- } catch (InterruptedException ignore) {
- }
- }
-
- // Grab the output if we haven't already
- if (!outputFromStdOut) {
- extractOutput(new FileInputStream(output), xhtml);
- }
- }
-
- /**
- * Adapted from {@link org.apache.tika.parser.external.ExternalParser}
- * Starts a thread that extracts the contents of the standard output
- * stream of the given process to the given XHTML content handler.
- * The standard output stream is closed once fully processed.
- *
- * @param stream stream
- * @param xhtml XHTML content handler
- * @throws SAXException if the XHTML SAX events could not be handled
- * @throws IOException if an input error occurred
- */
- private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
- try (Reader reader = new InputStreamReader(stream, UTF_8)) {
- xhtml.startDocument();
- xhtml.startElement("p");
- char[] buffer = new char[1024];
- for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
- xhtml.characters(buffer, 0, n);
- }
- xhtml.endElement("p");
- xhtml.endDocument();
- }
- }
-
- /**
- * Adapted from {@link org.apache.tika.parser.external.ExternalParser}
- * Starts a thread that sends the contents of the given input stream
- * to the standard input stream of the given process. Potential
- * exceptions are ignored, and the standard input stream is closed
- * once fully processed. Note that the given input stream is not
- * closed by this method.
- *
- * @param process process
- * @param stream input stream
- */
- private void sendInput(final Process process, final InputStream stream) {
- Thread t = new Thread() {
- public void run() {
- OutputStream stdin = process.getOutputStream();
- try {
- IOUtils.copy(stream, stdin);
- } catch (IOException e) {
- }
- }
- };
- t.start();
- try {
- t.join();
- } catch (InterruptedException ignore) {
- }
- }
-
- /**
- * Adapted from {@link org.apache.tika.parser.external.ExternalParser}
- * Starts a thread that reads and discards the contents of the
- * standard stream of the given process. Potential exceptions
- * are ignored, and the stream is closed once fully processed.
- *
- * @param stream stream
- */
- private void ignoreStream(final InputStream stream) {
- Thread t = new Thread() {
- public void run() {
- try {
- IOUtils.copy(stream, NullOutputStream.NULL_OUTPUT_STREAM);
- } catch (IOException e) {
- } finally {
- IOUtils.closeQuietly(stream);
- }
- }
- };
- t.start();
- try {
- t.join();
- } catch (InterruptedException ignore) {
- }
- }
-
- private void extractMetadata(final InputStream stream, final Metadata metadata) {
- Thread t = new Thread() {
- public void run() {
- BufferedReader reader;
- reader = new BufferedReader(new InputStreamReader(stream, UTF_8));
- try {
- String line;
- while ((line = reader.readLine()) != null) {
- for (Pattern p : getMetadataExtractionPatterns().keySet()) {
- Matcher m = p.matcher(line);
- if (m.find()) {
- if (getMetadataExtractionPatterns().get(p) != null
- && !getMetadataExtractionPatterns().get(p).equals("")) {
- metadata.add(getMetadataExtractionPatterns().get(p), m.group(1));
- } else {
- metadata.add(m.group(1), m.group(2));
- }
- }
- }
- }
- } catch (IOException e) {
- // Ignore
- } finally {
- IOUtils.closeQuietly(reader);
- IOUtils.closeQuietly(stream);
- }
- }
- };
- t.start();
- try {
- t.join();
- } catch (InterruptedException ignore) {
- }
- }
-}
+/*
+ * #%L
+ * Alfresco Transform Core
+ * %%
+ * Copyright (C) 2005 - 2021 Alfresco Software Limited
+ * %%
+ * This file is part of the Alfresco software.
+ * -
+ * If the software was purchased under a paid Alfresco license, the terms of
+ * the paid license agreement will prevail. Otherwise, the software is
+ * provided under the following open source license terms:
+ * -
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * -
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ * -
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see
+ * Executes the configured external command and passes the given document
+ * stream as a simple XHTML document to the given SAX content handler.
+ * Metadata is only extracted if {@link #setMetadataExtractionPatterns(Map)}
+ * has been called to set patterns.
+ */
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+ MediaType mediaType = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ if (this.getSupportedTypes().contains(mediaType)) {
+ parse(tis, xhtml, metadata, tmp);
+ }
+ switch (mediaType.getType()+"/"+mediaType.getSubtype()) {
+ case MIMETYPE_IMAGE_JPEG:
+ parseAdditional(new JpegParser(), tis, handler, metadata, context, mediaType);
+ break;
+ case MIMETYPE_IMAGE_TIFF:
+ parseAdditional(new TiffParser(), tis, handler, metadata, context, mediaType);
+ break;
+ default:
+ parseAdditional(new ImageParser(), tis, handler, metadata, context, mediaType);
+ }
+ } finally {
+ tmp.dispose();
+ }
+ }
+
+ private void parseAdditional(Parser parser, TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext context,
+ MediaType mediaType) throws IOException, SAXException, TikaException {
+ if (parser.getSupportedTypes(context).contains(mediaType)) {
+ parser.parse(tis, handler, metadata, context);
+ }
+ }
+
+ private void parse(TikaInputStream stream, XHTMLContentHandler xhtml, Metadata metadata, TemporaryResources tmp)
+ throws IOException, SAXException, TikaException {
+ boolean inputToStdIn = true;
+ boolean outputFromStdOut = true;
+ boolean hasPatterns = (getMetadataExtractionPatterns() != null && !getMetadataExtractionPatterns().isEmpty());
+
+ File output = null;
+
+ // Build our getCommand()
+ String[] cmd;
+ if (getCommand().length == 1) {
+ cmd = getCommand()[0].split(" ");
+ } else {
+ cmd = new String[getCommand().length];
+ System.arraycopy(getCommand(), 0, cmd, 0, getCommand().length);
+ }
+ for (int i = 0; i < cmd.length; i++) {
+ if (cmd[i].indexOf(INPUT_FILE_TOKEN) != -1) {
+ cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN, stream.getFile().getPath());
+ inputToStdIn = false;
+ }
+ if (cmd[i].indexOf(OUTPUT_FILE_TOKEN) != -1) {
+ output = tmp.createTemporaryFile();
+ outputFromStdOut = false;
+ cmd[i] = cmd[i].replace(OUTPUT_FILE_TOKEN, output.getPath());
+ }
+ }
+
+ // Execute
+ Process process = null;
+ try {
+ if (cmd.length == 1) {
+ process = Runtime.getRuntime().exec(cmd[0]);
+ } else {
+ process = Runtime.getRuntime().exec(cmd);
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+
+ try {
+ if (inputToStdIn) {
+ sendInput(process, stream);
+ } else {
+ process.getOutputStream().close();
+ }
+
+ InputStream out = process.getInputStream();
+ InputStream err = process.getErrorStream();
+
+ if (hasPatterns) {
+
+ if (outputFromStdOut) {
+ extractOutput(out, xhtml);
+ } else {
+ extractMetadata(out, metadata);
+ }
+ } else {
+ ignoreStream(err);
+
+ if (outputFromStdOut) {
+ extractOutput(out, xhtml);
+ } else {
+ ignoreStream(out);
+ }
+ }
+ } finally {
+ try {
+ process.waitFor();
+ } catch (InterruptedException ignore) {
+ }
+ }
+
+ // Grab the output if we haven't already
+ if (!outputFromStdOut) {
+ extractOutput(new FileInputStream(output), xhtml);
+ }
+ }
+
+ /**
+ * Adapted from {@link org.apache.tika.parser.external.ExternalParser}
+ * Starts a thread that extracts the contents of the standard output
+ * stream of the given process to the given XHTML content handler.
+ * The standard output stream is closed once fully processed.
+ *
+ * @param stream stream
+ * @param xhtml XHTML content handler
+ * @throws SAXException if the XHTML SAX events could not be handled
+ * @throws IOException if an input error occurred
+ */
+ private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
+ try (Reader reader = new InputStreamReader(stream, UTF_8)) {
+ xhtml.startDocument();
+ xhtml.startElement("p");
+ char[] buffer = new char[1024];
+ for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+ xhtml.characters(buffer, 0, n);
+ }
+ xhtml.endElement("p");
+ xhtml.endDocument();
+ }
+ }
+
+ /**
+ * Adapted from {@link org.apache.tika.parser.external.ExternalParser}
+ * Starts a thread that sends the contents of the given input stream
+ * to the standard input stream of the given process. Potential
+ * exceptions are ignored, and the standard input stream is closed
+ * once fully processed. Note that the given input stream is not
+ * closed by this method.
+ *
+ * @param process process
+ * @param stream input stream
+ */
+ private void sendInput(final Process process, final InputStream stream) {
+ Thread t = new Thread() {
+ public void run() {
+ OutputStream stdin = process.getOutputStream();
+ try {
+ IOUtils.copy(stream, stdin);
+ } catch (IOException e) {
+ }
+ }
+ };
+ t.start();
+ try {
+ t.join();
+ } catch (InterruptedException ignore) {
+ }
+ }
+
+ /**
+ * Adapted from {@link org.apache.tika.parser.external.ExternalParser}
+ * Starts a thread that reads and discards the contents of the
+ * standard stream of the given process. Potential exceptions
+ * are ignored, and the stream is closed once fully processed.
+ *
+ * @param stream stream
+ */
+ private void ignoreStream(final InputStream stream) {
+ Thread t = new Thread() {
+ public void run() {
+ try {
+ IOUtils.copy(stream, NullOutputStream.NULL_OUTPUT_STREAM);
+ } catch (IOException e) {
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+ }
+ };
+ t.start();
+ try {
+ t.join();
+ } catch (InterruptedException ignore) {
+ }
+ }
+
+ private void extractMetadata(final InputStream stream, final Metadata metadata) {
+ Thread t = new Thread() {
+ public void run() {
+ BufferedReader reader;
+ reader = new BufferedReader(new InputStreamReader(stream, UTF_8));
+ try {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ for (Pattern p : getMetadataExtractionPatterns().keySet()) {
+ Matcher m = p.matcher(line);
+ if (m.find()) {
+ if (getMetadataExtractionPatterns().get(p) != null
+ && !getMetadataExtractionPatterns().get(p).equals("")) {
+ metadata.add(getMetadataExtractionPatterns().get(p), m.group(1));
+ } else {
+ metadata.add(m.group(1), m.group(2));
+ }
+ }
+ }
+ }
+ } catch (IOException e) {
+ // Ignore
+ } finally {
+ IOUtils.closeQuietly(reader);
+ IOUtils.closeQuietly(stream);
+ }
+ }
+ };
+ t.start();
+ try {
+ t.join();
+ } catch (InterruptedException ignore) {
+ }
+ }
+}
diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/DWGMetadataExtractor_metadata_extract.properties b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/DWGMetadataExtractor_metadata_extract.properties
index 6c28f692..abff3187 100644
--- a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/DWGMetadataExtractor_metadata_extract.properties
+++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/DWGMetadataExtractor_metadata_extract.properties
@@ -1,12 +1,12 @@
-#
-# DWGMetadataExtracter - default mapping
-#
-# author: Nick Burch
-
-# Namespaces
-namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
-
-# Mappings
-author=cm:author
-title=cm:title
-description=cm:description
+#
+# DWGMetadataExtracter - default mapping
+#
+# author: Nick Burch
+
+# Namespaces
+namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
+
+# Mappings
+author=cm:author
+title=cm:title
+description=cm:description
diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/MP3MetadataExtractor_metadata_extract.properties b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/MP3MetadataExtractor_metadata_extract.properties
index eba36d7d..57e8130e 100644
--- a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/MP3MetadataExtractor_metadata_extract.properties
+++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/MP3MetadataExtractor_metadata_extract.properties
@@ -1,30 +1,30 @@
-#
-# MP3MetadataExtracter - default mapping
-#
-# author: Derek Hulley
-
-# Namespaces
-namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
-namespace.prefix.audio=http://www.alfresco.org/model/audio/1.0
-
-# Core mappings
-author=cm:author
-title=cm:title
-description=cm:description
-created=cm:created
-
-# Audio descriptive mappings
-xmpDM\:album=audio:album
-xmpDM\:artist=audio:artist
-xmpDM\:composer=audio:composer
-xmpDM\:engineer=audio:engineer
-xmpDM\:genre=audio:genre
-xmpDM\:trackNumber=audio:trackNumber
-xmpDM\:releaseDate=audio:releaseDate
-#xmpDM:logComment
-
-# Audio specific mappings
-xmpDM\:audioSampleRate=audio:sampleRate
-xmpDM\:audioSampleType=audio:sampleType
-xmpDM\:audioChannelType=audio:channelType
-xmpDM\:audioCompressor=audio:compressor
+#
+# MP3MetadataExtracter - default mapping
+#
+# author: Derek Hulley
+
+# Namespaces
+namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
+namespace.prefix.audio=http://www.alfresco.org/model/audio/1.0
+
+# Core mappings
+author=cm:author
+title=cm:title
+description=cm:description
+created=cm:created
+
+# Audio descriptive mappings
+xmpDM\:album=audio:album
+xmpDM\:artist=audio:artist
+xmpDM\:composer=audio:composer
+xmpDM\:engineer=audio:engineer
+xmpDM\:genre=audio:genre
+xmpDM\:trackNumber=audio:trackNumber
+xmpDM\:releaseDate=audio:releaseDate
+#xmpDM:logComment
+
+# Audio specific mappings
+xmpDM\:audioSampleRate=audio:sampleRate
+xmpDM\:audioSampleType=audio:sampleType
+xmpDM\:audioChannelType=audio:channelType
+xmpDM\:audioCompressor=audio:compressor
diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/MailMetadataExtractor_metadata_extract.properties b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/MailMetadataExtractor_metadata_extract.properties
index 514fa1fc..7b298ee9 100644
--- a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/MailMetadataExtractor_metadata_extract.properties
+++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/MailMetadataExtractor_metadata_extract.properties
@@ -1,14 +1,14 @@
-#
-# MailMetadataExtracter - default mapping
-#
-# author: Derek Hulley
-
-# Namespaces
-namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
-
-# Mappings
-sentDate=cm:sentdate
-originator=cm:originator, cm:author
-addressee=cm:addressee
-addressees=cm:addressees
-subjectLine=cm:subjectline, cm:description
\ No newline at end of file
+#
+# MailMetadataExtracter - default mapping
+#
+# author: Derek Hulley
+
+# Namespaces
+namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
+
+# Mappings
+sentDate=cm:sentdate
+originator=cm:originator, cm:author
+addressee=cm:addressee
+addressees=cm:addressees
+subjectLine=cm:subjectline, cm:description
diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/OfficeMetadataExtractor_metadata_extract.properties b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/OfficeMetadataExtractor_metadata_extract.properties
index 912279af..6ec40f29 100644
--- a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/OfficeMetadataExtractor_metadata_extract.properties
+++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/OfficeMetadataExtractor_metadata_extract.properties
@@ -1,14 +1,14 @@
-#
-# OfficeMetadataExtracter - default mapping
-#
-# author: Derek Hulley
-
-# Namespaces
-namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
-
-# Mappings
-author=cm:author
-title=cm:title
-subject=cm:description
-createDateTime=cm:created
-lastSaveDateTime=cm:modified
+#
+# OfficeMetadataExtracter - default mapping
+#
+# author: Derek Hulley
+
+# Namespaces
+namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
+
+# Mappings
+author=cm:author
+title=cm:title
+subject=cm:description
+createDateTime=cm:created
+lastSaveDateTime=cm:modified
diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/OpenDocumentMetadataExtractor_metadata_extract.properties b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/OpenDocumentMetadataExtractor_metadata_extract.properties
index a74de9d2..9fd4b609 100644
--- a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/OpenDocumentMetadataExtractor_metadata_extract.properties
+++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/OpenDocumentMetadataExtractor_metadata_extract.properties
@@ -1,21 +1,21 @@
-#
-# OpenDocumentMetadataExtracter - default mapping
-#
-# author: Derek Hulley
-
-# Namespaces
-namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
-
-# Mappings
-creationDate=cm:created
-creator=cm:author
-date=
-description=
-generator=
-initialCreator=
-keyword=
-language=
-printDate=
-printedBy=
-subject=cm:description
-title=cm:title
+#
+# OpenDocumentMetadataExtracter - default mapping
+#
+# author: Derek Hulley
+
+# Namespaces
+namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
+
+# Mappings
+creationDate=cm:created
+creator=cm:author
+date=
+description=
+generator=
+initialCreator=
+keyword=
+language=
+printDate=
+printedBy=
+subject=cm:description
+title=cm:title
diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/PdfBoxMetadataExtractor_metadata_extract.properties b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/PdfBoxMetadataExtractor_metadata_extract.properties
index c5a92bd1..ddd63094 100644
--- a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/PdfBoxMetadataExtractor_metadata_extract.properties
+++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/PdfBoxMetadataExtractor_metadata_extract.properties
@@ -1,13 +1,13 @@
-#
-# PdfBoxMetadataExtracter - default mapping
-#
-# author: Derek Hulley
-
-# Namespaces
-namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
-
-# Mappings
-author=cm:author
-title=cm:title
-subject=cm:description
-created=cm:created
+#
+# PdfBoxMetadataExtracter - default mapping
+#
+# author: Derek Hulley
+
+# Namespaces
+namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
+
+# Mappings
+author=cm:author
+title=cm:title
+subject=cm:description
+created=cm:created
diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/PoiMetadataExtractor_metadata_extract.properties b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/PoiMetadataExtractor_metadata_extract.properties
index 0211e61c..0dd09627 100644
--- a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/PoiMetadataExtractor_metadata_extract.properties
+++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/PoiMetadataExtractor_metadata_extract.properties
@@ -1,13 +1,13 @@
-#
-# PoiMetadataExtracter - default mapping
-#
-# author: Neil McErlean
-
-# Namespaces
-namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
-
-# Mappings
-author=cm:author
-title=cm:title
-description=cm:description
-created=cm:created
+#
+# PoiMetadataExtracter - default mapping
+#
+# author: Neil McErlean
+
+# Namespaces
+namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
+
+# Mappings
+author=cm:author
+title=cm:title
+description=cm:description
+created=cm:created
diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/TikaAudioMetadataExtractor_metadata_extract.properties b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/TikaAudioMetadataExtractor_metadata_extract.properties
index 542a71ce..05df2ed3 100644
--- a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/TikaAudioMetadataExtractor_metadata_extract.properties
+++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/TikaAudioMetadataExtractor_metadata_extract.properties
@@ -1,34 +1,34 @@
-#
-# TikaAudioMetadataExtracter - audio mapping
-#
-# This is used to map from the Tika audio metadata onto your
-# content model. This will be used for any Audio content
-# for which an explicit extractor isn't defined
-#
-# author: Nick Burch
-
-# Namespaces
-namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
-namespace.prefix.audio=http://www.alfresco.org/model/audio/1.0
-
-# Core mappings
-author=cm:author
-title=cm:title
-description=cm:description
-created=cm:created
-
-# Audio descriptive mappings
-xmpDM\:album=audio:album
-xmpDM\:artist=audio:artist
-xmpDM\:composer=audio:composer
-xmpDM\:engineer=audio:engineer
-xmpDM\:genre=audio:genre
-xmpDM\:trackNumber=audio:trackNumber
-xmpDM\:releaseDate=audio:releaseDate
-#xmpDM:logComment
-
-# Audio specific mappings
-xmpDM\:audioSampleRate=audio:sampleRate
-xmpDM\:audioSampleType=audio:sampleType
-xmpDM\:audioChannelType=audio:channelType
-xmpDM\:audioCompressor=audio:compressor
+#
+# TikaAudioMetadataExtracter - audio mapping
+#
+# This is used to map from the Tika audio metadata onto your
+# content model. This will be used for any Audio content
+# for which an explicit extractor isn't defined
+#
+# author: Nick Burch
+
+# Namespaces
+namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
+namespace.prefix.audio=http://www.alfresco.org/model/audio/1.0
+
+# Core mappings
+author=cm:author
+title=cm:title
+description=cm:description
+created=cm:created
+
+# Audio descriptive mappings
+xmpDM\:album=audio:album
+xmpDM\:artist=audio:artist
+xmpDM\:composer=audio:composer
+xmpDM\:engineer=audio:engineer
+xmpDM\:genre=audio:genre
+xmpDM\:trackNumber=audio:trackNumber
+xmpDM\:releaseDate=audio:releaseDate
+#xmpDM:logComment
+
+# Audio specific mappings
+xmpDM\:audioSampleRate=audio:sampleRate
+xmpDM\:audioSampleType=audio:sampleType
+xmpDM\:audioChannelType=audio:channelType
+xmpDM\:audioCompressor=audio:compressor
diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/TikaAutoMetadataExtractor_metadata_extract.properties b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/TikaAutoMetadataExtractor_metadata_extract.properties
index 6982bb96..7380769e 100644
--- a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/TikaAutoMetadataExtractor_metadata_extract.properties
+++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/TikaAutoMetadataExtractor_metadata_extract.properties
@@ -1,52 +1,52 @@
-#
-# TikaAutoMetadataExtracter - default mapping
-#
-# This is used to map from the Tika and standard namespaces
-# onto your content model. This will be used for any
-# content for which an explicit extractor isn't defined,
-# by using Tika's auto-selection facilities.
-#
-# author: Nick Burch
-
-# Namespaces
-namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
-namespace.prefix.exif=http://www.alfresco.org/model/exif/1.0
-namespace.prefix.audio=http://www.alfresco.org/model/audio/1.0
-
-# Mappings
-author=cm:author
-title=cm:title
-description=cm:description
-created=cm:created
-
-geo\:lat=cm:latitude
-geo\:long=cm:longitude
-
-tiff\:ImageWidth=exif:pixelXDimension
-tiff\:ImageLength=exif:pixelYDimension
-tiff\:Make=exif:manufacturer
-tiff\:Model=exif:model
-tiff\:Software=exif:software
-tiff\:Orientation=exif:orientation
-tiff\:XResolution=exif:xResolution
-tiff\:YResolution=exif:yResolution
-tiff\:ResolutionUnit=exif:resolutionUnit
-exif\:Flash=exif:flash
-exif\:ExposureTime=exif:exposureTime
-exif\:FNumber=exif:fNumber
-exif\:FocalLength=exif:focalLength
-exif\:IsoSpeedRatings=exif:isoSpeedRatings
-exif\:DateTimeOriginal=exif:dateTimeOriginal
-
-xmpDM\:album=audio:album
-xmpDM\:artist=audio:artist
-xmpDM\:composer=audio:composer
-xmpDM\:engineer=audio:engineer
-xmpDM\:genre=audio:genre
-xmpDM\:trackNumber=audio:trackNumber
-xmpDM\:releaseDate=audio:releaseDate
-#xmpDM:logComment
-xmpDM\:audioSampleRate=audio:sampleRate
-xmpDM\:audioSampleType=audio:sampleType
-xmpDM\:audioChannelType=audio:channelType
-xmpDM\:audioCompressor=audio:compressor
+#
+# TikaAutoMetadataExtracter - default mapping
+#
+# This is used to map from the Tika and standard namespaces
+# onto your content model. This will be used for any
+# content for which an explicit extractor isn't defined,
+# by using Tika's auto-selection facilities.
+#
+# author: Nick Burch
+
+# Namespaces
+namespace.prefix.cm=http://www.alfresco.org/model/content/1.0
+namespace.prefix.exif=http://www.alfresco.org/model/exif/1.0
+namespace.prefix.audio=http://www.alfresco.org/model/audio/1.0
+
+# Mappings
+author=cm:author
+title=cm:title
+description=cm:description
+created=cm:created
+
+geo\:lat=cm:latitude
+geo\:long=cm:longitude
+
+tiff\:ImageWidth=exif:pixelXDimension
+tiff\:ImageLength=exif:pixelYDimension
+tiff\:Make=exif:manufacturer
+tiff\:Model=exif:model
+tiff\:Software=exif:software
+tiff\:Orientation=exif:orientation
+tiff\:XResolution=exif:xResolution
+tiff\:YResolution=exif:yResolution
+tiff\:ResolutionUnit=exif:resolutionUnit
+exif\:Flash=exif:flash
+exif\:ExposureTime=exif:exposureTime
+exif\:FNumber=exif:fNumber
+exif\:FocalLength=exif:focalLength
+exif\:IsoSpeedRatings=exif:isoSpeedRatings
+exif\:DateTimeOriginal=exif:dateTimeOriginal
+
+xmpDM\:album=audio:album
+xmpDM\:artist=audio:artist
+xmpDM\:composer=audio:composer
+xmpDM\:engineer=audio:engineer
+xmpDM\:genre=audio:genre
+xmpDM\:trackNumber=audio:trackNumber
+xmpDM\:releaseDate=audio:releaseDate
+#xmpDM:logComment
+xmpDM\:audioSampleRate=audio:sampleRate
+xmpDM\:audioSampleType=audio:sampleType
+xmpDM\:audioChannelType=audio:channelType
+xmpDM\:audioCompressor=audio:compressor
diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/parsers/external/config/exiftool-parser.xml b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/parsers/external/config/exiftool-parser.xml
index 076dfe54..97d1acaf 100644
--- a/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/parsers/external/config/exiftool-parser.xml
+++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/resources/parsers/external/config/exiftool-parser.xml
@@ -1,35 +1,35 @@
-
-TextToPDF
utility.
- *
- * @author Derek Hulley
- * @author eknizat
- */
-public class TextToPdfContentTransformer implements SelectableTransformer
-{
- private static final Logger logger = LoggerFactory.getLogger(TextToPdfContentTransformer.class);
-
- private static final int UTF16_READ_AHEAD_BYTES = 16; // 8 characters including BOM if it exists
- private static final byte FE = (byte) 0xFE;
- private static final byte FF = (byte) 0xFF;
-
- public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT;
-
- private final PagedTextToPDF transformer;
-
- public TextToPdfContentTransformer()
- {
- transformer = new PagedTextToPDF();
- }
-
- public void setStandardFont(String fontName)
- {
- try
- {
- transformer.setFont(PagedTextToPDF.getStandardFont(fontName));
- }
- catch (Throwable e)
- {
- throw new RuntimeException(
- "Unable to set Standard Font for PDF generation: " + fontName, e);
- }
- }
-
- public void setFontSize(int fontSize)
- {
- try
- {
- transformer.setFontSize(fontSize);
- }
- catch (Throwable e)
- {
- throw new RuntimeException(
- "Unable to set Font Size for PDF generation: " + fontSize);
- }
- }
-
- @Override
- public void transform(final String sourceMimetype, final String targetMimetype, final MapTextToPDF
utility.
+ *
+ * @author Derek Hulley
+ * @author eknizat
+ */
+public class TextToPdfContentTransformer implements SelectableTransformer
+{
+ private static final Logger logger = LoggerFactory.getLogger(TextToPdfContentTransformer.class);
+
+ private static final int UTF16_READ_AHEAD_BYTES = 16; // 8 characters including BOM if it exists
+ private static final byte FE = (byte) 0xFE;
+ private static final byte FF = (byte) 0xFF;
+
+ public static final String PAGE_LIMIT = RequestParamMap.PAGE_LIMIT;
+
+ private final PagedTextToPDF transformer;
+
+ public TextToPdfContentTransformer()
+ {
+ transformer = new PagedTextToPDF();
+ }
+
+ public void setStandardFont(String fontName)
+ {
+ try
+ {
+ transformer.setFont(PagedTextToPDF.getStandardFont(fontName));
+ }
+ catch (Throwable e)
+ {
+ throw new RuntimeException(
+ "Unable to set Standard Font for PDF generation: " + fontName, e);
+ }
+ }
+
+ public void setFontSize(int fontSize)
+ {
+ try
+ {
+ transformer.setFontSize(fontSize);
+ }
+ catch (Throwable e)
+ {
+ throw new RuntimeException(
+ "Unable to set Font Size for PDF generation: " + fontSize);
+ }
+ }
+
+ @Override
+ public void transform(final String sourceMimetype, final String targetMimetype, final Map
- *
- * @param dateStr
- * @return dateStr in Iso8601 format
- */
- protected String iptcToIso8601DateString(String dateStr)
- {
- char timeSeparator = 'T';
- Matcher yearMatcher = YEAR_IPTC.matcher(dateStr);
- if (yearMatcher.find())
- {
- String year = yearMatcher.group(1);
- dateStr = yearMatcher.replaceFirst(year.replaceAll(":", "-"));
- if (dateStr.length()>year.length() && dateStr.charAt(year.length())!=timeSeparator)
- {
- dateStr = dateStr.replace(dateStr.charAt(year.length()), timeSeparator);
- }
- }
- return dateStr;
- }
-
-}
+/*
+ * #%L
+ * Alfresco Transform Core
+ * %%
+ * Copyright (C) 2005 - 2021 Alfresco Software Limited
+ * %%
+ * This file is part of the Alfresco software.
+ * -
+ * If the software was purchased under a paid Alfresco license, the terms of
+ * the paid license agreement will prevail. Otherwise, the software is
+ * provided under the following open source license terms:
+ * -
+ * Alfresco is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * -
+ * Alfresco is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ * -
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with Alfresco. If not, see
+ *
+ * @param dateStr
+ * @return dateStr in Iso8601 format
+ */
+ protected String iptcToIso8601DateString(String dateStr)
+ {
+ char timeSeparator = 'T';
+ Matcher yearMatcher = YEAR_IPTC.matcher(dateStr);
+ if (yearMatcher.find())
+ {
+ String year = yearMatcher.group(1);
+ dateStr = yearMatcher.replaceFirst(year.replaceAll(":", "-"));
+ if (dateStr.length()>year.length() && dateStr.charAt(year.length())!=timeSeparator)
+ {
+ dateStr = dateStr.replace(dateStr.charAt(year.length()), timeSeparator);
+ }
+ }
+ return dateStr;
+ }
+
+}
diff --git a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/tika/parsers/ExifToolParser.java b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/tika/parsers/ExifToolParser.java
index 9e15731e..add3cc31 100644
--- a/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/tika/parsers/ExifToolParser.java
+++ b/alfresco-transform-tika/alfresco-transform-tika/src/main/java/org/alfresco/transformer/tika/parsers/ExifToolParser.java
@@ -1,372 +1,372 @@
-/*
- * #%L
- * Alfresco Transform Core
- * %%
- * Copyright (C) 2005 - 2021 Alfresco Software Limited
- * %%
- * This file is part of the Alfresco software.
- * -
- * If the software was purchased under a paid Alfresco license, the terms of
- * the paid license agreement will prevail. Otherwise, the software is
- * provided under the following open source license terms:
- * -
- * Alfresco is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- * -
- * Alfresco is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Lesser General Public License for more details.
- * -
- * You should have received a copy of the GNU Lesser General Public License
- * along with Alfresco. If not, see